/** @brief (Re)build the parsimony-scoring structures for the current alignment.
 *
 *  Frees any parsimony vectors left over from a previous initialization,
 *  recomputes the set of parsimony-informative sites, recompresses the DNA
 *  data, and resets the xPars orientation flags of all inner nodes so that
 *  exactly one of the three node slots holds the parsimony vector.
 *
 *  @param tr             PLL instance
 *  @param pr             List of partitions
 *  @param perSiteScores  Whether per-site parsimony scores are required
 */
void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, boolean perSiteScores)
{
  int model, inner;
  int *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength);

  /* drop the compressed vectors of any earlier initialization */
  for (model = 0; model < pr->numberOfPartitions; model++)
    rax_free(pr->partitionData[model]->parsVect);
  rax_free(tr->parsimonyScore);

  determineUninformativeSites(tr, pr, informative);
  compressDNA(tr, pr, informative, perSiteScores);

  /* orient every inner node towards its first slot */
  for (inner = tr->mxtips + 1; inner <= tr->mxtips + tr->mxtips - 1; inner++)
    {
      nodeptr p = tr->nodep[inner];

      p->xPars             = 1;
      p->next->xPars       = 0;
      p->next->next->xPars = 0;
    }

  /* traversal descriptor used by the parsimony kernels */
  tr->ti = (int *)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);

  rax_free(informative);
}
/** @brief Initialize a list of best trees

    Initialize a list that will contain the best \a newkeep tree topologies,
    i.e. the ones that yield the best likelihood. Inside the list initialize
    space for \a newkeep + 1 topologies of \a numsp tips. The additional
    topology is the starting one.

    @param bt       Pointer to \a bestlist to be initialized
    @param newkeep  Number of new topologies to keep (negative value clears the list)
    @param numsp    Number of species (tips)

    @return number of tree topology slots in the list (minus the starting one)

    @todo Is there a reason that this function is so complex? Many of the
          checks are unnecessary as the function is called only at two places
          in the code with newkeep=1 and newkeep=20
*/
int initBestTree (bestlist *bt, int newkeep, int numsp)
{ /* initBestTree */
  int  i;

  bt->nkeep = 0;

  /* first-time setup: ninit <= 0 marks a list that has never been allocated */
  if (bt->ninit <= 0)
    {
      if (! (bt->start = setupTopol(numsp)))    return  0;
      bt->ninit = -1;
      bt->nvalid = 0;
      bt->numtrees = 0;
      bt->best = PLL_UNLIKELY;
      bt->improved = PLL_FALSE;
      /* slot 0 is reserved; usable entries are 1..newkeep */
      bt->byScore = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
      bt->byTopol = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
      if (! bt->byScore || ! bt->byTopol)
        {
          printf( "initBestTree: malloc failure\n");
          return 0;
        }
    }
  else if (PLL_ABS(newkeep) > bt->ninit)
    {
      /* never grow past the number of slots allocated on the first call */
      if (newkeep < 0)
        newkeep = -(bt->ninit);
      else
        newkeep = bt->ninit;
    }

  if (newkeep < 1)
    { /* Use negative newkeep to clear list */
      newkeep = -newkeep;
      if (newkeep < 1)
        newkeep = 1;
      bt->nvalid = 0;
      bt->best = PLL_UNLIKELY;
    }

  if (bt->nvalid >= newkeep)
    {
      bt->nvalid = newkeep;
      /* NOTE(review): reads byScore[newkeep]; presumably that slot was filled
         by an earlier save — verify against saveBestTree before relying on it */
      bt->worst = bt->byScore[newkeep]->likelihood;
    }
  else
    {
      bt->worst = PLL_UNLIKELY;
    }

  /* allocate topologies for slots not set up by earlier calls */
  for (i = bt->ninit + 1; i <= newkeep; i++)
    {
      if (! (bt->byScore[i] = setupTopol(numsp)))
        break;
      bt->byTopol[i] = bt->byScore[i];
      bt->ninit = i;
    }

  return  (bt->nkeep = PLL_MIN(newkeep, bt->ninit));
} /* initBestTree */
/** @brief Allocates memory for the recomputation (vector recycling) structure.
 *
 *  Creates the bookkeeping arrays that track which ancestral-vector slots are
 *  in use and which inner node (if any) is pinned to each slot. The number of
 *  slots is derived from \a tr->vectorRecomFraction.
 *
 *  @param tr  PLL instance (result stored in \a tr->rvec)
 *
 *  @todo this should not depend on tr (\a vectorRecomFraction should be a parameter)
 */
void allocRecompVectorsInfo(pllInstance *tr)
{
  recompVectors *rv = (recompVectors *)rax_malloc(sizeof(recompVectors));
  int innerNodes = tr->mxtips - 2;
  int slots, j;

  assert(tr->vectorRecomFraction > PLL_MIN_RECOM_FRACTION);
  assert(tr->vectorRecomFraction < PLL_MAX_RECOM_FRACTION);

  slots = (int)(1 + tr->vectorRecomFraction * (float)innerNodes);

  /* a traversal needs at least ~log2(mxtips) simultaneously pinned vectors */
  {
    int minSlots = 3 + ((int)(log((double)tr->mxtips) / log(2.0)));

    assert(slots >= minSlots);
    assert(slots < tr->mxtips);
  }

  rv->numVectors = slots;

  /* per-slot tracking: which node owns the slot, and whether it may be evicted */
  rv->iVector    = (int *)rax_malloc((size_t)slots * sizeof(int));
  rv->unpinnable = (boolean *)rax_malloc((size_t)slots * sizeof(boolean));

  for (j = 0; j < slots; j++)
    {
      rv->iVector[j]    = PLL_SLOT_UNUSED;
      rv->unpinnable[j] = PLL_FALSE;
    }

  /* per-inner-node tracking: assigned slot and subtree size */
  rv->iNode = (int *)rax_malloc((size_t)innerNodes * sizeof(int));
  rv->stlen = (int *)rax_malloc((size_t)innerNodes * sizeof(int));

  for (j = 0; j < innerNodes; j++)
    {
      rv->iNode[j] = PLL_NODE_UNPINNED;
      rv->stlen[j] = PLL_INNER_NODE_INIT_STLEN;
    }

  rv->allSlotsBusy   = PLL_FALSE;
  rv->maxVectorsUsed = 0;

  tr->rvec = rv;
}
/** @brief Initializes a list of RELL topologies.
 *
 *  Allocates \a n topology records, each with enough connection entries for a
 *  tree of \a tr->mxtips tips (2 * mxtips - 3 branches), and marks each entry
 *  as empty via a PLL_UNLIKELY likelihood.
 *
 *  @param rl  List to initialize
 *  @param tr  PLL instance (provides the tree size)
 *  @param n   Number of topology slots
 */
void initTL(topolRELL_LIST *rl, pllInstance *tr, int n)
{
  int slot;

  rl->max = n;
  rl->t   = (topolRELL **)rax_malloc(sizeof(topolRELL *) * n);

  for (slot = 0; slot < n; slot++)
    {
      topolRELL *entry = (topolRELL *)rax_malloc(sizeof(topolRELL));

      entry->connect    = (connectRELL *)rax_malloc((2 * tr->mxtips - 3) * sizeof(connectRELL));
      entry->likelihood = PLL_UNLIKELY;   /* marks the slot as not yet holding a tree */

      rl->t[slot] = entry;
    }
}
/** @brief Repeatedly optimizes all branch lengths of the tree, printing progress.
 *
 *  Builds the branch list once, then runs ten passes; each pass updates every
 *  branch and re-evaluates the full likelihood, printing it per iteration.
 *
 *  @param tr  tree instance
 */
void treeEvaluateProgressive(tree *tr)
{
  int pass, b;

  tr->branchCounter    = 0;
  tr->numberOfBranches = 2 * tr->mxtips - 3;

  tr->bInf = (branchInfo *)rax_malloc(tr->numberOfBranches * sizeof(branchInfo));

  setupBranches(tr, tr->start->back, tr->bInf);
  assert(tr->branchCounter == tr->numberOfBranches);

  for (b = 0; b < tr->numBranches; b++)
    tr->partitionConverged[b] = FALSE;

  for (pass = 0; pass < 10; pass++)
    {
      for (b = 0; b < tr->numberOfBranches; b++)
        {
          update(tr, tr->bInf[b].oP);
          newviewGeneric(tr, tr->bInf[b].oP);
        }

      evaluateGenericInitrav(tr, tr->start);
      printf("It %d %f \n", pass, tr->likelihood);
    }
}
/** @brief One thorough round of SPR rearrangements over all nodes.
 *
 *  For every node, tries subtree rearrangements within [mintrav, maxtrav];
 *  improvements are accepted and locally smoothed, non-improving best moves
 *  are tried once and rolled back via a one-slot best-tree list if they hurt
 *  the likelihood.
 *
 *  @param tr       tree instance
 *  @param mintrav  minimum rearrangement radius
 *  @param maxtrav  maximum rearrangement radius (capped at ntips - 3)
 *
 *  @return always 1
 */
int treeOptimizeThorough(tree *tr, int mintrav, int maxtrav)
{
  int i;
  bestlist *bestT;

  nodeRectifier(tr);

  /* one-slot list used to undo a rearrangement that lowered the likelihood */
  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
  bestT->ninit = 0;
  initBestTree(bestT, 1, tr->mxtips);

  if (maxtrav > tr->ntips - 3)
    maxtrav = tr->ntips - 3;

  tr->startLH = tr->endLH = tr->likelihood;

  /* iterate over all tips and inner nodes */
  for(i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
    {
      tr->bestOfNode = unlikely;

      if(rearrangeBIG(tr, tr->nodep[i], mintrav, maxtrav))
        {
          if((tr->endLH > tr->startLH) && (tr->bestOfNode != unlikely))
            {
              /* clear improvement: keep the move and smooth locally */
              restoreTreeFast(tr);
              quickSmoothLocal(tr, 3);
              tr->startLH = tr->endLH = tr->likelihood;
            }
          else
            {
              if(tr->bestOfNode != unlikely)
                {
                  /* tentative move: save current tree, apply the move, and
                     recall the saved tree if smoothing did not pay off */
                  resetBestTree(bestT);
                  saveBestTree(bestT, tr);
                  restoreTreeFast(tr);
                  quickSmoothLocal(tr, 3);
                  if(tr->likelihood < tr->startLH)
                    {
                      int res;
                      res = recallBestTree(bestT, 1, tr);
                      assert(res > 0);
                    }
                  else
                    tr->startLH = tr->endLH = tr->likelihood;
                }
            }
        }
    }

  freeBestTree(bestT);
  rax_free(bestT);

  return 1;
}
/** @brief calloc() replacement built on top of rax_malloc().
 *
 *  Allocates zero-initialized storage for \a n objects of \a size bytes each.
 *
 *  @param n     Number of objects
 *  @param size  Size of one object in bytes
 *
 *  @return Pointer to the zeroed block, or NULL if the size computation would
 *          overflow or the underlying allocation fails.
 */
void *rax_calloc(size_t n, size_t size)
{
  void *ptr;

  /* guard n * size against overflow: a wrapped product would silently
     allocate less memory than requested */
  if (size != 0 && n > ((size_t)-1) / size)
    return (void *)NULL;

  ptr = rax_malloc(size * n);

  /* fix: the original memset() an unchecked pointer — UB if allocation failed */
  if (ptr == (void *)NULL)
    return (void *)NULL;

  memset(ptr, 0, size * n);
  return ptr;
}
/** @brief Allocates a node pool for a (possibly multifurcating) copy of a tree.
 *
 *  Sets up \a smallTree with the same dimensions and name tables as \a tr,
 *  copies the tip node records, and allocates fresh unlinked inner-node
 *  records (three per inner node, to allow multifurcations).
 *
 *  @param tr         source tree
 *  @param smallTree  tree to receive the node pool
 */
void allocateMultifurcations(tree *tr, tree *smallTree)
{
  int k;
  const int tipCount   = tr->mxtips;
  const int innerCount = tr->mxtips - 1;

  smallTree->numBranches = tr->numBranches;
  smallTree->mxtips      = tr->mxtips;

  /* share the name lookup structures with the big tree */
  smallTree->nameHash = tr->nameHash;
  smallTree->nameList = tr->nameList;

  smallTree->nodep    = (nodeptr *)rax_malloc((tipCount + 3 * innerCount) * sizeof(nodeptr));
  smallTree->maxNodes = tipCount + 3 * innerCount;
  smallTree->nodep[0] = (node *)NULL;   /* slot 0 is unused by convention */

  /* tips: copy the big tree's records, then detach them */
  for (k = 1; k <= tipCount; k++)
    {
      smallTree->nodep[k] = (nodeptr)rax_malloc(sizeof(node));
      memcpy(smallTree->nodep[k], tr->nodep[k], sizeof(node));
      smallTree->nodep[k]->back = (node *)NULL;
      smallTree->nodep[k]->next = (node *)NULL;
    }

  /* inner nodes: fresh, unlinked records */
  for (k = tipCount + 1; k < tipCount + 3 * innerCount; k++)
    {
      smallTree->nodep[k] = (nodeptr)rax_malloc(sizeof(node));
      smallTree->nodep[k]->number = k;
      smallTree->nodep[k]->back   = (node *)NULL;
      smallTree->nodep[k]->next   = (node *)NULL;
    }
}
/** @brief Allocate and zero the traversal-statistics counter of a PLL instance.
 *
 *  Creates a traversalCounter with a zeroed traversal-length histogram of
 *  \a tr->mxtips entries and all scalar counters reset, and stores it in
 *  \a tr->travCounter.
 *
 *  @param tr  PLL instance
 */
void allocTraversalCounter(pllInstance *tr)
{
  traversalCounter *tc;
  int k;

  tc = (traversalCounter *)rax_malloc(sizeof(traversalCounter));

  /* fix: the original sized this unsigned int array with sizeof(int);
     use the element's own type so the two can never diverge */
  tc->travlenFreq = (unsigned int *)rax_malloc(tr->mxtips * sizeof(unsigned int));

  for (k = 0; k < tr->mxtips; k++)
    tc->travlenFreq[k] = 0;

  tc->tt = 0;
  tc->ti = 0;
  tc->ii = 0;
  tc->numTraversals = 0;

  tr->travCounter = tc;
}
/** @brief calloc() replacement built on top of rax_malloc().
 *
 *  Allocates zero-initialized storage for \a n objects of \a size bytes each.
 *
 *  @param n     Number of objects
 *  @param size  Size of one object in bytes
 *
 *  @return Pointer to the zeroed block, or NULL if the size computation would
 *          overflow or the underlying allocation fails.
 */
void *rax_calloc(size_t n, size_t size)
{
  void *ptr;

  /* fix: guard n * size against overflow — a wrapped product would silently
     allocate less memory than the caller asked for */
  if (size != 0 && n > ((size_t)-1) / size)
    return (void *)NULL;

  ptr = rax_malloc(size * n);
  if (ptr == (void *)NULL)
    return (void *)NULL;

  memset(ptr, 0, size * n);
  return ptr;
}
/** @brief Per-site log likelihood for one site under GTR CAT, 16-state
 *         secondary-structure model.
 *
 *  Recomputes the conditional vectors along the stored traversal for site
 *  \a i, then combines the tip vector with the root-adjacent vector over a
 *  branch of length \a qz scaled by rate \a ki, and returns the weighted
 *  per-site log likelihood.
 *
 *  @param i                site index
 *  @param ki               per-site rate (CAT)
 *  @param counter          number of traversal steps
 *  @param ti               traversal descriptor
 *  @param qz               branch length at the virtual root
 *  @param w                site weight
 *  @param EIGN,EI,EV       eigendecomposition of the rate matrix
 *  @param tipVector        precomputed tip likelihood vectors
 *  @param yVector          tip sequences
 *  @param branchReference  which branch-length set to use
 *  @param mxtips           number of tips
 *
 *  @return weighted per-site log likelihood
 */
static double evaluatePartialGTRCATSECONDARY(int i, double ki, int counter, traversalInfo *ti, double qz,
                                             int w, double *EIGN, double *EI, double *EV, double *tipVector,
                                             unsigned char **yVector, int branchReference, int mxtips)
{
  double lz, term;
  double d[16];
  double *x1, *x2;
  int scale = 0, k, l;
  double *lVector = (double *)rax_malloc(sizeof(double) * 16 * mxtips);
  traversalInfo *trav = &ti[0];

  assert(isTip(trav->pNumber, mxtips));

  x1 = &(tipVector[16 * yVector[trav->pNumber][i]]);

  /* rebuild the conditional likelihood vectors bottom-up for this site */
  for(k = 1; k < counter; k++)
    computeVectorGTRCATSECONDARY(lVector, &scale, ki, i, ti[k].qz[branchReference], ti[k].rz[branchReference],
                                 &ti[k], EIGN, EI, EV, tipVector, yVector, mxtips);

  x2 = &lVector[16 * (trav->qNumber - mxtips)];
  assert(0 <= (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);

  /* BUGFIX: the original did "if(qz < zmin) lz = zmin;" and then immediately
     overwrote lz with log(qz), so the branch-length floor had no effect;
     clamp qz itself before taking the logarithm */
  if(qz < zmin)
    qz = zmin;
  lz  = log(qz);
  lz *= ki;

  d[0] = 1.0;
  for(l = 1; l < 16; l++)
    d[l] = EXP (EIGN[l-1] * lz);

  term = 0.0;
  for(l = 0; l < 16; l++)
    term += x1[l] * x2[l] * d[l];

  /* undo the scaling applied during the traversal, then apply the site weight */
  term = LOG(FABS(term)) + (scale * LOG(minlikelihood));
  term = term * w;

  rax_free(lVector);

  return term;
}
/** @brief Allocate and initialize space for a tree topology Allocate and initialize a \a topol structure for a tree topology of \a maxtips tips @param Number of tips of topology @return Pointer to the allocated \a topol structure */ static topol *setupTopol (int maxtips) { topol *tpl; if (! (tpl = (topol *) rax_malloc(sizeof(topol))) || ! (tpl->links = (connptr) rax_malloc((2*maxtips-3) * sizeof(connect)))) { printf("ERROR: Unable to get topology memory"); tpl = (topol *) NULL; } else { tpl->likelihood = PLL_UNLIKELY; tpl->start = (node *) NULL; tpl->nextlink = 0; tpl->ntips = 0; tpl->nextnode = 0; tpl->scrNum = 0; /* position in sorted list of scores */ tpl->tplNum = 0; /* position in sorted list of trees */ } return tpl; }
/** @brief Initializes the global list of candidate insertion points.
 *
 *  Allocates \a n entries in the global \a iList and marks each one empty
 *  (NULL node, sentinel likelihood).
 *
 *  @param n  capacity of the list
 */
void initInfoList(int n)
{
  int k;

  iList.n     = n;
  iList.valid = 0;
  iList.list  = (bestInfo *)rax_malloc(sizeof(bestInfo) * n);

  for (k = 0; k < n; k++)
    {
      iList.list[k].node       = (nodeptr)NULL;
      iList.list[k].likelihood = unlikely;
    }
}
/** @ingroup alignmentGroup
    @brief Parse a PHYLIP file

    Parses the PHYLIP file \a filename and returns a ::pllAlignmentData structure
    with the alignment.

    @param filename
      Name of file to be parsed

    @return Returns a structure of type ::pllAlignmentData that contains the
      alignment, or \b NULL in case of failure.
*/
static pllAlignmentData *
pllParsePHYLIP (const char * filename)
{
  int
    i, input, sequenceCount, sequenceLength;
  char * rawdata;
  long filesize;
  pllAlignmentData * alignmentData;

  /* slurp the whole file into memory; filesize is set by pllReadFile */
  rawdata = pllReadFile (filename, &filesize);
  if (!rawdata)
   {
     errno = PLL_ERROR_FILE_OPEN;
     return (NULL);
   }

  init_lexan (rawdata, filesize);
  input = get_next_symbol();

  /* parse the header to obtain the number of taxa and sequence length */
  if (!read_phylip_header (&input, &sequenceCount, &sequenceLength))
   {
     rax_free (rawdata);
     fprintf (stderr, "Error while parsing PHYLIP header (number of taxa and sequence length)\n");
     errno = PLL_ERROR_PHYLIP_HEADER_SYNTAX;
     return (NULL);
   }

  /* switch the lexer to PHYLIP-specific character classes */
  lex_table_amend_phylip();

  /* allocate alignment structure */
  alignmentData = pllInitAlignmentData (sequenceCount, sequenceLength);

  if (! parse_phylip (alignmentData, input))
   {
     errno = PLL_ERROR_PHYLIP_BODY_SYNTAX;
     pllAlignmentDataDestroy (alignmentData);
     lex_table_restore();
     rax_free (rawdata);
     return (NULL);
   }

  /* restore the lexer tables for subsequent parsers */
  lex_table_restore();
  rax_free (rawdata);

  /* every site initially carries unit weight */
  alignmentData->siteWeights = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
  for (i = 0; i < alignmentData->sequenceLength; ++ i)
    alignmentData->siteWeights[i] = 1;

  return (alignmentData);
}
/** @brief Insert a taxon name into a string hash table.
 *
 *  If \a s is already present, the existing entry (and its node number) is
 *  kept and nothing happens. Otherwise a new entry owning a copy of \a s is
 *  prepended to the collision chain of its bucket.
 *
 *  @param s           NUL-terminated name to insert (copied)
 *  @param h           hash table
 *  @param nodeNumber  node number to associate with the name
 */
void addword(char *s, stringHashtable *h, int nodeNumber)
{
  hashNumberType position = hashString(s, h->tableSize);
  stringEntry *p = h->table[position];

  /* already present: keep the first node number seen for this name */
  for(; p!= NULL; p = p->next)
    {
      if(strcmp(s, p->word) == 0)
        return;
    }

  p = (stringEntry *)rax_malloc(sizeof(stringEntry));
  assert(p);

  p->nodeNumber = nodeNumber;
  p->word = (char *)rax_malloc((strlen(s) + 1) * sizeof(char));
  /* fix: this allocation was unchecked while the entry itself was asserted */
  assert(p->word);

  strcpy(p->word, s);

  /* prepend to the bucket's collision chain */
  p->next = h->table[position];
  h->table[position] = p;
}
/** @brief Allocates and zero-initializes a single bipartition hash entry.
 *
 *  All pointers are set to NULL and all counters to 0.
 *
 *  @return the freshly initialized entry
 */
static pllBipartitionEntry *initEntry(void)
{
  pllBipartitionEntry *entry = (pllBipartitionEntry *)rax_malloc(sizeof(pllBipartitionEntry));

  entry->bitVector     = (unsigned int*)NULL;
  entry->treeVector    = (unsigned int*)NULL;
  entry->supportVector = (int*)NULL;

  entry->bipNumber  = 0;
  entry->bipNumber2 = 0;

  entry->supportFromTreeset[0] = 0;
  entry->supportFromTreeset[1] = 0;

  entry->next = (pllBipartitionEntry *)NULL;

  return entry;
}
/** @brief Renumber the inner nodes of a tree into traversal order.
 *
 *  Saves the current inner-node records into a scratch array, then lets
 *  reorderNodes() reassign them while walking the tree from the start node.
 *
 *  @param tr  PLL instance
 */
void nodeRectifier(pllInstance *tr)
{
  int k, count = 0;
  nodeptr *scratch = (nodeptr *)rax_malloc(2 * tr->mxtips * sizeof(nodeptr));

  tr->start  = tr->nodep[1];
  tr->rooted = PLL_FALSE;   /* TODO why is tr->rooted set to PLL_FALSE here ?*/

  /* snapshot the inner-node records before they are renumbered */
  for (k = tr->mxtips + 1; k <= (tr->mxtips + tr->mxtips - 1); k++)
    scratch[k] = tr->nodep[k];

  reorderNodes(tr, scratch, tr->start->back, &count);

  rax_free(scratch);
}
/** @brief Create a string hash table with at least \a n buckets.
 *
 *  The bucket count is rounded up to the next prime from a fixed table;
 *  buckets are zero-initialized (empty collision chains).
 *
 *  @param n  minimum number of buckets
 *
 *  @return the freshly allocated table
 */
stringHashtable *initStringHashTable(hashNumberType n)
{
  /* prime table sizes keep clustering low for the string hash */
  static const hashNumberType initTable[] = {53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593,
                                             49157, 98317, 196613, 393241, 786433, 1572869, 3145739,
                                             6291469, 12582917, 25165843, 50331653, 100663319,
                                             201326611, 402653189, 805306457, 1610612741};

  stringHashtable *h = (stringHashtable*)rax_malloc(sizeof(stringHashtable));

  hashNumberType
    tableSize,
    i,
    primeTableLength = sizeof(initTable)/sizeof(initTable[0]),
    maxSize = (hashNumberType)-1;

  assert(n <= maxSize);

  i = 0;

  /* BUGFIX: test the index bound BEFORE reading initTable[i]; the original
     evaluated initTable[i] < n first, reading one element past the end of the
     array when no prime was large enough */
  while(i < primeTableLength && initTable[i] < n)
    i++;

  assert(i < primeTableLength);

  tableSize = initTable[i];

  h->table = (stringEntry**)rax_calloc(tableSize, sizeof(stringEntry*));
  h->tableSize = tableSize;

  return h;
}
/** @brief Allocates a scratch buffer of model parameter sets, one per partition.
 *
 *  Each entry gets eigenvalue/eigenvector arrays, substitution rates, base
 *  frequencies and tip vectors sized according to the partition's data type.
 *  The arrays are left uninitialized.
 *
 *  @param tr  tree instance
 *
 *  @return the allocated parameter buffer (caller owns it)
 */
static pInfo *allocParams(tree *tr)
{
  int model;
  pInfo *buffer = (pInfo*)rax_malloc(sizeof(pInfo) * tr->NumberOfModels);

  for (model = 0; model < tr->NumberOfModels; model++)
    {
      const partitionLengths *pl = getPartitionLengths(&(tr->partitionData[model]));

      buffer[model].EIGN        = (double*)rax_malloc(pl->eignLength * sizeof(double));
      buffer[model].EV          = (double*)rax_malloc(pl->evLength * sizeof(double));
      buffer[model].EI          = (double*)rax_malloc(pl->eiLength * sizeof(double));
      buffer[model].substRates  = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
      buffer[model].frequencies = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
      buffer[model].tipVector   = (double *)rax_malloc(pl->tipVectorLength * sizeof(double));
    }

  return buffer;
}
static int parse_newick (pllStack ** stack, int * inp) { pllNewickNodeInfo * item = NULL; int item_active = 0; pllLexToken token; int input; pllLexToken prev_token; int nop = 0; /* number of open parentheses */ int depth = 0; prev_token.tokenType = PLL_TOKEN_UNKNOWN; input = *inp; NEXT_TOKEN while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN) { switch (token.tokenType) { case PLL_TOKEN_OPAREN: #ifdef PLLDEBUG printf ("PLL_TOKEN_OPAREN\n"); #endif ++nop; memcpy (&prev_token, &token, sizeof (pllLexToken)); ++depth; break; case PLL_TOKEN_CPAREN: #ifdef PLLDEBUG printf ("PLL_TOKEN_CPAREN\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_NUMBER && prev_token.tokenType != PLL_TOKEN_FLOAT) return (0); if (!nop) return (0); --nop; memcpy (&prev_token, &token, sizeof (pllLexToken)); /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nec //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char)); strcpy (item->name, "INTERNAL_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } item->depth = depth; pllStackPush (stack, item); item_active = 1; /* active = 1 */ item = NULL; --depth; break; case PLL_TOKEN_STRING: #ifdef PLLDEBUG printf ("PLL_TOKEN_STRING %.*s\n", token.len, token.lexeme); #endif if (prev_token.tokenType != PLL_TOKEN_OPAREN && prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_COMMA) return (0); if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); 
//item->name = strndup (token.lexeme, token.len); item->name = (char *) rax_malloc ((token.len + 1) * sizeof (char)); strncpy (item->name, token.lexeme, token.len); item->name[token.len] = 0; item_active = 1; item->depth = depth; if (prev_token.tokenType == PLL_TOKEN_COMMA || prev_token.tokenType == PLL_TOKEN_OPAREN || prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1; memcpy (&prev_token, &token, sizeof (pllLexToken)); break; case PLL_TOKEN_FLOAT: case PLL_TOKEN_NUMBER: #ifdef PLLDEBUG if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n"); #endif if (prev_token.tokenType != PLL_TOKEN_OPAREN && prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_COLON && prev_token.tokenType != PLL_TOKEN_UNKNOWN && prev_token.tokenType != PLL_TOKEN_COMMA) return (0); if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); if (prev_token.tokenType == PLL_TOKEN_COLON) { //item->branch = strndup (token.lexeme, token.len); item->branch = (char *) rax_malloc ((token.len + 1) * sizeof (char)); strncpy (item->branch, token.lexeme, token.len); item->branch[token.len] = 0; } else { if (prev_token.tokenType == PLL_TOKEN_COMMA || prev_token.tokenType == PLL_TOKEN_OPAREN || prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1; //if (prev_token.tokenType != PLL_TOKEN_UNKNOWN) ++ indent; //item->name = strndup (token.lexeme, token.len); item->name = (char *) rax_malloc ((token.len + 1) * sizeof (char)); strncpy (item->name, token.lexeme, token.len); item->name[token.len] = 0; } item_active = 1; item->depth = depth; memcpy (&prev_token, &token, sizeof (pllLexToken)); break; case PLL_TOKEN_COLON: #ifdef PLLDEBUG printf ("PLL_TOKEN_COLON\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_FLOAT && prev_token.tokenType != PLL_TOKEN_NUMBER) return (0); memcpy (&prev_token, &token, sizeof 
(pllLexToken)); break; case PLL_TOKEN_COMMA: #ifdef PLLDEBUG printf ("PLL_TOKEN_COMMA\n"); #endif if (prev_token.tokenType != PLL_TOKEN_CPAREN && prev_token.tokenType != PLL_TOKEN_STRING && prev_token.tokenType != PLL_TOKEN_FLOAT && prev_token.tokenType != PLL_TOKEN_NUMBER) return (0); memcpy (&prev_token, &token, sizeof (pllLexToken)); /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); // possibly not nece //if (item->name == NULL) item->name = strdup ("INTERNAL_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char)); strcpy (item->name, "INTERNAL_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } item->depth = depth; pllStackPush (stack, item); item_active = 0; item = NULL; break; case PLL_TOKEN_SEMICOLON: #ifdef PLLDEBUG printf ("PLL_TOKEN_SEMICOLON\n"); #endif /* push to the stack */ if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); //if (item->name == NULL) item->name = strdup ("ROOT_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char)); strcpy (item->name, "ROOT_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } pllStackPush (stack, item); item_active = 0; item = NULL; break; default: #ifdef __DEBUGGING_MODE printf ("Unknown token: %d\n", token.tokenType); #endif // TODO: Finish this part and add error codes break; } NEXT_TOKEN CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE); } if (item_active) { if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); //if (item->name == NULL) item->name = strdup 
("ROOT_NODE"); if (item->name == NULL) { item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char)); strcpy (item->name, "ROOT_NODE"); } //if (item->branch == NULL) item->branch = strdup ("0.000000"); if (item->branch == NULL) { item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char)); strcpy (item->branch, "0.000000"); } pllStackPush (stack, item); item_active = 0; } if (nop || token.tokenType == PLL_TOKEN_UNKNOWN) { return (0); } return (1); }
/** @brief Apply a column-exclusion file to the alignment.
 *
 *  Reads ranges of the form "number-number" from the exclude file, marks the
 *  covered alignment columns as excluded, and writes companion files (reduced
 *  secondary-structure file, reduced partition file, reduced alignment) next
 *  to the originals. Exits the program on any format or range error.
 *
 *  @param tr    tree instance (provides models, names, data vectors)
 *  @param adef  analysis options (multiple-model / secondary-structure flags)
 *  @param rdta  raw alignment data
 */
void handleExcludeFile(tree *tr, analdef *adef, rawdata *rdta)
{
  FILE *f;
  char buf[256];
  int
    ch, j, value, i,
    state = 0,
    numberOfModels = 0,
    l = -1,
    excludeRegion   = 0,
    excludedColumns = 0,
    modelCounter    = 1;
  int *excludeArray, *countArray, *modelList;
  int **partitions;

  printf("\n\n");

  f = myfopen(excludeFileName, "rb");

  /* first pass: count '-' characters = number of ranges in the file */
  while((ch = getc(f)) != EOF)
    {
      if(ch == '-')
        numberOfModels++;
    }

  /* per-site bookkeeping, 1-based (slot 0 unused) */
  excludeArray = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
  countArray   = (int*)rax_malloc(sizeof(int) * (rdta->sites + 1));
  modelList    = (int *)rax_malloc((rdta->sites + 1)* sizeof(int));

  partitions = (int **)rax_malloc(sizeof(int *) * numberOfModels);
  for(i = 0; i < numberOfModels; i++)
    partitions[i] = (int *)rax_malloc(sizeof(int) * 2);

  rewind(f);

  /* second pass: parse "lower-upper" ranges with a 4-state machine:
     0 = looking for first digit, 1 = reading lower bound / expecting '-',
     2 = first digit of upper bound, 3 = reading upper bound / expecting end */
  while((ch = getc(f)) != EOF)
    {
      switch(state)
        {
        case 0: /* get first number */
          if(!whitechar(ch))
            {
              if(!isNum(ch))
                {
                  printf("exclude file must have format: number-number [number-number]*\n");
                  exit(-1);
                }
              l = 0;
              buf[l++] = ch;
              state = 1;
            }
          break;
        case 1: /*get the number or detect - */
          if(!isNum(ch) && ch != '-')
            {
              printf("exclude file must have format: number-number [number-number]*\n");
              exit(-1);
            }
          if(isNum(ch))
            {
              buf[l++] = ch;
            }
          else
            {
              buf[l++] = '\0';
              value = atoi(buf);
              partitions[excludeRegion][0] = value;
              state = 2;
            }
          break;
        case 2: /*get second number */
          if(!isNum(ch))
            {
              printf("exclude file must have format: number-number [number-number]*\n");
              exit(-1);
            }
          l = 0;
          buf[l++] = ch;
          state = 3;
          break;
        case 3: /* continue second number or find end */
          if(!isNum(ch) && !whitechar(ch))
            {
              printf("exclude file must have format: number-number [number-number]*\n");
              exit(-1);
            }
          if(isNum(ch))
            {
              buf[l++] = ch;
            }
          else
            {
              buf[l++] = '\0';
              value = atoi(buf);
              partitions[excludeRegion][1] = value;
              excludeRegion++;
              state = 0;
            }
          break;
        default:
          assert(0);
        }
    }

  /* a range terminated by EOF rather than whitespace */
  if(state == 3)
    {
      buf[l++] = '\0';
      value = atoi(buf);
      partitions[excludeRegion][1] = value;
      excludeRegion++;
    }

  assert(excludeRegion == numberOfModels);

  for(i = 0; i <= rdta->sites; i++)
    {
      excludeArray[i] = -1;   /* -1 = column not excluded */
      countArray[i] = 0;
      modelList[i] = -1;
    }

  /* validate every range and mark its columns, warning on overlaps */
  for(i = 0; i < numberOfModels; i++)
    {
      int lower = partitions[i][0];
      int upper = partitions[i][1];

      if(lower > upper)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("lower bound %d is greater than upper bound %d\n", lower, upper);
          exit(-1);
        }

      if(lower == 0)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("lower bound must be greater than 0\n");
          exit(-1);
        }

      if(upper > rdta->sites)
        {
          printf("Misspecified exclude region %d\n", i);
          printf("upper bound %d must be smaller than %d\n", upper, (rdta->sites + 1));
          exit(-1);
        }

      for(j = lower; j <= upper; j++)
        {
          if(excludeArray[j] != -1)
            {
              printf("WARNING: Exclude regions %d and %d overlap at position %d (already excluded %d times)\n",
                     excludeArray[j], i, j, countArray[j]);
            }
          excludeArray[j] = i;
          countArray[j] = countArray[j] + 1;
        }
    }

  /* compact the model assignment over the surviving columns */
  for(i = 1; i <= rdta->sites; i++)
    {
      if(excludeArray[i] != -1)
        excludedColumns++;
      else
        {
          modelList[modelCounter] = tr->model[i];
          modelCounter++;
        }
    }

  printf("You have excluded %d out of %d columns\n", excludedColumns, rdta->sites);

  if(excludedColumns == rdta->sites)
    {
      printf("Error: You have excluded all sites\n");
      exit(-1);
    }

  /* write a secondary-structure file restricted to the surviving columns */
  if(adef->useSecondaryStructure && (excludedColumns > 0))
    {
      char mfn[2048];
      int countColumns;
      FILE *newFile;

      assert(adef->useMultipleModel);

      strcpy(mfn, secondaryStructureFileName);
      strcat(mfn, ".");
      strcat(mfn, excludeFileName);

      newFile = myfopen(mfn, "wb");

      printBothOpen("\nA secondary structure file with analogous structure assignments for non-excluded columns is printed to file %s\n", mfn);

      for(i = 1, countColumns = 0; i <= rdta->sites; i++)
        {
          if(excludeArray[i] == -1)
            fprintf(newFile, "%c", tr->secondaryStructureInput[i - 1]);
          else
            countColumns++;
        }

      assert(countColumns == excludedColumns);

      fprintf(newFile,"\n");

      fclose(newFile);
    }

  /* write a partition file restricted to the surviving columns */
  if(adef->useMultipleModel && (excludedColumns > 0))
    {
      char mfn[2048];
      FILE *newFile;

      strcpy(mfn, modelFileName);
      strcat(mfn, ".");
      strcat(mfn, excludeFileName);

      newFile = myfopen(mfn, "wb");

      printf("\nA partition file with analogous model assignments for non-excluded columns is printed to file %s\n", mfn);

      for(i = 0; i < tr->NumberOfModels; i++)
        {
          boolean modelStillExists = FALSE;

          /* a model may have lost all of its columns; skip it in that case */
          for(j = 1; (j <= rdta->sites) && (!modelStillExists); j++)
            {
              if(modelList[j] == i)
                modelStillExists = TRUE;
            }

          if(modelStillExists)
            {
              int k = 1;
              int lower, upper;
              int parts = 0;

              /* emit the model name according to its data type and options */
              switch(tr->partitionData[i].dataType)
                {
                case AA_DATA:
                  {
                    char AAmodel[1024];

                    if(tr->partitionData[i].ascBias)
                      {
                        strcpy(AAmodel, "ASC_");
                        strcat(AAmodel, protModels[tr->partitionData[i].protModels]);
                      }
                    else
                      strcpy(AAmodel, protModels[tr->partitionData[i].protModels]);
                    if(tr->partitionData[i].usePredefinedProtFreqs == FALSE)
                      strcat(AAmodel, "F");
                    if(tr->partitionData[i].optimizeBaseFrequencies)
                      strcat(AAmodel, "X");
                    assert(!(tr->partitionData[i].optimizeBaseFrequencies && tr->partitionData[i].usePredefinedProtFreqs));
                    fprintf(newFile, "%s, ", AAmodel);
                  }
                  break;
                case DNA_DATA:
                  if(tr->partitionData[i].optimizeBaseFrequencies)
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_DNAX, ");
                      else
                        fprintf(newFile, "DNAX, ");
                    }
                  else
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_DNA, ");
                      else
                        fprintf(newFile, "DNA, ");
                    }
                  break;
                case BINARY_DATA:
                  if(tr->partitionData[i].optimizeBaseFrequencies)
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_BINX, ");
                      else
                        fprintf(newFile, "BINX, ");
                    }
                  else
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_BIN, ");
                      else
                        fprintf(newFile, "BIN, ");
                    }
                  break;
                case GENERIC_32:
                  if(tr->partitionData[i].optimizeBaseFrequencies)
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_MULTIX, ");
                      else
                        fprintf(newFile, "MULTIX, ");
                    }
                  else
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_MULTI, ");
                      else
                        fprintf(newFile, "MULTI, ");
                    }
                  break;
                case GENERIC_64:
                  if(tr->partitionData[i].optimizeBaseFrequencies)
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_CODONX, ");
                      else
                        fprintf(newFile, "CODONX, ");
                    }
                  else
                    {
                      if(tr->partitionData[i].ascBias)
                        fprintf(newFile, "ASC_CODON, ");
                      else
                        fprintf(newFile, "CODON, ");
                    }
                  break;
                default:
                  assert(0);
                }

              fprintf(newFile, "%s = ", tr->partitionData[i].partitionName);

              /* emit the model's column runs as "lower-upper" (or single sites) */
              while(k <= rdta->sites)
                {
                  if(modelList[k] == i)
                    {
                      lower = k;
                      while((modelList[k + 1] == i) && (k <= rdta->sites))
                        k++;
                      upper = k;

                      if(lower == upper)
                        {
                          if(parts == 0)
                            fprintf(newFile, "%d", lower);
                          else
                            fprintf(newFile, ",%d", lower);
                        }
                      else
                        {
                          if(parts == 0)
                            fprintf(newFile, "%d-%d", lower, upper);
                          else
                            fprintf(newFile, ",%d-%d", lower, upper);
                        }
                      parts++;
                    }
                  k++;
                }
              fprintf(newFile, "\n");
            }
        }
      fclose(newFile);
    }

  /* write the reduced alignment itself */
  {
    FILE *newFile;
    char mfn[2048];

    strcpy(mfn, seq_file);
    strcat(mfn, ".");
    strcat(mfn, excludeFileName);

    newFile = myfopen(mfn, "wb");

    printf("\nAn alignment file with excluded columns is printed to file %s\n\n\n", mfn);

    fprintf(newFile, "%d %d\n", tr->mxtips, rdta->sites - excludedColumns);

    for(i = 1; i <= tr->mxtips; i++)
      {
        unsigned char *tipI = &(rdta->y[i][1]);
        fprintf(newFile, "%s ", tr->nameList[i]);

        for(j = 0; j < rdta->sites; j++)
          {
            if(excludeArray[j + 1] == -1)
              fprintf(newFile, "%c", getInverseMeaning(tr->dataVector[j + 1], tipI[j]));
          }

        fprintf(newFile, "\n");
      }

    fclose(newFile);
  }

  fclose(f);

  for(i = 0; i < numberOfModels; i++)
    rax_free(partitions[i]);
  rax_free(partitions);
  rax_free(excludeArray);
  rax_free(countArray);
  rax_free(modelList);
}
/**
 * @brief Parse the identifier part (everything left of '=') of one partition-file
 *        line and fill in tr->initialPartitionData[modelNumber] accordingly.
 *
 * The identifier has the form "MODEL, partitionName", optionally preceded by a
 * bracketed external-file designator "[prot~fileName]" or "[asc~fileName]" that
 * makes the protein substitution matrix or the ascertainment-bias correction be
 * read from a file.  On return, *ch points at the '=' character of the line.
 * Any syntax error terminates the whole program via errorExit()/exit(-1).
 *
 * @param ch           in/out cursor into the current partition-file line
 * @param modelNumber  index of the partition entry being filled
 * @param tr           tree whose initialPartitionData[] is written
 */
static void analyzeIdentifier(char **ch, int modelNumber, tree *tr)
{
  char
    *start = *ch,
    ident[2048] = "",       /* whitespace-stripped copy of the text before '=' */
    model[2048] = "",       /* model token: everything in ident before the first ',' */
    thisModel[2048] = "";   /* scratch buffer for candidate model names */

  int
    i = 0,
    n,
    j,
    containsComma = 0;

  /* Copy everything up to '=' into ident[], dropping blanks and tabs.
     A line without '=' is a fatal parse error. */
  while(**ch != '=')
    {
      if(**ch == '\n' || **ch == '\r')
        {
          printf("\nPartition file parsing error!\n");
          printf("Each line must contain a \"=\" character\n");
          printf("Offending line: %s\n", start);
          printf("RAxML will exit now.\n\n");
          errorExit(-1);
        }
      if(**ch != ' ' && **ch != '\t')
        {
          ident[i] = **ch;
          i++;
        }
      *ch = *ch + 1;
    }

  ident[i] = '\0';
  n = i;
  i = 0;

  /* The identifier must contain at least one ',' separating model and name. */
  for(i = 0; i < n; i++)
    if(ident[i] == ',')
      containsComma = 1;

  if(!containsComma)
    {
      printf("Error, model file must have format: Substitution model, then a comma, and then the partition name\n");
      exit(-1);
    }
  else
    {
      boolean
        analyzeRest = TRUE,      /* FALSE once an external protein file fully specifies the model */
        useExternalFile = FALSE,
        found = FALSE;

      int
        openBracket = 0,
        closeBracket = 0,
        openPos = 0,
        closePos = 0;

      i = 0;

      /* Extract the model token (up to the first ',') and count brackets,
         which indicate an external model/ASC file specification. */
      while(ident[i] != ',')
        {
          if(ident[i] == '[')
            {
              openPos = i;
              openBracket++;
            }
          if(ident[i] == ']')
            {
              closePos = i;
              closeBracket++;
            }
          model[i] = ident[i];
          i++;
        }

      /* Exactly one well-ordered '[' ']' pair selects external-file mode;
         anything else with brackets is a fatal error. */
      if(closeBracket > 0 || openBracket > 0)
        {
          if((closeBracket == 1) && (openBracket == 1) && (openPos < closePos))
            useExternalFile = TRUE;
          else
            {
              printf("\nError: Apparently you want to specify a user-defined protein substitution model\n");
              printf("or ascertainment bias correction model that shall be read from file\n");
              printf("It must be enclosed in opening and closing bracktes like this: [prot=fileName] or [asc=fileName]\n\n");
              printf("you specified: %s\n\n", model);
              exit(-1);
            }
        }

      if(useExternalFile)
        {
          /* NOTE(review): the error text above shows '=' as separator, but the
             parser below actually splits designator and file name on '~'. */
          char
            designator[2048] = "",  /* "prot" or "asc" */
            fileName[2048] = "";

          int
            pos,
            index,
            lower = 0,
            upper = i - 1;

          boolean
            isProteinFile = TRUE;

          /* Strip the enclosing brackets. */
          while(model[lower] == '[')
            lower++;
          while(model[upper] == ']')
            upper--;

          assert(lower < upper);

          index = lower;
          pos = 0;

          /* Read the designator up to the '~' separator. */
          while(model[index] != '~')
            {
              designator[pos] = model[index];
              pos++;
              index++;
            }
          designator[pos] = '\0';

          if(strcmp(designator, "asc") == 0)
            isProteinFile = FALSE;
          else
            {
              if(strcmp(designator, "prot") == 0)
                isProteinFile = TRUE;
              else
                {
                  printf("Error external partition file type %s does not exist\n", designator);
                  printf("Available file types: asc and prot\n");
                  exit(-1);
                }
            }

          /* Skip the '~' and read the file name up to ']'. */
          while(model[index] == '~')
            index++;

          pos = 0;
          while(model[index] != ']')
            {
              fileName[pos] = model[index];
              index++;
              pos++;
            }
          fileName[pos] = '\0';

          if(!filexists(fileName))
            {
              printf("\n\ncustom protein substitution or ascertainment bias file [%s] you want to use does not exist!\n", fileName);
              printf("you need to specify the full path\n");
              printf("the file name shall not contain blanks!\n\n");
              exit(-1);
            }

          if(isProteinFile)
            {
              /* External protein matrix fully determines the model; skip the
                 model-name matching below. */
              strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, fileName);
              /*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/
              tr->initialPartitionData[modelNumber].protModels = PROT_FILE;
              tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
              tr->initialPartitionData[modelNumber].dataType = AA_DATA;
              analyzeRest = FALSE;
            }
          else
            {
              /* ASC file: the actual model name follows after a second comma,
                 i.e. "[asc~file], MODEL, name" — re-extract model[] from there. */
              int
                newIndex = 0;

              strcpy(tr->initialPartitionData[modelNumber].ascFileName, fileName);

              i = 0;
              while(ident[i] != ',')
                {
                  if(ident[i] == '\0')
                    {
                      printf("Expecting two commas in string %s\n", ident);
                      exit(-1);
                    }
                  i++;
                }
              i++;
              while(ident[i] != ',')
                {
                  if(ident[i] == '\0')
                    {
                      printf("Expecting two commas in string %s\n", ident);
                      exit(-1);
                    }
                  model[newIndex] = ident[i];
                  i++;
                  newIndex++;
                }
              model[newIndex] = '\0';
            }
        }

      if(analyzeRest)
        {
          /* AA */
          /* Try every protein model name, plus the "F" (empirical frequencies)
             and "X" (ML-optimized frequencies) suffixed variants. */
          tr->initialPartitionData[modelNumber].ascBias = FALSE;

          for(i = 0; i < NUM_PROT_MODELS && !found; i++)
            {
              strcpy(thisModel, protModels[i]);

              if(strcasecmp(model, thisModel) == 0)
                {
                  tr->initialPartitionData[modelNumber].protModels = i;
                  tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
                  tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                  found = TRUE;
                }

              if(!found)
                {
                  /* GTR/GTR_UNLINKED have no "F" variant. */
                  if(i != GTR && i != GTR_UNLINKED)
                    {
                      strcpy(thisModel, protModels[i]);
                      strcat(thisModel, "F");
                      if(strcasecmp(model, thisModel) == 0)
                        {
                          tr->initialPartitionData[modelNumber].protModels = i;
                          tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                          tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                          found = TRUE;
                          if(tr->initialPartitionData[modelNumber].protModels == AUTO)
                            {
                              printf("\nError: Option AUTOF has been deprecated, exiting\n\n");
                              errorExit(-1);
                            }
                        }
                    }
                }

              if(!found)
                {
                  strcpy(thisModel, protModels[i]);
                  strcat(thisModel, "X");
                  if(strcasecmp(model, thisModel) == 0)
                    {
                      tr->initialPartitionData[modelNumber].protModels = i;
                      tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                      tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                      tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                      found = TRUE;
                      if(tr->initialPartitionData[modelNumber].protModels == AUTO)
                        {
                          printf("\nError: Option AUTOX has been deprecated, exiting\n\n");
                          errorExit(-1);
                        }
                    }
                }

              /* GTR-type protein models always estimate their frequencies. */
              if(found && (tr->initialPartitionData[modelNumber].protModels == GTR || tr->initialPartitionData[modelNumber].protModels == GTR_UNLINKED))
                tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
            }

          /* AA with Asc bias*/
          /* Same matching as above for "ASC_"-prefixed protein model names. */
          if(!found)
            {
              for(i = 0; i < NUM_PROT_MODELS && !found; i++)
                {
                  strcpy(thisModel, "ASC_");
                  strcat(thisModel, protModels[i]);

                  if(strcasecmp(model, thisModel) == 0)
                    {
                      tr->initialPartitionData[modelNumber].protModels = i;
                      tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE;
                      tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                      found = TRUE;
                    }

                  /* NOTE(review): the "F"/"X" candidates below are built WITHOUT
                     the "ASC_" prefix, so inputs like "ASC_JTTF" never match here
                     and fall through to the unknown-model error — looks like a
                     latent bug, TODO confirm against upstream RAxML. */
                  if(!found)
                    {
                      if(i != GTR && i != GTR_UNLINKED)
                        {
                          strcpy(thisModel, protModels[i]);
                          strcat(thisModel, "F");
                          if(strcasecmp(model, thisModel) == 0)
                            {
                              tr->initialPartitionData[modelNumber].protModels = i;
                              tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                              tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                              found = TRUE;
                            }
                        }
                    }

                  if(!found)
                    {
                      strcpy(thisModel, protModels[i]);
                      strcat(thisModel, "X");
                      if(strcasecmp(model, thisModel) == 0)
                        {
                          tr->initialPartitionData[modelNumber].protModels = i;
                          tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                          tr->initialPartitionData[modelNumber].dataType = AA_DATA;
                          tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                          found = TRUE;
                        }
                    }

                  if(found)
                    tr->initialPartitionData[modelNumber].ascBias = TRUE;

                  if(found && (tr->initialPartitionData[modelNumber].protModels == GTR || tr->initialPartitionData[modelNumber].protModels == GTR_UNLINKED))
                    tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                }
            }

          /* Non-protein data types: DNA, BIN(ary), MULTI (generic 32-state),
             CODON (generic 64-state) — each with optional "X" (optimize base
             frequencies) and/or "ASC_" (ascertainment bias) decorations. */
          if(!found)
            {
              if(strcasecmp(model, "DNA") == 0 || strcasecmp(model, "DNAX") == 0 ||
                 strcasecmp(model, "ASC_DNA") == 0 || strcasecmp(model, "ASC_DNAX") == 0)
                {
                  tr->initialPartitionData[modelNumber].protModels = -1;
                  tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                  tr->initialPartitionData[modelNumber].dataType = DNA_DATA;

                  if(strcasecmp(model, "DNAX") == 0 || strcasecmp(model, "ASC_DNAX") == 0)
                    {
                      if(strcasecmp(model, "ASC_DNAX") == 0)
                        tr->initialPartitionData[modelNumber].ascBias = TRUE;
                      else
                        tr->initialPartitionData[modelNumber].ascBias = FALSE;
                      tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                    }
                  else
                    {
                      if(strcasecmp(model, "ASC_DNA") == 0)
                        tr->initialPartitionData[modelNumber].ascBias = TRUE;
                      else
                        tr->initialPartitionData[modelNumber].ascBias = FALSE;
                      tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
                    }
                  found = TRUE;
                }
              else
                {
                  if(strcasecmp(model, "BIN") == 0 || strcasecmp(model, "BINX") == 0 ||
                     strcasecmp(model, "ASC_BIN") == 0 || strcasecmp(model, "ASC_BINX") == 0)
                    {
                      tr->initialPartitionData[modelNumber].protModels = -1;
                      tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                      tr->initialPartitionData[modelNumber].dataType = BINARY_DATA;

                      if(strcasecmp(model, "BINX") == 0 || strcasecmp(model, "ASC_BINX") == 0)
                        {
                          if(strcasecmp(model, "ASC_BINX") == 0)
                            tr->initialPartitionData[modelNumber].ascBias = TRUE;
                          else
                            tr->initialPartitionData[modelNumber].ascBias = FALSE;
                          tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                        }
                      else
                        {
                          if(strcasecmp(model, "ASC_BIN") == 0)
                            tr->initialPartitionData[modelNumber].ascBias = TRUE;
                          else
                            tr->initialPartitionData[modelNumber].ascBias = FALSE;
                          tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
                        }
                      found = TRUE;
                    }
                  else
                    {
                      if(strcasecmp(model, "MULTI") == 0 || strcasecmp(model, "MULTIX") == 0 ||
                         strcasecmp(model, "ASC_MULTI") == 0 || strcasecmp(model, "ASC_MULTIX") == 0)
                        {
                          tr->initialPartitionData[modelNumber].protModels = -1;
                          tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                          tr->initialPartitionData[modelNumber].dataType = GENERIC_32;

                          if(strcasecmp(model, "MULTIX") == 0 || strcasecmp(model, "ASC_MULTIX") == 0)
                            {
                              if(strcasecmp(model, "ASC_MULTIX") == 0)
                                tr->initialPartitionData[modelNumber].ascBias = TRUE;
                              else
                                tr->initialPartitionData[modelNumber].ascBias = FALSE;
                              tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                            }
                          else
                            {
                              if(strcasecmp(model, "ASC_MULTI") == 0)
                                tr->initialPartitionData[modelNumber].ascBias = TRUE;
                              else
                                tr->initialPartitionData[modelNumber].ascBias = FALSE;
                              tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
                            }
                          found = TRUE;
                        }
                      else
                        {
                          if(strcasecmp(model, "CODON") == 0 || strcasecmp(model, "CODONX") == 0 ||
                             strcasecmp(model, "ASC_CODON") == 0 || strcasecmp(model, "ASC_CODONX") == 0)
                            {
                              tr->initialPartitionData[modelNumber].protModels = -1;
                              tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = FALSE;
                              tr->initialPartitionData[modelNumber].dataType = GENERIC_64;

                              if(strcasecmp(model, "CODONX") == 0 || strcasecmp(model, "ASC_CODONX") == 0)
                                {
                                  if(strcasecmp(model, "ASC_CODONX") == 0)
                                    tr->initialPartitionData[modelNumber].ascBias = TRUE;
                                  else
                                    tr->initialPartitionData[modelNumber].ascBias = FALSE;
                                  tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = TRUE;
                                }
                              else
                                {
                                  if(strcasecmp(model, "ASC_CODON") == 0)
                                    tr->initialPartitionData[modelNumber].ascBias = TRUE;
                                  else
                                    tr->initialPartitionData[modelNumber].ascBias = FALSE;
                                  tr->initialPartitionData[modelNumber].optimizeBaseFrequencies = FALSE;
                                }
                              found = TRUE;
                            }
                        }
                    }
                }
            }

          if(!found)
            {
              printf("ERROR: you specified the unknown model %s for partition %d\n", model, modelNumber);
              exit(-1);
            }
        }

      /* Everything after the first ',' is the partition name; copy it into a
         freshly allocated, NUL-terminated string. */
      i = 0;
      while(ident[i++] != ',');

      tr->initialPartitionData[modelNumber].partitionName = (char*)rax_malloc((n - i + 1) * sizeof(char));

      j = 0;
      while(i < n)
        tr->initialPartitionData[modelNumber].partitionName[j++] = ident[i++];

      tr->initialPartitionData[modelNumber].partitionName[j] = '\0';
    }
}
/**
 * @brief Read and parse the partition (model) file, assigning every alignment
 *        site to a partition model.
 *
 * First pass counts non-empty lines (= number of partitions) and allocates
 * tr->initialPartitionData; second pass stores each line and then parses it.
 * Each line has the form "MODEL, name = ranges" where ranges is a comma list
 * of single sites, "lo-hi" spans, or "lo-hi\\m" strided spans.  Parsed ranges
 * are stored as triples (lower, upper, modulo) in partitions[i][1..], with
 * partitions[i][0] holding the pair count and modulo == -1 meaning "no stride".
 * Finally tr->model[site] is filled for every site and verified to be complete.
 * Any parse error terminates the program.
 *
 * Control flow uses goto labels: numberPairs (next range on the same line),
 * SINGLE_NUMBER (range that is a single site), parsed (line done).
 *
 * @param adef  analysis settings (pattern compression, per-gene branch lengths)
 * @param rdta  raw alignment data (site count)
 * @param tr    tree; receives initialPartitionData, model[], NumberOfModels
 */
void parsePartitions(analdef *adef, rawdata *rdta, tree *tr)
{
  FILE *f;
  int numberOfModels = 0;
  int nbytes = 0;
  char *ch;
  char *cc = (char *)NULL;
  char **p_names;          /* one stored copy of each non-empty line */
  int n, i, l;
  int lower, upper, modulo;
  char buf[256];           /* scratch for number tokens */
  int **partitions;        /* per-partition (count, lo, hi, modulo, ...) arrays */
  int pairsCount;
  int as, j;
  int k;

  f = myfopen(modelFileName, "rb");

  /* Pass 1: count non-empty lines to size the arrays. */
  while(myGetline(&cc, &nbytes, f) > -1)
    {
      if(!lineContainsOnlyWhiteChars(cc))
        {
          numberOfModels++;
        }
      if(cc)
        rax_free(cc);
      cc = (char *)NULL;
    }

  rewind(f);

  p_names = (char **)rax_malloc(sizeof(char *) * numberOfModels);
  partitions = (int **)rax_malloc(sizeof(int *) * numberOfModels);
  tr->initialPartitionData = (pInfo*)rax_malloc(sizeof(pInfo) * numberOfModels);

  /* Defaults before analyzeIdentifier() overrides them per line. */
  for(i = 0; i < numberOfModels; i++)
    {
      tr->initialPartitionData[i].protModels = adef->proteinMatrix;
      tr->initialPartitionData[i].usePredefinedProtFreqs = adef->protEmpiricalFreqs;
      tr->initialPartitionData[i].optimizeBaseFrequencies = FALSE;
      tr->initialPartitionData[i].dataType = -1;
    }

  for(i = 0; i < numberOfModels; i++)
    partitions[i] = (int *)NULL;

  /* Pass 2: store each non-empty line. */
  i = 0;
  while(myGetline(&cc, &nbytes, f) > -1)
    {
      if(!lineContainsOnlyWhiteChars(cc))
        {
          n = strlen(cc);
          p_names[i] = (char *)rax_malloc(sizeof(char) * (n + 1));
          strcpy(&(p_names[i][0]), cc);
          i++;
        }
      if(cc)
        rax_free(cc);
      cc = (char *)NULL;
    }

  /* Parse each stored line. */
  for(i = 0; i < numberOfModels; i++)
    {
      ch = p_names[i];
      pairsCount = 0;
      skipWhites(&ch);

      if(*ch == '=')
        {
          printf("Identifier missing prior to '=' in %s\n", p_names[i]);
          exit(-1);
        }

      /* Consumes "MODEL, name" and leaves ch on the '='. */
      analyzeIdentifier(&ch, i, tr);
      ch++;

    numberPairs:
      /* Grow the triple array by one (lo, hi, modulo) slot. */
      pairsCount++;
      partitions[i] = (int *)rax_realloc((void *)partitions[i], (1 + 3 * pairsCount) * sizeof(int), FALSE);
      partitions[i][0] = pairsCount;
      partitions[i][3 + 3 * (pairsCount - 1)] = -1;  /* modulo default: no stride */

      skipWhites(&ch);

      if(!isNum(*ch))
        {
          printf("%c Number expected in %s\n", *ch, p_names[i]);
          exit(-1);
        }

      l = 0;
      while(isNum(*ch))
        {
          /*printf("%c", *ch);*/
          buf[l] = *ch;
          ch++;
          l++;
        }
      buf[l] = '\0';
      lower = atoi(buf);
      partitions[i][1 + 3 * (pairsCount - 1)] = lower;

      skipWhites(&ch);

      /* NEW */
      /* A bare number (end of line or followed by ',') is a single-site range. */
      if((*ch != '-') && (*ch != ','))
        {
          if(*ch == '\0' || *ch == '\n' || *ch == '\r')
            {
              upper = lower;
              goto SINGLE_NUMBER;
            }
          else
            {
              printf("'-' or ',' expected in %s\n", p_names[i]);
              exit(-1);
            }
        }

      if(*ch == ',')
        {
          upper = lower;
          goto SINGLE_NUMBER;
        }
      /* END NEW */

      /* Consume the '-' and read the upper bound. */
      ch++;
      skipWhites(&ch);

      if(!isNum(*ch))
        {
          printf("%c Number expected in %s\n", *ch, p_names[i]);
          exit(-1);
        }

      l = 0;
      while(isNum(*ch))
        {
          buf[l] = *ch;
          ch++;
          l++;
        }
      buf[l] = '\0';
      upper = atoi(buf);

    SINGLE_NUMBER:
      partitions[i][2 + 3 * (pairsCount - 1)] = upper;

      if(upper < lower)
        {
          printf("Upper bound %d smaller than lower bound %d for this partition: %s\n", upper, lower, p_names[i]);
          exit(-1);
        }

      skipWhites(&ch);

      if(*ch == '\0' || *ch == '\n' || *ch == '\r') /* PC-LINEBREAK*/
        {
          goto parsed;
        }

      if(*ch == ',')
        {
          ch++;
          goto numberPairs;
        }

      /* Optional "\m" stride suffix (interleaved partitions, e.g. codon
         positions); only legal when pattern compression is enabled. */
      if(*ch == '\\')
        {
          ch++;
          skipWhites(&ch);

          if(!isNum(*ch))
            {
              printf("%c Number expected in %s\n", *ch, p_names[i]);
              exit(-1);
            }

          if(adef->compressPatterns == FALSE)
            {
              printf("\nError: You are not allowed to use interleaved partitions, that is, assign non-contiguous sites\n");
              printf("to the same partition model, when pattern compression is disabled via the -H flag,\n");
              printf("or when pattern compression is disabled implicitely by some other option that requires it!\n\n");
              exit(-1);
            }

          l = 0;
          while(isNum(*ch))
            {
              buf[l] = *ch;
              ch++;
              l++;
            }
          buf[l] = '\0';
          modulo = atoi(buf);
          partitions[i][3 + 3 * (pairsCount - 1)] = modulo;

          skipWhites(&ch);
          if(*ch == '\0' || *ch == '\n' || *ch == '\r')
            {
              goto parsed;
            }
          if(*ch == ',')
            {
              ch++;
              goto numberPairs;
            }
        }

      /* Anything else on the line is a fatal format error. */
      if(*ch == '/')
        {
          printf("\nRAxML detected the character \"/\" in your partition file.\n");
          printf("Did you mean to write something similar to this: \"DNA, p1=1-100\\3\" ?\n");
          printf("It's actually a backslash, not a slash, the program will exit now with an error!\n\n");
        }
      else
        {
          printf("\nRAxML detected the character \"%c\" in your partition file,\n", *ch);
          printf("while it does not belong there!\n");
          printf("\nAre you sure that your partition file complies with the RAxML partition file format?\n");
          printf("\nActually reading the manual, does indeed do help a lot\n\n");
          printf("The program will exit now with an error!\n\n");
        }

      printf("The problematic line in your partition file is this one here:\n\n");
      printf("%s\n\n", p_names[i]);
      assert(0);

    parsed:
      ;
    }

  fclose(f);

  /*********************************************************************************************************************/

  /* Expand the parsed ranges into the per-site model assignment (1-based). */
  for(i = 0; i <= rdta->sites; i++)
    tr->model[i] = -1;

  for(i = 0; i < numberOfModels; i++)
    {
      as = partitions[i][0];

      for(j = 0; j < as; j++)
        {
          lower = partitions[i][1 + j * 3];
          upper = partitions[i][2 + j * 3];
          modulo = partitions[i][3 + j * 3];

          if(modulo == -1)
            {
              for(k = lower; k <= upper; k++)
                setModel(i, k, tr->model);
            }
          else
            {
              for(k = lower; k <= upper; k += modulo)
                {
                  if(k <= rdta->sites)
                    setModel(i, k, tr->model);
                }
            }
        }
    }

  /* Every site must have been claimed by exactly one partition line. */
  for(i = 1; i < rdta->sites + 1; i++)
    {
      if(tr->model[i] == -1)
        {
          printf("ERROR: Alignment Position %d has not been assigned any model\n", i);
          exit(-1);
        }
    }

  for(i = 0; i < numberOfModels; i++)
    {
      rax_free(partitions[i]);
      rax_free(p_names[i]);
    }

  rax_free(partitions);
  rax_free(p_names);

  tr->NumberOfModels = numberOfModels;

  /* Per-gene branch lengths need one branch-length slot per partition, capped
     at the compile-time constant NUM_BRANCHES. */
  if(adef->perGeneBranchLengths)
    {
      if(tr->NumberOfModels > NUM_BRANCHES)
        {
          printf("You are trying to use %d partitioned models for an individual per-gene branch length estimate.\n", tr->NumberOfModels);
          printf("Currently only %d are allowed to improve efficiency.\n", NUM_BRANCHES);
          printf("\n");
          printf("In order to change this please replace the line \"#define NUM_BRANCHES %d\" in file \"axml.h\" \n", NUM_BRANCHES);
          printf("by \"#define NUM_BRANCHES %d\" and then re-compile RAxML.\n", tr->NumberOfModels);
          exit(-1);
        }
      else
        {
          tr->multiBranch = 1;
          tr->numBranches = tr->NumberOfModels;
        }
    }
}
/**
 * @brief Parse the RNA secondary-structure file (dot-bracket notation with up
 *        to four bracket alphabets) and set up the secondary-structure
 *        partition.
 *
 * Validates that the file contains only '(', ')', '<', '>', '[', ']', '{',
 * '}' and '.' (plus whitespace) and exactly `sites` structure characters,
 * that brackets are balanced per alphabet, and that bracketed positions fall
 * on DNA sites only.  Positions covered by brackets are re-assigned to a new
 * partition (16-, 7- or 6-state model depending on tr->secondaryStructureModel)
 * appended to tr->extendedPartitionData, and matching bracket pairs are
 * recorded symmetrically in tr->secondaryStructurePairs (0-based columns,
 * -1 = unpaired).  No-op unless adef->useSecondaryStructure is set.
 * All validation failures terminate the program.
 *
 * @param tr     tree; receives the new partition, pairs array, per-site model
 * @param adef   analysis settings; useMultipleModel is switched on if any
 *               secondary-structure columns are found
 * @param sites  number of alignment sites the structure must cover
 */
void parseSecondaryStructure(tree *tr, analdef *adef, int sites)
{
  if(adef->useSecondaryStructure)
    {
      FILE *f = myfopen(secondaryStructureFileName, "rb");

      int
        i,
        k,
        countCharacters = 0,
        ch,
        *characters,            /* the structure string as ints */
        **brackets,             /* per-alphabet nesting depth at each column, 0 = none */
        opening,
        closing,
        depth,
        numberOfSymbols,
        numSecondaryColumns;

      unsigned char bracketTypes[4][2] = {{'(', ')'}, {'<', '>'}, {'[', ']'}, {'{', '}'}};

      numberOfSymbols = 4;

      tr->secondaryStructureInput = (char*)rax_malloc(sizeof(char) * sites);

      /* Pass 1: validate the character set and count structure characters. */
      while((ch = fgetc(f)) != EOF)
        {
          if(ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '{' || ch == '}' || ch == '.')
            countCharacters++;
          else
            {
              if(!whitechar(ch))
                {
                  printf("Secondary Structure file %s contains character %c at position %d\n", secondaryStructureFileName, ch, countCharacters + 1);
                  printf("Allowed Characters are \"( ) < > [ ] { } \" and \".\" \n");
                  errorExit(-1);
                }
            }
        }

      if(countCharacters != sites)
        {
          printf("Error: Alignment length is: %d, secondary structure file has length %d\n", sites, countCharacters);
          errorExit(-1);
        }

      characters = (int*)rax_malloc(sizeof(int) * countCharacters);

      brackets = (int **)rax_malloc(sizeof(int*) * numberOfSymbols);
      for(k = 0; k < numberOfSymbols; k++)
        brackets[k] = (int*)rax_calloc(countCharacters, sizeof(int));

      rewind(f);

      /* Pass 2: store the structure characters. */
      countCharacters = 0;
      while((ch = fgetc(f)) != EOF)
        {
          if(!whitechar(ch))
            {
              tr->secondaryStructureInput[countCharacters] = ch;
              characters[countCharacters++] = ch;
            }
        }

      assert(countCharacters == sites);

      /* For each bracket alphabet: record nesting depth per column, and verify
         balance and that brackets sit on DNA columns only. */
      for(k = 0; k < numberOfSymbols; k++)
        {
          for(i = 0, opening = 0, closing = 0, depth = 0; i < countCharacters; i++)
            {
              if((characters[i] == bracketTypes[k][0] || characters[i] == bracketTypes[k][1]) &&
                 (tr->extendedDataVector[i+1] == AA_DATA || tr->extendedDataVector[i+1] == BINARY_DATA ||
                  tr->extendedDataVector[i+1] == GENERIC_32 || tr->extendedDataVector[i+1] == GENERIC_64))
                {
                  printf("Secondary Structure only for DNA character positions \n");
                  printf("I am at position %d of the secondary structure file and this is not part of a DNA partition\n", i+1);
                  errorExit(-1);
                }

              if(characters[i] == bracketTypes[k][0])
                {
                  depth++;
                  /*printf("%d %d\n", depth, i);*/
                  brackets[k][i] = depth;
                  opening++;
                }

              if(characters[i] == bracketTypes[k][1])
                {
                  brackets[k][i] = depth;
                  /*printf("%d %d\n", depth, i); */
                  depth--;
                  closing++;
                }

              if(closing > opening)
                {
                  printf("at position %d there is a closing bracket too much\n", i+1);
                  errorExit(-1);
                }
            }

          if(depth != 0)
            {
              printf("Problem: Depth: %d\n", depth);
              printf("Your secondary structure file may be missing a closing or opening paraenthesis!\n");
            }
          assert(depth == 0);

          if(countCharacters != sites)
            {
              printf("Problem: sec chars: %d sites: %d\n",countCharacters, sites);
              printf("The number of sites in the alignment does not match the length of the secondary structure file\n");
            }
          assert(countCharacters == sites);

          if(closing != opening)
            {
              printf("Number of opening brackets %d should be equal to number of closing brackets %d\n", opening, closing);
              errorExit(-1);
            }
        }

      /* Count bracketed columns, retag their data type per the chosen
         secondary-structure model, and check no column uses two alphabets. */
      for(i = 0, numSecondaryColumns = 0; i < countCharacters; i++)
        {
          int checkSum = 0;

          for(k = 0; k < numberOfSymbols; k++)
            {
              if(brackets[k][i] > 0)
                {
                  checkSum++;

                  switch(tr->secondaryStructureModel)
                    {
                    case SEC_16:
                    case SEC_16_A:
                    case SEC_16_B:
                    case SEC_16_C:
                    case SEC_16_D:
                    case SEC_16_E:
                    case SEC_16_F:
                    case SEC_16_I:
                    case SEC_16_J:
                    case SEC_16_K:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA;
                      break;
                    case SEC_6_A:
                    case SEC_6_B:
                    case SEC_6_C:
                    case SEC_6_D:
                    case SEC_6_E:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA_6;
                      break;
                    case SEC_7_A:
                    case SEC_7_B:
                    case SEC_7_C:
                    case SEC_7_D:
                    case SEC_7_E:
                    case SEC_7_F:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA_7;
                      break;
                    default:
                      assert(0);
                    }

                  numSecondaryColumns++;
                }
            }

          assert(checkSum <= 1);  /* a column may belong to at most one alphabet */
        }

      assert(numSecondaryColumns % 2 == 0);  /* columns pair up */

      /*printf("Number of secondary columns: %d merged columns: %d\n", numSecondaryColumns, numSecondaryColumns / 2);*/

      tr->numberOfSecondaryColumns = numSecondaryColumns;

      if(numSecondaryColumns > 0)
        {
          int
            model = tr->NumberOfModels,  /* index of the new partition */
            countPairs;

          pInfo
            *partBuffer = (pInfo*)rax_malloc(sizeof(pInfo) * tr->NumberOfModels);

          /* Reassign all bracketed sites to the new partition. */
          for(i = 1; i <= sites; i++)
            {
              for(k = 0; k < numberOfSymbols; k++)
                {
                  if(brackets[k][i-1] > 0)
                    tr->model[i] = model;
                }
            }

          /* now make a copy of partition data */
          for(i = 0; i < tr->NumberOfModels; i++)
            {
              partBuffer[i].partitionName = (char*)rax_malloc((strlen(tr->extendedPartitionData[i].partitionName) + 1) * sizeof(char));
              strcpy(partBuffer[i].partitionName, tr->extendedPartitionData[i].partitionName);
              strcpy(partBuffer[i].proteinSubstitutionFileName, tr->extendedPartitionData[i].proteinSubstitutionFileName);
              strcpy(partBuffer[i].ascFileName, tr->extendedPartitionData[i].ascFileName);
              partBuffer[i].dataType = tr->extendedPartitionData[i].dataType;
              partBuffer[i].protModels = tr->extendedPartitionData[i].protModels;
              partBuffer[i].usePredefinedProtFreqs = tr->extendedPartitionData[i].usePredefinedProtFreqs;
              partBuffer[i].optimizeBaseFrequencies = tr->extendedPartitionData[i].optimizeBaseFrequencies;
            }

          /* Reallocate extendedPartitionData one slot larger and copy back. */
          for(i = 0; i < tr->NumberOfModels; i++)
            rax_free(tr->extendedPartitionData[i].partitionName);
          rax_free(tr->extendedPartitionData);

          tr->extendedPartitionData = (pInfo*)rax_malloc(sizeof(pInfo) * (tr->NumberOfModels + 1));

          for(i = 0; i < tr->NumberOfModels; i++)
            {
              tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(partBuffer[i].partitionName) + 1) * sizeof(char));
              strcpy(tr->extendedPartitionData[i].partitionName, partBuffer[i].partitionName);
              strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, partBuffer[i].proteinSubstitutionFileName);
              strcpy(tr->extendedPartitionData[i].ascFileName, partBuffer[i].ascFileName);
              tr->extendedPartitionData[i].dataType = partBuffer[i].dataType;
              tr->extendedPartitionData[i].protModels= partBuffer[i].protModels;
              tr->extendedPartitionData[i].usePredefinedProtFreqs= partBuffer[i].usePredefinedProtFreqs;
              tr->extendedPartitionData[i].optimizeBaseFrequencies = partBuffer[i].optimizeBaseFrequencies;
              rax_free(partBuffer[i].partitionName);
            }

          rax_free(partBuffer);

          /* Fill the new (last) slot — after the loop above i == old
             NumberOfModels, i.e. the index of the appended partition. */
          tr->extendedPartitionData[i].partitionName = (char*)rax_malloc(64 * sizeof(char));

          switch(tr->secondaryStructureModel)
            {
            case SEC_16:
            case SEC_16_A:
            case SEC_16_B:
            case SEC_16_C:
            case SEC_16_D:
            case SEC_16_E:
            case SEC_16_F:
            case SEC_16_I:
            case SEC_16_J:
            case SEC_16_K:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 16 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA;
              break;
            case SEC_6_A:
            case SEC_6_B:
            case SEC_6_C:
            case SEC_6_D:
            case SEC_6_E:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 6 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA_6;
              break;
            case SEC_7_A:
            case SEC_7_B:
            case SEC_7_C:
            case SEC_7_D:
            case SEC_7_E:
            case SEC_7_F:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 7 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA_7;
              break;
            default:
              assert(0);
            }

          tr->extendedPartitionData[i].protModels= -1;
          tr->extendedPartitionData[i].usePredefinedProtFreqs = FALSE;

          tr->NumberOfModels++;

          /* Same per-gene branch-length cap as in parsePartitions, now with
             the extra secondary-structure partition counted in. */
          if(adef->perGeneBranchLengths)
            {
              if(tr->NumberOfModels > NUM_BRANCHES)
                {
                  printf("You are trying to use %d partitioned models for an individual per-gene branch length estimate.\n", tr->NumberOfModels);
                  printf("Currently only %d are allowed to improve efficiency.\n", NUM_BRANCHES);
                  printf("Note that the number of partitions has automatically been incremented by one to accommodate secondary structure models\n");
                  printf("\n");
                  printf("In order to change this please replace the line \"#define NUM_BRANCHES %d\" in file \"axml.h\" \n", NUM_BRANCHES);
                  printf("by \"#define NUM_BRANCHES %d\" and then re-compile RAxML.\n", tr->NumberOfModels);
                  exit(-1);
                }
              else
                {
                  tr->multiBranch = 1;
                  tr->numBranches = tr->NumberOfModels;
                }
            }

          assert(countCharacters == sites);

          tr->secondaryStructurePairs = (int*)rax_malloc(sizeof(int) * countCharacters);
          for(i = 0; i < countCharacters; i++)
            tr->secondaryStructurePairs[i] = -1;

          /*
            for(i = 0; i < countCharacters; i++)
            printf("%d", brackets[i]);
            printf("\n");
          */

          /* Match each opening bracket with the closing bracket at the same
             depth, clearing both, and record the pairing symmetrically. */
          countPairs = 0;

          for(k = 0; k < numberOfSymbols; k++)
            {
              i = 0;
              while(i < countCharacters)
                {
                  int
                    j = i,
                    bracket = -1,
                    openBracket,
                    closeBracket;

                  /* Advance to the next unmatched column for this alphabet. */
                  while(j < countCharacters && ((bracket = brackets[k][j]) == 0))
                    {
                      i++;
                      j++;
                    }

                  assert(bracket >= 0);

                  if(j == countCharacters)
                    {
                      assert(bracket == 0);
                      break;
                    }

                  openBracket = j;
                  j++;

                  /* The matching close is the next column with the same depth. */
                  while(bracket != brackets[k][j] && j < countCharacters)
                    j++;
                  assert(j < countCharacters);
                  closeBracket = j;

                  assert(closeBracket < countCharacters && openBracket < countCharacters);
                  assert(brackets[k][closeBracket] > 0 && brackets[k][openBracket] > 0);

                  /*printf("%d %d %d\n", openBracket, closeBracket, bracket);*/

                  brackets[k][closeBracket] = 0;
                  brackets[k][openBracket] = 0;
                  countPairs++;

                  tr->secondaryStructurePairs[closeBracket] = openBracket;
                  tr->secondaryStructurePairs[openBracket] = closeBracket;
                }
              assert(i == countCharacters);
            }

          assert(countPairs == numSecondaryColumns / 2);

          /*for(i = 0; i < countCharacters; i++)
            printf("%d ", tr->secondaryStructurePairs[i]);
            printf("\n");*/

          adef->useMultipleModel = TRUE;
        }

      for(k = 0; k < numberOfSymbols; k++)
        rax_free(brackets[k]);
      rax_free(brackets);
      rax_free(characters);

      fclose(f);
    }
}
/** @brief Combined rapid-bootstrap + ML-search analysis ("-f a" style run).

    First runs up to adef->multipleRuns rapid bootstrap replicates (optionally
    stopping early via the bootstopping criteria), then performs a fast ML
    optimization of every bootstrap topology, a slow search on the best
    candidates, and a final thorough optimization of the single best tree.
    Writes the best tree, draws bootstrap support on it, prints timing
    statistics, and terminates the process via exit(0).

    @param tr   Tree/data instance; topology, model and likelihood are mutated throughout.
    @param adef Analysis settings (seeds, number of runs, bootstopping, constraints, ...).

    @note Under _WAYNE_MPI the replicates are distributed over `processes`
          MPI ranks and only the best rank writes the final output.
*/
void doAllInOne(tree *tr, analdef *adef)
{
  int i, n, bestIndex, bootstrapsPerformed;
#ifdef _WAYNE_MPI
  int bootStopTests = 1, j, bootStrapsPerProcess = 0;
#endif
  double loopTime;
  int *originalRateCategories;   /* per-site rate categories of the original (un-resampled) alignment */
  int *originalInvariant;        /* per-site invariant flags of the original alignment */
#ifdef _WAYNE_MPI
  int slowSearches, fastEvery;
#else
  int slowSearches, fastEvery = 5;   /* every 5th bootstrap tree gets a fast ML optimization */
#endif
  int treeVectorLength = -1;
  topolRELL_LIST *rl;            /* stores all bootstrap topologies for the later ML phase */
  double bestLH, mlTime, overallTime;
  long radiusSeed = adef->rapidBoot;
  FILE *f;
  char bestTreeFileName[1024];
  hashtable *h = (hashtable*)NULL;
  unsigned int **bitVectors = (unsigned int**)NULL;
  boolean bootStopIt = FALSE;
  double pearsonAverage = 0.0;
  pInfo *catParams = allocParams(tr);    /* scratch copies of model params used when */
  pInfo *gammaParams = allocParams(tr);  /* switching between CAT and GAMMA below    */
  unsigned int vLength;                  /* NOTE: only initialized when bootstopping is on */

  n = adef->multipleRuns;

#ifdef _WAYNE_MPI
  /* round the replicate count up to a multiple of the process count */
  if(n % processes != 0)
    n = processes * ((n / processes) + 1);
#endif

  if(adef->bootStopping)
    {
      h = initHashTable(tr->mxtips * 100);
      treeVectorLength = adef->multipleRuns;
      bitVectors = initBitVector(tr, &vLength);
    }

  rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST));
  initTL(rl, tr, n);

  originalRateCategories = (int*)rax_malloc(tr->cdta->endsite * sizeof(int));
  originalInvariant = (int*)rax_malloc(tr->cdta->endsite * sizeof(int));

  initModel(tr, tr->rdta, tr->cdta, adef);

  if(adef->grouping)
    printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the constraint tree specified in %s\n", tree_file);
  if(adef->constraint)
    printBothOpen("\n\nThe topologies of all Bootstrap and ML trees will adhere to the bifurcating backbone constraint tree specified in %s\n", tree_file);

#ifdef _WAYNE_MPI
  long parsimonySeed0 = adef->parsimonySeed;
  long replicateSeed0 = adef->rapidBoot;
  n = n / processes;   /* replicates handled per process */
#endif

  /* ---------------- rapid bootstrap phase ---------------- */
  for(i = 0; i < n && !bootStopIt; i++)
    {
#ifdef _WAYNE_MPI
      j = i + n * processID;
      tr->treeID = j;
#else
      tr->treeID = i;
#endif
      tr->checkPointCounter = 0;

      loopTime = gettime();

#ifdef _WAYNE_MPI
      /* give each MPI rank its own seed stream */
      if(i == 0)
        {
          if(parsimonySeed0 != 0)
            adef->parsimonySeed = parsimonySeed0 + 10000 * processID;
          adef->rapidBoot = replicateSeed0 + 10000 * processID;
          radiusSeed = adef->rapidBoot;
        }
#endif

      /* every 10th replicate: rebuild a starting tree and (on the first
         replicate) optimize model parameters on the original alignment */
      if(i % 10 == 0)
        {
          if(i > 0)
            reductionCleanup(tr, originalRateCategories, originalInvariant);

          if(adef->grouping || adef->constraint)
            {
              FILE *f = myfopen(tree_file, "rb");
              assert(adef->restart);
              if (! treeReadLenMULT(f, tr, adef))
                exit(-1);
              fclose(f);
            }
          else
            makeParsimonyTree(tr, adef);

          tr->likelihood = unlikely;

          if(i == 0)
            {
              double t;

              onlyInitrav(tr, tr->start);
              treeEvaluate(tr, 1);

              t = gettime();
              modOpt(tr, adef, FALSE, 5.0);
#ifdef _WAYNE_MPI
              printBothOpen("\nTime for BS model parameter optimization on Process %d: %f seconds\n", processID, gettime() - t);
#else
              printBothOpen("\nTime for BS model parameter optimization %f\n", gettime() - t);
#endif

              /* remember the per-site categories of the un-resampled data so
                 each replicate can be derived from them */
              memcpy(originalRateCategories, tr->cdta->rateCategory, sizeof(int) * tr->cdta->endsite);
              memcpy(originalInvariant, tr->invariant, sizeof(int) * tr->cdta->endsite);

              if(adef->bootstrapBranchLengths)
                {
                  if(tr->rateHetModel == CAT)
                    {
                      /* save CAT params, optimize under GAMMA, save those too,
                         then restore the CAT state for the bootstrap phase */
                      copyParams(tr->NumberOfModels, catParams, tr->partitionData, tr);
                      assert(tr->cdta->endsite == tr->originalCrunchedLength);
                      catToGamma(tr, adef);
                      modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
                      copyParams(tr->NumberOfModels, gammaParams, tr->partitionData, tr);
                      gammaToCat(tr);
                      copyParams(tr->NumberOfModels, tr->partitionData, catParams, tr);
                    }
                  else
                    {
                      assert(tr->cdta->endsite == tr->originalCrunchedLength);
                    }
                }
            }
        }

      /* draw the next bootstrap-resampled alignment and search on it */
      computeNextReplicate(tr, &adef->rapidBoot, originalRateCategories, originalInvariant, TRUE, TRUE);
      resetBranches(tr);

      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 1);

      computeBOOTRAPID(tr, adef, &radiusSeed);
#ifdef _WAYNE_MPI
      saveTL(rl, tr, j);
#else
      saveTL(rl, tr, i);
#endif

      if(adef->bootstrapBranchLengths)
        {
          double lh = tr->likelihood;

          if(tr->rateHetModel == CAT)
            {
              /* re-estimate branch lengths under GAMMA, then restore CAT */
              copyParams(tr->NumberOfModels, tr->partitionData, gammaParams, tr);
              catToGamma(tr, adef);
              resetBranches(tr);
              onlyInitrav(tr, tr->start);
              treeEvaluate(tr, 2.0);
              gammaToCat(tr);
              copyParams(tr->NumberOfModels, tr->partitionData, catParams, tr);
              tr->likelihood = lh;
            }
          else
            {
              treeEvaluate(tr, 2.0);
              tr->likelihood = lh;
            }
        }

      printBootstrapResult(tr, adef, TRUE);

      loopTime = gettime() - loopTime;
      writeInfoFile(adef, tr, loopTime);

      /* bootstopping convergence test */
      if(adef->bootStopping)
#ifdef _WAYNE_MPI
        {
          int nn = (i + 1) * processes;

          if((nn > START_BSTOP_TEST) &&
             (i * processes < FC_SPACING * bootStopTests) &&
             ((i + 1) * processes >= FC_SPACING * bootStopTests)
             )
            {
              MPI_Barrier(MPI_COMM_WORLD);
              concatenateBSFiles(processes, bootstrapFileName);
              MPI_Barrier(MPI_COMM_WORLD);
              bootStopIt = computeBootStopMPI(tr, bootstrapFileName, adef, &pearsonAverage);
              bootStopTests++;
            }
        }
#else
        bootStopIt = bootStop(tr, h, i, &pearsonAverage, bitVectors, treeVectorLength, vLength, adef);
#endif
    }

#ifdef _WAYNE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  bootstrapsPerformed = i * processes;
  bootStrapsPerProcess = i;
  concatenateBSFiles(processes, bootstrapFileName);
  removeBSFiles(processes, bootstrapFileName);
  MPI_Barrier(MPI_COMM_WORLD);
#else
  bootstrapsPerformed = i;
#endif

  rax_freeParams(tr->NumberOfModels, catParams);
  rax_free(catParams);
  rax_freeParams(tr->NumberOfModels, gammaParams);
  rax_free(gammaParams);

  if(adef->bootStopping)
    {
      freeBitVectors(bitVectors, 2 * tr->mxtips);
      rax_free(bitVectors);
      freeHashTable(h);
      rax_free(h);
    }

  /* report bootstrap phase result and, in bootstrap-only mode, exit */
  {
    double t;

    printBothOpenMPI("\n\n");

    if(adef->bootStopping)
      {
        if(bootStopIt)
          {
            switch(tr->bootStopCriterion)
              {
              case FREQUENCY_STOP:
                printBothOpenMPI("Stopped Rapid BS search after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage);
                break;
              case MR_STOP:
                printBothOpenMPI("Stopped Rapid BS search after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              case MRE_STOP:
                printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              case MRE_IGN_STOP:
                printBothOpenMPI("Stopped Rapid BS search after %d replicates with MRE_IGN-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              default:
                assert(0);
              }
          }
        else
          {
            switch(tr->bootStopCriterion)
              {
              case FREQUENCY_STOP:
                printBothOpenMPI("Rapid BS search did not converge after %d replicates with FC Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("Pearson Average of %d random splits: %f\n",BOOTSTOP_PERMUTATIONS , pearsonAverage);
                break;
              case MR_STOP:
                printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              case MRE_STOP:
                printBothOpenMPI("Rapid BS search did not converge after %d replicates with MRE-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              case MRE_IGN_STOP:
                printBothOpenMPI("Rapid BS search did not converge after %d replicates with MR_IGN-based Bootstopping criterion\n", bootstrapsPerformed);
                printBothOpenMPI("WRF Average of %d random splits: %f\n", BOOTSTOP_PERMUTATIONS, pearsonAverage);
                break;
              default:
                assert(0);
              }
          }
      }

    t = gettime() - masterTime;

    printBothOpenMPI("Overall Time for %d Rapid Bootstraps %f seconds\n", bootstrapsPerformed, t);
    printBothOpenMPI("Average Time per Rapid Bootstrap %f seconds\n", (double)(t/((double)bootstrapsPerformed)));

    if(!adef->allInOne)
      {
        printBothOpenMPI("All %d bootstrapped trees written to: %s\n", bootstrapsPerformed, bootstrapFileName);
#ifdef _WAYNE_MPI
        MPI_Finalize();
#endif
        exit(0);
      }
  }

  /* ---------------- ML-search phase ---------------- */

  mlTime = gettime();
  double t = mlTime;

  printBothOpenMPI("\nStarting ML Search ...\n\n");

  /* restore the original (un-resampled) alignment state */
  reductionCleanup(tr, originalRateCategories, originalInvariant);

#ifdef _WAYNE_MPI
  restoreTL(rl, tr, n * processID);
#else
  restoreTL(rl, tr, 0);
#endif

  resetBranches(tr);
  evaluateGenericInitrav(tr, tr->start);
  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);

  /* fast ML optimization of every fastEvery-th bootstrap topology */
#ifdef _WAYNE_MPI
  if(bootstrapsPerformed <= 100)
    fastEvery = 5;
  else
    fastEvery = bootstrapsPerformed / 20;

  for(i = 0; i < bootstrapsPerformed; i++)
    rl->t[i]->likelihood = unlikely;

  for(i = 0; i < bootStrapsPerProcess; i++)
    {
      j = i + n * processID;
      if(i % fastEvery == 0)
        {
          restoreTL(rl, tr, j);
          resetBranches(tr);
          evaluateGenericInitrav(tr, tr->start);
          treeEvaluate(tr, 1);
          optimizeRAPID(tr, adef);
          saveTL(rl, tr, j);
        }
    }
#else
  for(i = 0; i < bootstrapsPerformed; i++)
    {
      rl->t[i]->likelihood = unlikely;

      if(i % fastEvery == 0)
        {
          restoreTL(rl, tr, i);
          resetBranches(tr);
          evaluateGenericInitrav(tr, tr->start);
          treeEvaluate(tr, 1);
          optimizeRAPID(tr, adef);
          saveTL(rl, tr, i);
        }
    }
#endif

  printBothOpenMPI("Fast ML optimization finished\n\n");
  t = gettime() - t;

  /* rank the candidate trees by likelihood and continue from the best */
#ifdef _WAYNE_MPI
  printBothOpen("Fast ML search on Process %d: Time %f seconds\n\n", processID, t);
  j = n * processID;
  qsort(&(rl->t[j]), n, sizeof(topolRELL*), compareTopolRell);
  restoreTL(rl, tr, j);
#else
  printBothOpen("Fast ML search Time: %f seconds\n\n", t);
  qsort(&(rl->t[0]), bootstrapsPerformed, sizeof(topolRELL*), compareTopolRell);
  restoreTL(rl, tr, 0);
#endif
  t = gettime();

  resetBranches(tr);
  evaluateGenericInitrav(tr, tr->start);
  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);

  /* slow search on the best 20% of the trees, capped at 10 */
  slowSearches = bootstrapsPerformed / 5;
  if(bootstrapsPerformed % 5 != 0)
    slowSearches++;

  slowSearches = MIN(slowSearches, 10);

#ifdef _WAYNE_MPI
  if(processes > 1)
    {
      if(slowSearches % processes == 0)
        slowSearches = slowSearches / processes;
      else
        slowSearches = (slowSearches / processes) + 1;
    }

  for(i = 0; i < slowSearches; i++)
    {
      j = i + n * processID;
      restoreTL(rl, tr, j);
      rl->t[j]->likelihood = unlikely;
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 1.0);
      thoroughOptimization(tr, adef, rl, j);
    }
#else
  for(i = 0; i < slowSearches; i++)
    {
      restoreTL(rl, tr, i);
      rl->t[i]->likelihood = unlikely;
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 1.0);
      thoroughOptimization(tr, adef, rl, i);
    }
#endif

  /* final scoring is always done under GAMMA */
  if(tr->rateHetModel == CAT)
    {
      catToGamma(tr, adef);
      modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
    }

  bestIndex = -1;
  bestLH = unlikely;

#ifdef _WAYNE_MPI
  for(i = 0; i < slowSearches; i++)
    {
      j = i + n * processID;
      restoreTL(rl, tr, j);
      resetBranches(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
      printBothOpen("Slow ML Search %d Likelihood: %f\n", j, tr->likelihood);
      if(tr->likelihood > bestLH)
        {
          bestLH = tr->likelihood;
          bestIndex = j;
        }
    }
  /*printf("processID = %d, bestIndex = %d; bestLH = %f\n", processID, bestIndex, bestLH);*/
#else
  for(i = 0; i < slowSearches; i++)
    {
      restoreTL(rl, tr, i);
      resetBranches(tr);
      evaluateGenericInitrav(tr, tr->start);
      treeEvaluate(tr, 2);
      printBothOpen("Slow ML Search %d Likelihood: %f\n", i, tr->likelihood);
      if(tr->likelihood > bestLH)
        {
          bestLH = tr->likelihood;
          bestIndex = i;
        }
    }
#endif

  printBothOpenMPI("Slow ML optimization finished\n\n");
  t = gettime() - t;

#ifdef _WAYNE_MPI
  printBothOpen("Slow ML search on Process %d: Time %f seconds\n", processID, t);
#else
  printBothOpen("Slow ML search Time: %f seconds\n", t);
#endif
  t = gettime();

  /* thorough optimization of the single best tree */
  restoreTL(rl, tr, bestIndex);
  resetBranches(tr);
  evaluateGenericInitrav(tr, tr->start);
  treeEvaluate(tr, 2);

  Thorough = 1;
  tr->doCutoff = FALSE;
  treeOptimizeThorough(tr, 1, 10);
  evaluateGenericInitrav(tr, tr->start);

  modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);
  t = gettime() - t;

#ifdef _WAYNE_MPI
  printBothOpen("Thorough ML search on Process %d: Time %f seconds\n", processID, t);
#else
  printBothOpen("Thorough ML search Time: %f seconds\n", t);
#endif

#ifdef _WAYNE_MPI
  /* all ranks exchange their best likelihoods; only the winner continues */
  bestLH = tr->likelihood;
  printf("\nprocessID = %d, bestLH = %f\n", processID, bestLH);

  if(processes > 1)
    {
      double *buffer;
      int bestProcess;

      buffer = (double *)rax_malloc(sizeof(double) * processes);
      for(i = 0; i < processes; i++)
        buffer[i] = unlikely;
      buffer[processID] = bestLH;
      for(i = 0; i < processes; i++)
        MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD);
      bestLH = buffer[0];
      bestProcess = 0;
      for(i = 1; i < processes; i++)
        if(buffer[i] > bestLH)
          {
            bestLH = buffer[i];
            bestProcess = i;
          }
      rax_free(buffer);

      if(processID != bestProcess)
        {
          MPI_Finalize();
          exit(0);
        }
    }
#endif

  printBothOpen("\nFinal ML Optimization Likelihood: %f\n", tr->likelihood);
  printBothOpen("\nModel Information:\n\n");

  printModelParams(tr, adef);

  strcpy(bestTreeFileName, workdir);
  strcat(bestTreeFileName, "RAxML_bestTree.");
  strcat(bestTreeFileName, run_id);

  Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);

  f = myfopen(bestTreeFileName, "wb");
  fprintf(f, "%s", tr->tree_string);
  fclose(f);

  if(adef->perGeneBranchLengths)
    printTreePerGene(tr, adef, bestTreeFileName, "w");

  overallTime = gettime() - masterTime;
  mlTime = gettime() - mlTime;

  printBothOpen("\nML search took %f secs or %f hours\n", mlTime, mlTime / 3600.0);
  printBothOpen("\nCombined Bootstrap and ML search took %f secs or %f hours\n", overallTime, overallTime / 3600.0);
  printBothOpen("\nDrawing Bootstrap Support Values on best-scoring ML tree ...\n\n");

  freeTL(rl);
  rax_free(rl);

  calcBipartitions(tr, adef, bestTreeFileName, bootstrapFileName);

  overallTime = gettime() - masterTime;

  printBothOpen("Program execution info written to %s\n", infoFileName);
  printBothOpen("All %d bootstrapped trees written to: %s\n\n", bootstrapsPerformed, bootstrapFileName);
  printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName);

  if(adef->perGeneBranchLengths && tr->NumberOfModels > 1)
    printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to %s.PARTITION.%d\n\n", bestTreeFileName, bestTreeFileName, tr->NumberOfModels - 1);

  printBothOpen("Best-scoring ML tree with support values written to: %s\n\n", bipartitionsFileName);
  printBothOpen("Best-scoring ML tree with support values as branch labels written to: %s\n\n", bipartitionsFileNameBranchLabels);
  printBothOpen("Overall execution time for full ML analysis: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0);

#ifdef _WAYNE_MPI
  MPI_Finalize();
#endif
  exit(0);
}
/** @brief Standard (multi-run) ML tree inference.

    Runs adef->multipleRuns independent searches (computeBIGRAPID), then —
    unless tr->catOnly is set — re-optimizes every resulting topology under a
    GAMMA-based model, picks the best one, performs a final thorough
    optimization on it and writes it to RAxML_bestTree.<run_id>.
    Optionally writes RELL bootstrap trees. Terminates via exit(0).

    @param tr   Tree/data instance; mutated throughout.
    @param adef Analysis settings.
    @param rdta Raw alignment data, passed through to initModel().
    @param cdta Crunched (compressed) alignment data, passed through to initModel().
*/
void doInference(tree *tr, analdef *adef, rawdata *rdta, cruncheddata *cdta)
{
  int i, n;
#ifdef _WAYNE_MPI
  int j, bestProcess;   /* NOTE(review): only assigned when processes > 1 — see below */
#endif
  double loopTime;
  topolRELL_LIST *rl = (topolRELL_LIST *)NULL;   /* stores the topology of every run */
  int best = -1, newBest = -1;
  double bestLH = unlikely;
  FILE *f;
  char bestTreeFileName[1024];
  double overallTime;

  n = adef->multipleRuns;

#ifdef _WAYNE_MPI
  /* round the run count up to a multiple of the process count */
  if(n % processes != 0)
    n = processes * ((n / processes) + 1);
#endif

  if(!tr->catOnly)
    {
      rl = (topolRELL_LIST *)rax_malloc(sizeof(topolRELL_LIST));
      initTL(rl, tr, n);
    }

#ifdef _WAYNE_MPI
  long parsimonySeed0 = adef->parsimonySeed;
  n = n / processes;   /* runs handled per process */
#endif

  if(adef->rellBootstrap)
    {
#ifdef _WAYNE_MPI
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, parsimonySeed0 + 10000 * processID);
#else
      tr->resample = permutationSH(tr, NUM_RELL_BOOTSTRAPS, adef->parsimonySeed);
#endif
      tr->rellTrees = (treeList *)rax_malloc(sizeof(treeList));
      initTreeList(tr->rellTrees, tr, NUM_RELL_BOOTSTRAPS);
    }
  else
    {
      tr->resample = (int *)NULL;
      tr->rellTrees = (treeList *)NULL;
    }

  /* ---------------- independent search runs ---------------- */
  for(i = 0; i < n; i++)
    {
#ifdef _WAYNE_MPI
      if(i == 0)
        {
          if(parsimonySeed0 != 0)
            adef->parsimonySeed = parsimonySeed0 + 10000 * processID;
        }
      j = i + n * processID;
      tr->treeID = j;
#else
      tr->treeID = i;
#endif
      tr->checkPointCounter = 0;

      loopTime = gettime();

      initModel(tr, rdta, cdta, adef);

      if(i == 0)
        printBaseFrequencies(tr);

      getStartingTree(tr, adef);

      computeBIGRAPID(tr, adef, TRUE);

      /* track the best-scoring run and archive its topology */
#ifdef _WAYNE_MPI
      if(tr->likelihood > bestLH)
        {
          best = j;
          bestLH = tr->likelihood;
        }

      if(!tr->catOnly)
        saveTL(rl, tr, j);
#else
      if(tr->likelihood > bestLH)
        {
          best = i;
          bestLH = tr->likelihood;
        }

      if(!tr->catOnly)
        saveTL(rl, tr, i);
#endif

      loopTime = gettime() - loopTime;
      writeInfoFile(adef, tr, loopTime);
    }

  assert(best >= 0);

#ifdef _WAYNE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  n = n * processes;
#endif

  if(tr->catOnly)
    {
      printBothOpenMPI("\n\nNOT conducting any final model optimizations on all %d trees under CAT-based model ....\n", n);
      printBothOpenMPI("\nREMEMBER that CAT-based likelihood scores are meaningless!\n\n", n);
#ifdef _WAYNE_MPI
      if(processID != 0)
        {
          MPI_Finalize();
          exit(0);
        }
#endif
    }
  else
    {
      /* ------------- final GAMMA-based evaluation of all runs ------------- */
      printBothOpenMPI("\n\nConducting final model optimizations on all %d trees under GAMMA-based models ....\n\n", n);

#ifdef _WAYNE_MPI
      n = n / processes;
#endif

      if(tr->rateHetModel == GAMMA || tr->rateHetModel == GAMMA_I)
        {
          /* already under GAMMA: optimize the best run, then re-score the rest */
          restoreTL(rl, tr, best);
          evaluateGenericInitrav(tr, tr->start);
          if(!adef->useBinaryModelFile)
            modOpt(tr, adef, FALSE, adef->likelihoodEpsilon);
          else
            {
              readBinaryModel(tr, adef);
              evaluateGenericInitrav(tr, tr->start);
              treeEvaluate(tr, 2);
            }
          bestLH = tr->likelihood;
          tr->likelihoods[best] = tr->likelihood;
          saveTL(rl, tr, best);
          tr->treeID = best;
          printResult(tr, adef, TRUE);
          newBest = best;

          for(i = 0; i < n; i++)
            {
#ifdef _WAYNE_MPI
              j = i + n * processID;
              if(j != best)
                {
                  restoreTL(rl, tr, j);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 1);
                  tr->likelihoods[j] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = j;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, j);
                    }
                  tr->treeID = j;
                  printResult(tr, adef, TRUE);
                }
              if(n == 1 && processes == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else
              if(i != best)
                {
                  restoreTL(rl, tr, i);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 1);
                  tr->likelihoods[i] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = i;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, i);
                    }
                  tr->treeID = i;
                  printResult(tr, adef, TRUE);
                }
              if(n == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);
#endif
            }
        }
      else
        {
          /* searched under CAT: switch to GAMMA and re-optimize everything */
          catToGamma(tr, adef);

#ifdef _WAYNE_MPI
          for(i = 0; i < n; i++)
            {
              j = i + n*processID;
              rl->t[j]->likelihood = unlikely;
            }
#else
          for(i = 0; i < n; i++)
            rl->t[i]->likelihood = unlikely;
#endif

          initModel(tr, rdta, cdta, adef);

          restoreTL(rl, tr, best);
          resetBranches(tr);
          evaluateGenericInitrav(tr, tr->start);
          modOpt(tr, adef, TRUE, adef->likelihoodEpsilon);

          tr->likelihoods[best] = tr->likelihood;
          bestLH = tr->likelihood;
          saveTL(rl, tr, best);
          tr->treeID = best;
          printResult(tr, adef, TRUE);
          newBest = best;

          for(i = 0; i < n; i++)
            {
#ifdef _WAYNE_MPI
              j = i + n*processID;
              if(j != best)
                {
                  restoreTL(rl, tr, j);
                  resetBranches(tr);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 2);
                  tr->likelihoods[j] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = j;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, j);
                    }
                  tr->treeID = j;
                  printResult(tr, adef, TRUE);
                }
              if(n == 1 && processes == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", j, tr->likelihoods[j], resultFileName, j);
#else
              if(i != best)
                {
                  restoreTL(rl, tr, i);
                  resetBranches(tr);
                  evaluateGenericInitrav(tr, tr->start);
                  treeEvaluate(tr, 2);
                  tr->likelihoods[i] = tr->likelihood;

                  if(tr->likelihood > bestLH)
                    {
                      newBest = i;
                      bestLH = tr->likelihood;
                      saveTL(rl, tr, i);
                    }
                  tr->treeID = i;
                  printResult(tr, adef, TRUE);
                }
              if(n == 1)
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s\n", i, tr->likelihoods[i], resultFileName);
              else
                printBothOpen("Inference[%d] final GAMMA-based Likelihood: %f tree written to file %s.RUN.%d\n", i, tr->likelihoods[i], resultFileName, i);
#endif
            }
        }

      assert(newBest >= 0);

#ifdef _WAYNE_MPI
      /* exchange best likelihoods; only the winning rank writes the final tree.
         NOTE(review): if processes == 1 this skips the assignment of
         bestProcess, so the comparison below reads an uninitialized value —
         presumably benign because processID is 0 then, but TODO confirm. */
      if(processes > 1)
        {
          double *buffer = (double *)rax_malloc(sizeof(double) * processes);
          for(i = 0; i < processes; i++)
            buffer[i] = unlikely;
          buffer[processID] = bestLH;
          for(i = 0; i < processes; i++)
            MPI_Bcast(&buffer[i], 1, MPI_DOUBLE, i, MPI_COMM_WORLD);
          bestLH = buffer[0];
          bestProcess = 0;
          for(i = 1; i < processes; i++)
            if(buffer[i] > bestLH)
              {
                bestLH = buffer[i];
                bestProcess = i;
              }
          rax_free(buffer);
        }

      if(processID == bestProcess)
        {
#endif

          /* thorough optimization of the overall best tree and final output */
          restoreTL(rl, tr, newBest);
          evaluateGenericInitrav(tr, tr->start);

          printBothOpen("\n\nStarting final GAMMA-based thorough Optimization on tree %d likelihood %f .... \n\n", newBest, tr->likelihoods[newBest]);

          Thorough = 1;
          tr->doCutoff = FALSE;
          treeOptimizeThorough(tr, 1, 10);
          evaluateGenericInitrav(tr, tr->start);

          printBothOpen("Final GAMMA-based Score of best tree %f\n\n", tr->likelihood);

          strcpy(bestTreeFileName, workdir);
          strcat(bestTreeFileName, "RAxML_bestTree.");
          strcat(bestTreeFileName, run_id);

          Tree2String(tr->tree_string, tr, tr->start->back, TRUE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);

          f = myfopen(bestTreeFileName, "wb");
          fprintf(f, "%s", tr->tree_string);
          fclose(f);

          if(adef->perGeneBranchLengths)
            printTreePerGene(tr, adef, bestTreeFileName, "w");
#ifdef _WAYNE_MPI
        }
#endif
    }

  if(adef->rellBootstrap)
    {
      /* WARNING: must run after all other trees have been printed —
         do not move this part of the code further up! */
      int i;
#ifdef _WAYNE_MPI
      FILE *f = myfopen(rellBootstrapFileNamePID, "wb");
#else
      FILE *f = myfopen(rellBootstrapFileName, "wb");
#endif
      for(i = 0; i < NUM_RELL_BOOTSTRAPS; i++)
        {
          restoreTreeList(tr->rellTrees, tr, i);
          Tree2String(tr->tree_string, tr, tr->start->back, FALSE, TRUE, FALSE, FALSE, TRUE, adef, SUMMARIZE_LH, FALSE, FALSE, FALSE, FALSE);
          fprintf(f, "%s", tr->tree_string);
        }
      freeTreeList(tr->rellTrees);
      rax_free(tr->rellTrees);
      rax_free(tr->resample);
      fclose(f);
#ifdef _WAYNE_MPI
      MPI_Barrier(MPI_COMM_WORLD);
      concatenateBSFiles(processes, rellBootstrapFileName);
      removeBSFiles(processes, rellBootstrapFileName);
      MPI_Barrier(MPI_COMM_WORLD);
      if(processID == 0)
        printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);
#else
      printBothOpen("\nRELL bootstraps written to file %s\n", rellBootstrapFileName);
#endif
    }

#ifdef _WAYNE_MPI
  if(processID == bestProcess)
    {
#endif
      overallTime = gettime() - masterTime;

      printBothOpen("Program execution info written to %s\n", infoFileName);

      if(!tr->catOnly)
        {
          printBothOpen("Best-scoring ML tree written to: %s\n\n", bestTreeFileName);

          if(adef->perGeneBranchLengths && tr->NumberOfModels > 1)
            printBothOpen("Per-Partition branch lengths of best-scoring ML tree written to %s.PARTITION.0 to %s.PARTITION.%d\n\n", bestTreeFileName, bestTreeFileName, tr->NumberOfModels - 1);
        }

      printBothOpen("Overall execution time: %f secs or %f hours or %f days\n\n", overallTime, overallTime/3600.0, overallTime/86400.0);
#ifdef _WAYNE_MPI
    }
#endif

  if(!tr->catOnly)
    {
      freeTL(rl);
      rax_free(rl);
    }

#ifdef _WAYNE_MPI
  MPI_Finalize();
#endif
  exit(0);
}
/** @brief Evaluate the per-site log likelihood contribution of site \a i
    under the GTR+CAT model for a partial traversal.

    Walks the traversal descriptor \a ti (starting at a tip), computing
    conditional likelihood vectors into a scratch buffer, then combines the
    tip vector with the resulting inner vector across the branch \a qz.

    @param i               site index
    @param ki              per-site rate (CAT rate category multiplier)
    @param counter         number of entries in \a ti
    @param ti              traversal descriptor; ti[0].pNumber must be a tip
    @param qz              branch length (Z-value) of the final branch
    @param w               site weight (pattern count)
    @param EIGN,EI,EV      eigen-decomposition of the GTR rate matrix
    @param tipVector       precomputed tip likelihood vectors
    @param yVector         tip sequences indexed by node number
    @param branchReference which branch-length set to use from \a ti
    @param mxtips          number of tips in the tree
    @param expVector       if non-NULL, precomputed exponentials enabling the
                           _FAST vector computation path
    @return weighted per-site log likelihood term

    Fix: the zmin clamp used to be a dead store (`lz = zmin` immediately
    overwritten by `lz = log(qz)`); the branch length itself must be clamped
    before taking the log, otherwise log(qz) can produce -inf/NaN for
    degenerate branch lengths.
*/
static double evaluatePartialGTRCAT(int i, double ki, int counter, traversalInfo *ti, double qz,
                                    int w, double *EIGN, double *EI, double *EV,
                                    double *tipVector, unsigned char **yVector,
                                    int branchReference, int mxtips, double* expVector)
{
  double lz, term;
  double d[3];
  double *x1, *x2;
  int scale = 0, k;
  /* scratch conditional-likelihood vectors, freed before returning */
  double *lVector = (double *)rax_malloc(sizeof(double) * 4 * mxtips);
  traversalInfo *trav = &ti[0];

  assert(isTip(trav->pNumber, mxtips));

  x1 = &(tipVector[4 * yVector[trav->pNumber][i]]);

  if (!expVector)
    {
      /* standard path: recompute transition probabilities per node */
      for(k = 1; k < counter; k++)
        computeVectorGTRCAT(lVector, &scale, ki, i, ti[k].qz[branchReference], ti[k].rz[branchReference],
                            &ti[k], EIGN, EI, EV, tipVector, yVector, mxtips);
    }
  else
    {
      /* fast path: use precomputed exponentials (two triples per node) */
      for(k = 1; k < counter; k++)
        {
          double *ev1 = &expVector[k*6], *ev2 = &expVector[k*6+3];
          computeVectorGTRCAT_FAST(lVector, &scale, ki, i, &ti[k], EI, EV, tipVector, yVector, mxtips, ev1, ev2);
        }
    }

  x2 = &lVector[4 * (trav->qNumber - mxtips)];
  assert(0 <= (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);

  /* clamp the branch length to the minimum before taking the log
     (BUGFIX: previously assigned zmin to lz, which was immediately
     overwritten — the clamp never took effect) */
  if(qz < zmin)
    qz = zmin;
  lz = log(qz);
  lz *= ki;

  d[0] = EXP (EIGN[0] * lz);
  d[1] = EXP (EIGN[1] * lz);
  d[2] = EXP (EIGN[2] * lz);

  term =  x1[0] * x2[0];
  term += x1[1] * x2[1] * d[0];
  term += x1[2] * x2[2] * d[1];
  term += x1[3] * x2[3] * d[2];

  /* undo the numerical scaling applied during vector computation */
  term = LOG(FABS(term)) + (scale * LOG(minlikelihood));

  term = term * w;

  rax_free(lVector);

  return term;
}
/** @brief Quick SPR-based search used for each rapid-bootstrap replicate.

    Performs at most two rounds of rapid tree optimization with a randomly
    drawn rearrangement radius, keeping the best topology found.  A second
    round is only attempted when the first one improved the likelihood by
    more than a small epsilon.  On return, \a tr holds the best topology
    encountered.

    @param tr         tree instance (topology/likelihood mutated in place)
    @param adef       analysis settings; adef->bestTrav is overwritten here
    @param radiusSeed RNG state used to draw the rearrangement radius
*/
void computeBOOTRAPID (tree *tr, analdef *adef, int64_t *radiusSeed)
{
  bestlist *overallBest, *roundCandidates;
  int round, rank, travRadius, keepGoing;
  double bestSoFar, previousLh, difference, epsilon;

  /* remember the current topology as the provisional best */
  overallBest = (bestlist *) rax_malloc(sizeof(bestlist));
  overallBest->ninit = 0;
  initBestTree(overallBest, 1, tr->mxtips);
  saveBestTree(overallBest, tr);

  /* collects up to 5 candidate topologies per optimization round */
  roundCandidates = (bestlist *) rax_malloc(sizeof(bestlist));
  roundCandidates->ninit = 0;
  initBestTree(roundCandidates, 5, tr->mxtips);

  initInfoList(10);

  difference = 10.0;
  epsilon = 0.01;

  /* random rearrangement radius, recorded in adef as well */
  travRadius = adef->bestTrav = 5 + 11 * randum(radiusSeed);

  Thorough = 1;
  keepGoing = 1;

  if(tr->doCutoff)
    tr->itCount = 0;

  tr->bigCutoff = TRUE;

  for(round = 0; round < 2 && keepGoing; round++)
    {
      /* start each round from the best topology seen so far */
      recallBestTree(overallBest, 1, tr);
      treeEvaluate(tr, 1);
      saveBestTree(overallBest, tr);

      bestSoFar = previousLh = tr->likelihood;

      treeOptimizeRapid(tr, 1, travRadius, adef, roundCandidates);

      keepGoing = 0;

      /* re-score every candidate; keep it only if the gain is meaningful */
      for(rank = 1; rank <= roundCandidates->nvalid; rank++)
        {
          recallBestTree(roundCandidates, rank, tr);
          treeEvaluate(tr, 0.25);

          difference = (tr->likelihood > previousLh) ?
            (tr->likelihood - previousLh) : (previousLh - tr->likelihood);

          if(tr->likelihood > bestSoFar && difference > epsilon)
            {
              keepGoing = 1;
              bestSoFar = tr->likelihood;
              saveBestTree(overallBest, tr);
            }
        }
    }

  tr->bigCutoff = FALSE;

  /* leave tr on the overall best topology and release all scratch state */
  recallBestTree(overallBest, 1, tr);

  freeBestTree(overallBest);
  rax_free(overallBest);
  freeBestTree(roundCandidates);
  rax_free(roundCandidates);
  freeInfoList();
}
/** @brief Render the tree as a Newick string into \a treestr.

    If \a finalPrint is set and an outgroup was specified, the tree is rooted
    at the outgroup first: for multiple outgroups the function checks whether
    they form a monophyletic subtree and roots there, otherwise it falls back
    to rooting at the first outgroup (with a warning printed to stdout and the
    info file).  Without an outgroup, the unrooted recursive printer is used.

    @param treestr  output buffer; the Newick string is written here
    @param tr       tree instance; tr->start is temporarily redirected during
                    outgroup rooting and restored before returning
    @param p        node at which to start printing (unrooted case)
    @param ...      various boolean flags selecting what to annotate on the
                    branches (lengths, names, likelihood, RELL/SH support, IC)
    @return pointer to the terminating NUL of the written string, so calls
            can be chained
*/
char *Tree2String(char *treestr, tree *tr, nodeptr p, boolean printBranchLengths, boolean printNames, boolean printLikelihood,
                  boolean rellTree, boolean finalPrint, analdef *adef, int perGene, boolean branchLabelSupport, boolean printSHSupport,
                  boolean printIC, boolean printSHSupports)
{
  /* the three support-annotation modes are mutually exclusive */
  if(rellTree)
    assert(!branchLabelSupport && !printSHSupport);
  if(branchLabelSupport)
    assert(!rellTree && !printSHSupport);
  if(printSHSupport)
    assert(!branchLabelSupport && !rellTree);

  if(finalPrint && adef->outgroup)
    {
      nodeptr startNode = tr->start;   /* restored at the end */

      if(tr->numberOfOutgroups > 1)
        {
          nodeptr root;
          nodeptr *subtrees = (nodeptr *)rax_malloc(sizeof(nodeptr) * tr->mxtips);
          int i, k, count = 0;
          int *nodeNumbers = (int*)rax_malloc(sizeof(int) * tr->numberOfOutgroups);
          int *foundVector = (int*)rax_malloc(sizeof(int) * tr->numberOfOutgroups);
          boolean monophyletic = FALSE;

          /* gather all subtrees with exactly numberOfOutgroups tips */
          collectSubtrees(tr, subtrees, &count, tr->numberOfOutgroups);

          /*printf("Found %d subtrees of size %d\n", count, tr->numberOfOutgroups);*/

          /* check whether any such subtree contains exactly the outgroup taxa */
          for(i = 0; (i < count) && (!monophyletic); i++)
            {
              int l, sum, nc = 0;

              for(k = 0; k <  tr->numberOfOutgroups; k++)
                {
                  nodeNumbers[k] = -1;
                  foundVector[k] = 0;
                }

              checkOM(subtrees[i], nodeNumbers, &nc, tr);

              for(l = 0; l < tr->numberOfOutgroups; l++)
                for(k = 0; k < tr->numberOfOutgroups; k++)
                  {
                    if(nodeNumbers[l] == tr->outgroupNums[k])
                      foundVector[l] = 1;
                  }

              sum = 0;
              for(l = 0; l < tr->numberOfOutgroups; l++)
                sum += foundVector[l];

              if(sum == tr->numberOfOutgroups)
                {
                  root = subtrees[i];
                  tr->start = root;
                  /*printf("outgroups are monphyletic!\n");*/
                  monophyletic = TRUE;
                }
              else
                {
                  if(sum > 0)
                    {
                      /*printf("outgroups are NOT monophyletic!\n");*/
                      monophyletic = FALSE;
                    }
                }
            }

          if(!monophyletic)
            {
              /* fall back to rooting at the first outgroup taxon */
              printf("WARNING, outgroups are not monophyletic, using first outgroup \"%s\"\n", tr->nameList[tr->outgroupNums[0]]);
              printf("from the list to root the tree!\n");

              {
                FILE *infoFile = myfopen(infoFileName, "ab");
                fprintf(infoFile, "\nWARNING, outgroups are not monophyletic, using first outgroup \"%s\"\n", tr->nameList[tr->outgroupNums[0]]);
                fprintf(infoFile, "from the list to root the tree!\n");
                fclose(infoFile);
              }

              tr->start = tr->nodep[tr->outgroupNums[0]];
              rootedTree(treestr, tr, tr->start->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, adef, perGene, branchLabelSupport, printSHSupport);
            }
          else
            {
              if(isTip(tr->start->number, tr->rdta->numsp))
                {
                  printf("Outgroup-Monophyly ERROR; tr->start is a tip \n");
                  errorExit(-1);
                }
              /* NOTE(review): this second check tests tr->start->back but
                 prints the same "tr->start is a tip" message — possibly an
                 intentional reuse, TODO confirm */
              if(isTip(tr->start->back->number, tr->rdta->numsp))
                {
                  printf("Outgroup-Monophyly ERROR; tr->start is a tip \n");
                  errorExit(-1);
                }
              rootedTree(treestr, tr, tr->start->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, adef, perGene, branchLabelSupport, printSHSupport);
            }

          rax_free(foundVector);
          rax_free(nodeNumbers);
          rax_free(subtrees);
        }
      else
        {
          /* single outgroup: root there directly.
             NOTE(review): this call passes printSHSupports where the two
             calls above pass printSHSupport — looks inconsistent, TODO
             confirm which flag rootedTree() expects here */
          tr->start = tr->nodep[tr->outgroupNums[0]];
          rootedTree(treestr, tr, tr->start->back, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, adef, perGene, branchLabelSupport, printSHSupports);
        }

      tr->start = startNode;
    }
  else
    Tree2StringREC(treestr, tr, p, printBranchLengths, printNames, printLikelihood, rellTree, finalPrint, perGene, branchLabelSupport, printSHSupport, printIC, printSHSupports);

  /* advance to the terminating NUL so the caller can append */
  while (*treestr) treestr++;

  return  treestr;
}
//Use the plausibility checker overhead void plausibilityChecker(tree *tr, analdef *adef) { FILE *treeFile, *treeFile2, *rfFile; tree *smallTree = (tree *)rax_malloc(sizeof(tree)); char rfFileName[1024]; int numberOfTreesAnalyzed = 0, i; double avgRF = 0.0, sumEffectivetime = 0.0; /* set up an output file name */ strcpy(rfFileName, workdir); strcat(rfFileName, "RAxML_RF-Distances."); strcat(rfFileName, run_id); rfFile = myfopen(rfFileName, "wb"); assert(adef->mode == PLAUSIBILITY_CHECKER); /* open the big reference tree file and parse it */ treeFile = myfopen(tree_file, "r"); printBothOpen("Parsing reference tree %s\n", tree_file); treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE); assert(tr->mxtips == tr->ntips); /*************************************************************************************/ /* Preprocessing Step */ double preprocesstime = gettime(); /* taxonToLabel[2*tr->mxtips - 2]; Array storing all 2n-2 labels from the preordertraversal: (Taxonnumber - 1) -> (Preorderlabel) */ int *taxonToLabel = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)), /* taxonHasDeg[2*tr->mxtips - 2] Array used to store the degree of every taxon, is needed to extract Bipartitions from multifurcating trees (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */ *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)), /* taxonToReduction[2*tr->mxtips - 2]; Array used for reducing bitvector and speeding up extraction: (Taxonnumber - 1) -> Index in smallTreeTaxa (starting from 0) which is also: (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree)) (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */ *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); int newcount = 0; //counter used for correct traversals /* labelToTaxon[2*tr->mxtips - 2]; is used to translate between Perorderlabel and p->number: (Preorderlabel) -> (Taxonnumber) */ int *labelToTaxon = (int 
*)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); /* Preorder-Traversal of the large tree */ preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount); newcount = 0; //counter set to 0 to be now used for Eulertraversal /* eulerIndexToLabel[4*tr->mxtips - 5]; Array storing all 4n-5 PreOrderlabels created during eulertour: (Eulerindex) -> (Preorderlabel) */ int* eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int)); /* taxonToEulerIndex[tr->mxtips]; Stores all indices of the first appearance of a taxa in the eulerTour: (Taxonnumber - 1) -> (Index of the Eulertour where Taxonnumber first appears) is used for efficient computation of the Lowest Common Ancestor during Reconstruction Step */ int* taxonToEulerIndex = (int *)rax_malloc((tr->mxtips) * sizeof(int)); /* Init taxonToEulerIndex and taxonToReduction */ int ix; for(ix = 0; ix < tr->mxtips; ++ix) taxonToEulerIndex[ix] = -1; for(ix = 0; ix < (2*tr->mxtips - 2); ++ix) taxonToReduction[ix] = -1; /* Eulertraversal of the large tree*/ unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex); /* Creating RMQ Datastructure for efficient retrieval of LCAs, using Johannes Fischers Library rewritten in C Following Files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */ RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5); double preprocessendtime = gettime() - preprocesstime; /* Proprocessing Step End */ /*************************************************************************************/ printBothOpen("The reference tree has %d tips\n", tr->ntips); fclose(treeFile); /***********************************************************************************/ /* RF-OPT Preprocessing Step */ /***********************************************************************************/ /* now see how many small trees we have */ treeFile = getNumberOfTrees(tr, bootStrapFile, adef); treeFile2 = getNumberOfTrees(tr, 
bootStrapFile, adef); checkTreeNumber(tr->numberOfTrees, bootStrapFile); /* allocate a data structure for parsing the potentially mult-furcating tree */ allocateMultifurcations(tr, smallTree); /* Start Additional preprocessing step */ int numberOfBips = 0, numberOfSets = 0; //Stores the number of bips of each tree int *bipsPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //Stores the number of taxa for each tree int *taxaPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //To calculate all bipartitions, I created a new treeFile2 and a new getNumberOfTrees method!! for(i = 0; i < tr->numberOfTrees; i++) { int this_treeBips = readMultifurcatingTree(treeFile2, smallTree, adef, TRUE); numberOfBips = numberOfBips + this_treeBips; numberOfSets = numberOfSets + this_treeBips * this_treeBips; bipsPerTree[i] = this_treeBips; } printf("numberOfBips: %i , numberOfSets: %i \n \n", numberOfBips, numberOfSets); //stores induced bips (OLD?) unsigned int *ind_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores smalltree bips (OLD?) 
unsigned int *s_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores small bips per tree unsigned int ***sBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores induced bips per tree unsigned int ***indBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores vLength of each tree for processing bitVectors unsigned int *vectorLengthPerTree = (unsigned int *)rax_malloc(tr->numberOfTrees * sizeof(unsigned int*)); //stores the corresponding tree number for each bip int *treenumberOfBip = (int *)rax_malloc(numberOfBips * sizeof(int)); //Stores all dropsets of all trees int **sets = (int **)rax_malloc(numberOfSets * sizeof(int*)); //int **sets = NULL; //For each tree, stores a translation array from taxanumber smalltree->largetree int **smallTreeTaxaList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //For each tree, store a translation array from taxanumber largetree->smalltree int **taxonToReductionList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //I use these variables as global variables for all trees to determine the max number of possible sets to generate a static array int currentBips = 0; int currentSmallBips = 0; int currentSets = 0; //int currentTree = 0; already there in number of trees analyzed //Prefill sets with -1s for(int it = 0;it < (numberOfSets);it++){ int fill[1] = {-1}; sets[it] = fill; } /***********************************************************************************/ /* RF-OPT Preprocessing Step End */ /***********************************************************************************/ /* loop over all small trees */ for(i = 0; i < tr->numberOfTrees; i++) { int numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE); if(numberOfSplits > 0) { int firstTaxon; double rec_rf, maxRF; if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, 
numberOfSplits); /* compute the maximum RF distance for computing the relative RF distance later-on */ /* note that here we need to pay attention, since the RF distance is not normalized by 2 * (n-3) but we need to account for the fact that the multifurcating small tree will potentially contain less bipartitions. Hence the normalization factor is obtained as n-3 + numberOfSplits, where n-3 is the number of bipartitions of the pruned down large reference tree for which we know that it is bifurcating/strictly binary */ maxRF = (double)(2 * numberOfSplits); /* now get the index of the first taxon of the small tree. we will use this to unambiguously store the bipartitions */ firstTaxon = smallTree->start->number; //Saves the number of taxa in the tree (for RF-OPT) taxaPerTree[numberOfTreesAnalyzed] = smallTree->ntips; /***********************************************************************************/ /* Reconstruction Step */ double time_start = gettime(); /* Init hashtable to store Bipartitions of the induced subtree T|t_i */ /* using smallTree->ntips instead of smallTree->mxtips yields faster code e.g. 
120 versus 128 seconds for 20,000 small trees on my laptop */ hashtable *s_hash = initHashTable(smallTree->ntips * 4); /* Init hashtable to store Bipartitions of the reference tree t_i*/ hashtable *ind_hash = initHashTable(smallTree->ntips * 4); /* smallTreeTaxa[smallTree->ntips]; Stores all taxa numbers from smallTree into an array called smallTreeTaxa: (Index) -> (Taxonnumber) */ int* smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* counter is set to 0 for correctly extracting taxa of the small tree */ newcount = 0; int newcount2 = 0; /* seq2[2*smallTree->ntips - 2]; stores PreorderSequence of the reference smalltree: (Preorderindex) -> (Taxonnumber) */ int* seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int)); /* used to store the vectorLength of the bitvector */ unsigned int vectorLength; /* extract all taxa of the smalltree and store it into an array, also store all counts of taxa and nontaxa in taxonToReduction */ rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2); rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2); /* counter is set to 0 to correctly preorder traverse the small tree */ newcount = 0; /* Preordertraversal of the small reference tree and save its sequence into seq2 for later extracting the bipartitions, it also stores information about the degree of every node */ rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount); /* calculate the bitvector length */ if(smallTree->ntips % MASK_LENGTH == 0) vectorLength = smallTree->ntips / MASK_LENGTH; else vectorLength = 1 + (smallTree->ntips / MASK_LENGTH); /***********************************************************************************/ /* RF-OPT Additional Preprocessing storing Bipartitions */ 
/***********************************************************************************/ vectorLengthPerTree[numberOfTreesAnalyzed] = vectorLength; unsigned int **bitVectors = rec_initBitVector(smallTree, vectorLength); unsigned int **sBips; /* store all non trivial bitvectors using an subtree approach for the reference subtree and store it into a hashtable, this method was changed for multifurcation */ sBips = RFOPT_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); sBipsPerTree[numberOfTreesAnalyzed] = sBips; /***********************************************************************************/ /* End RF-OPT Additional Preprocessing storing Bipartitions */ /***********************************************************************************/ /* counter is set to 0 to be used for correctly storing all EulerIndices */ newcount = 0; /* smallTreeTaxonToEulerIndex[smallTree->ntips]; Saves all first Euler indices for all Taxons appearing in small Tree: (Index) -> (Index of the Eulertour where the taxonnumber of the small tree first appears) */ int* smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* seq[(smallTree->ntips*2) - 1] Stores the Preordersequence of the induced small tree */ int* seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int)); /* iterate through all small tree taxa */ for(ix = 0; ix < smallTree->ntips; ix++) { int taxanumber = smallTreeTaxa[ix]; /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex for taxa in the small tree*/ smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1]; /* Saves all Preorderlabel of the smalltree taxa in seq*/ seq[newcount] = taxonToLabel[taxanumber-1]; newcount++; } /* sort the euler indices to correctly calculate LCA */ //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1); qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), 
sortIntegers); //printf("newcount2 %i \n", newcount2); /* Iterate through all small tree taxa */ for(ix = 1; ix < newcount; ix++) { /* query LCAs using RMQ Datastructure */ seq[newcount - 1 + ix] = eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])]; /* Used for dynamic programming. We save an index for every inner node: For example the reference tree has 3 inner nodes which we saves them as 0,1,2. Now we calculate for example 5 LCA to construct the induced subtree, which are also inner nodes. Therefore we mark them as 3,4,5,6,7 */ taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2; newcount2 += 1; } /* sort to construct the Preordersequence of the induced subtree */ //quicksort(seq,0,(2*smallTree->ntips - 2)); qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers); /* calculates all bipartitions of the reference small tree and count how many bipartition it shares with the induced small tree and stores those bipartitions in a additional hashtable called ind_hash */ int rec_bips = 0; unsigned int **indBips; indBips = RFOPT_findAddBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, ind_hash, taxonToReduction); indBipsPerTree[numberOfTreesAnalyzed] = indBips; /* calculates all bipartitions of the reference small tree and put them into ind_hash*/ // rec_extractBipartitionsMulti(bitVectors, seq2, (2*smallTree->ntips - 1),tr->mxtips, vectorLength, smallTree->ntips, // firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); /* Reconstruction Step End */ /***********************************************************************************/ double effectivetime = gettime() - time_start; /* if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime); */ /* compute the relative RF */ /***********************************************************************************/ 
/* RF-OPT Save Translation Vectors */ /***********************************************************************************/ //copy array taxonToReduction because it is originally defined in preprocessing step int * taxonToReductionCopy = (int *)rax_malloc((tr->mxtips)*sizeof(int)); memcpy(taxonToReductionCopy,taxonToReduction,(tr->mxtips)*sizeof(int)); //storing smallTree and taxonToReduction Arrays for further usage smallTreeTaxaList[numberOfTreesAnalyzed] = smallTreeTaxa; taxonToReductionList[numberOfTreesAnalyzed] = taxonToReductionCopy; int this_currentSmallBips = 0; //Variable resets everytime for each tree analyzed /***********************************************************************************/ /* End RF-OPT Save Translation Vectors */ /***********************************************************************************/ rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF; assert(numberOfSplits >= rec_bips); avgRF += rec_rf; sumEffectivetime += effectivetime; //if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf); fprintf(rfFile, "%d %f\n", i, rec_rf); //rax_free(smallTreeTaxa); //Need it for calculating the SmallTreeTaxaList after all iterations! 
rax_free(seq); rax_free(seq2); rax_free(smallTreeTaxonToEulerIndex); numberOfTreesAnalyzed++; //Counting the number of trees analyzed } }// End of Small Tree Iterations /***********************************************************************************/ /* RF-OPT DropSet Calculation using BitVectors */ /***********************************************************************************/ log_info("===> Create DropSet Datastructure \n"); static Hashmap* map = NULL; //Set a hashmap for dropsets with a dropset comparision and standard hash map = Hashmap_create(compareDropSet, NULL); static Hashmap** mapArray = NULL; //Set an array to store the pointers to bitvector hashtables for each tree mapArray = rax_malloc(tr->numberOfTrees * sizeof(Hashmap*)); printf("===> BitVector Set Calculation \n"); //Calculate dropsets of two given bips lists and extract all sets into array sets and into a hashmap. Each set has following format //dropset = {taxa_1,taxa_2,...,taxa_n,-1}; //Furtheremore calculate Dropset generates two data structures from type bips and dropsets which are pointing to each other in hashtables calculateDropSets(mapArray, map, indBipsPerTree, sBipsPerTree, sets, smallTreeTaxaList, bipsPerTree, taxaPerTree, vectorLengthPerTree, tr->numberOfTrees); /***********************************************************************************/ /* RF-OPT Graph Construction */ /***********************************************************************************/ // printf("\n == Sets == \n"); // for(int fooo = 0; fooo < numberOfSets; fooo++){ // printf("Set %i: ", fooo); // int i = 0; // while(sets[fooo][i] > -1) { // printf("%i ",sets[fooo][i]); // i++; // } // printf("\n"); // } // printf("\n"); /* Filter for unique sets */ log_info("===> Hashmap tests...\n"); Hashmap_traverse(map, traverse_cb); // int key[2] = {0,-1}; // Dropset* drop = Hashmap_get(map,key); // DArray* bips = drop->bipartitions; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = 
DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } // Bipartition* bipFromHash = DArray_first(bips); // Bipartition* testBip = Hashmap_get(mapArray[0],bipFromHash->bitvector); // printf("matching before: %i",testBip->matching); // testBip->matching = 999; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } printf("===> Filter for unique sets (naive)...\n"); /* unique sets array data structures */ int** uniqSets = (int **) rax_malloc(sizeof(int*) * numberOfSets); int* setsToUniqSets = (int*) rax_malloc(sizeof(int) * numberOfSets); int numberOfUniqueSets = 0; int dropSetCount = 0; //stores the scores for each bips, we are using a bitvector approach (need to scale) //Legacy Code int bvec_scores = 0; numberOfUniqueSets = getUniqueDropSets(sets, uniqSets, setsToUniqSets, numberOfSets); printf("number of unique sets: %i \n", numberOfUniqueSets); /* Detect initial matchings, we calculate them using bitvectors to represent our bipartitions */ printf("===> Detect initial matchings...\n"); int vLengthBip = 0; //determine the bitVector Length of our bitVector if(numberOfBips % MASK_LENGTH == 0) vLengthBip = numberOfBips / MASK_LENGTH; else vLengthBip = numberOfBips / MASK_LENGTH + 1; //Initialize a bvecScore vector with 0s int* bvecScores = (int*)rax_calloc(vLengthBip,sizeof(int)); //Calculate Initial Matchings and save the result in bvecScores detectInitialMatchings(sets, bvecScores, bipsPerTree, numberOfTreesAnalyzed, vLengthBip); //Short summary until now: // - bipsPerTree consists of all bipartitions per tree // - bvecScores is the bitvector setting 1 to all bipartition indices which can score // - taxaPerTree number of taxa per tree // - smallTreeTaxaList list of all smalltree->largetree translation arrays 
/* Generate useful data structures for calculating and updating scores */ printf("===> Create data structures...\n"); //Stores the number of bips per Set and initialize it with 0s int* numberOfBipsPerSet = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); //Stores all sets which includes this taxa int **setsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) *sizeof(int*)); //Now calculate number of bipartitions affected by each unique set for(int i = 0; i < numberOfSets; i++) { int setindex = setsToUniqSets[i]; numberOfBipsPerSet[setindex]++; } //Now using the knowledge of how many bips there are per set, generate an array for each unique dropset containing all bips int** bipsOfDropSet = (int**)rax_malloc(sizeof(int*)*numberOfUniqueSets); //Allocate the space needed for storing all bips for(int i = 0; i < numberOfUniqueSets; i++) { bipsOfDropSet[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerSet[i]); } printf("==> Initialize the Bips Of Taxa \n"); //Stores the number of bips each taxa is included (ABC|DE is stored by A,B,C,D and E) //It can be calculated by iterating through all trees and adding the taxa int **bipsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) * sizeof(int*)); int *numberOfBipsPerTaxa = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); int *taxaBipsCounter = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); //Now add up all for (int tree = 0; tree < tr->numberOfTrees; tree++) { int* list = smallTreeTaxaList[tree]; for (int j = 0; j < taxaPerTree[tree]; j++) { int taxa = list[j]; numberOfBipsPerTaxa[taxa] = numberOfBipsPerTaxa[taxa] + bipsPerTree[tree]; } } //Now create dummy arrays inside bipsOfTaxa for(int i = 1; i < tr->mxtips+1; i++) { bipsOfTaxa[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerTaxa[i]); } printf("==> Storing all bip indices of a certain dropset into an array \n"); //For checking if all dropsets are iterated dropSetCount = 0; //Arrays of counter to keep in track int* counterOfSet = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); for(int 
i = 0; i < numberOfUniqueSets; i++) { counterOfSet[i] = 0; } currentBips = 0; //Need to keep in track of the number of bips //First iterate through all trees for(int i = 0; i < numberOfTreesAnalyzed; i++ ) { //get the correct smallTreeTaxa List int* list = smallTreeTaxaList[i]; //For each bipartition in the tree for(int j = 0; j < bipsPerTree[i]; j++) { //Look at all bips it is compared too int dropSetsPerBip = bipsPerTree[i]; for(int k = 0; k < dropSetsPerBip; k++){ int indexOfUniqDropSet = setsToUniqSets[dropSetCount + k]; int* bips_array = bipsOfDropSet[indexOfUniqDropSet]; //add bipartition j into the bips array of its dropset bips_array[counterOfSet[indexOfUniqDropSet]] = currentBips; //increment the internal array index counterOfSet[indexOfUniqDropSet]++; } //Jump to the next correct dropSetCount! dropSetCount = dropSetCount + dropSetsPerBip; //now insert the bip into bipsOfTaxa Array for(int ix = 0; ix < taxaPerTree[i]; ix++) { //get the taxa number int stree_Taxa = list[ix]; //get the bips list of this taxa number int* bipsList = bipsOfTaxa[stree_Taxa]; //now get the position of the biplist and put in our bip index bipsList[taxaBipsCounter[stree_Taxa]] = currentBips; //increment the counter taxaBipsCounter[stree_Taxa]++; } //increment currentBips currentBips++; } } /***********************************************************************************/ /* End RF-OPT Graph Construction */ /***********************************************************************************/ /* Short summary : sets - array of all dropsets uniqSets - array of all unique dropsets bipsPerTree - bips per tree setsToUniqSets - translates the index of sets to the index of its unique dropset index bipsOfDropSets - all bips which disappear when dropset i is pruned scores - has all scores between 0 and 1 for the bips (however 0s can be found out by looking at all dropsets with link to dropset 0 (because we sort and it will always be the lowest)) */ 
/***********************************************************************************/ /* RF-OPT Initial Score Calculation */ /***********************************************************************************/ unsigned int bipsVectorLength; /* calculate the bitvector length for bips bitvector */ if(numberOfBips % MASK_LENGTH == 0) bipsVectorLength = numberOfBips / MASK_LENGTH; else bipsVectorLength = 1 + (numberOfBips / MASK_LENGTH); //Starting from index 1 (because 0 stands for all who already matches) //We need a score array saving the scores for each uniqset int* rf_score = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); printf("==> Calculating the score for the first iteration \n \n"); //Store all bvecs of all merged and destroyed bipartitions per DropSet int* bvecs_bips = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); int* bvecs_destroyed = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); //Iterate through all sets for(int i = 0; i < numberOfUniqueSets; i++) { //Bitvectors of merged and destroyed int bvec_destroyed = 0; int* set = uniqSets[i]; //Get the dropset, first dropset is 0 (if something is matching) //printf(" ==> Analyze Unique DropSet %i \n", i); //We use this data structure to keep track of the to toggled bits int* toggleBits = (int*)rax_calloc(numberOfBips, sizeof(int)); //Now iterate through the set int j = 0; //Stores the affected bips into a bitvector int bvec_bips = 0; while(set[j] != -1) { int taxa = set[j]; //Get the taxa //printf(" Taxa number is %i \n",taxa); //Check if set[j] is itself already a set int test[2] = {taxa,-1}; //0 if it is not a set, index + 1 otherwise int test_index = contains(test, uniqSets, numberOfUniqueSets); if(test_index){ //printf(" It also is in uniqSet %i \n", test_index - 1); bvec_bips = getBipsOfDropSet(bvec_bips, (test_index - 1), numberOfBipsPerSet, bipsOfDropSet); } //Get all bips of this taxa to detect which one will be destroyed int* listOfBips = bipsOfTaxa[taxa]; //Go through all bipartitions 
containing this taxa for(int k = 0; k < numberOfBipsPerTaxa[taxa]; k++){ int bipindex = listOfBips[k]; //Get the index of the Bipartition int bip = ind_bips[bipindex]; //Now analyze this Bipartition //Which tree does this bipartition belongs too? int treenumber = treenumberOfBip[bipindex]; //Get the taxonToSmallTree Array of this tree int* stTaxa = taxonToReductionList[treenumber]; //Translate the global taxon number it into the local index used by our bips int translated_index = stTaxa[taxa - 1]; //We use taxa - 1 because we start counting at taxa 1 = 0 ! //Save the to toggle index into toggleBits vector toggleBits[bipindex] |= 1 << translated_index; //Sort for bits set on one side of the bip and on the other side int leftBits = __builtin_popcount(toggleBits[bipindex] & bip); int rightBits = __builtin_popcount(toggleBits[bipindex]) - leftBits; //Check for the number of bits set in the original bip int leftBip = __builtin_popcount(bip); int rightBip = taxaPerTree[treenumber] - leftBip; //Subtract the total number of bits set on one side of the bip with the bits we have to toggle int leftBip_after = leftBip - leftBits; int rightBip_after = rightBip - rightBits; //Check if bipartition gets trivial/destroyed due to pruning the taxa and set the bit (representing the bip) which is destroyed if((leftBip_after <= 1) | (rightBip_after <=1)) { //Add bips to the bits which represent destroyed bipartitions bvec_destroyed = setBit(bvec_destroyed,bipindex); } } j++; }//End iterate through the set int penality = 0; int score = 0; int bvec_mask = 0; bvec_mask = setOffSet(bvec_mask, numberOfBips); //Bitvector of already matching bips int bvec_tmp = 0; bvec_tmp = ~bvec_scores & bvec_mask; //Penality score are all bitvectors who were matching but is destroyed penality = __builtin_popcount(bvec_destroyed & bvec_tmp); //Now iterate through bipsOfDropSet list and extract all bips which will merge into a bitVector bvec_bips = getBipsOfDropSet(bvec_bips, i, numberOfBipsPerSet, 
bipsOfDropSet); //Calculate the bitvectors which remains bvec_tmp = ~bvec_destroyed & bvec_mask; bvec_tmp = bvec_bips & bvec_tmp; score = __builtin_popcount(bvec_scores & bvec_tmp); rf_score[i] = score - penality; //Save our results for convenience into an array bvecs_bips[i] = bvec_bips; bvecs_destroyed[i] = bvec_destroyed; }//End Score Calculation printf("======> Scores:\n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("RF Score for %i : %i \n", i, rf_score[i]); //printBitVector(bvecs_bips[i]); //printBitVector(bvecs_destroyed[i]); } int maxDropSet = getMax(rf_score, numberOfUniqueSets); printf("Max Element is %i \n", maxDropSet); /***********************************************************************************/ /* RF-OPT Create Update Data Structures */ /***********************************************************************************/ printf("====> Delete DropSet from all bips and update numbers \n"); //Create a bitVector to store all deleted taxa int bvec_deletedTaxa = 0; //Create a bitVector to store all still existing bips int bvec_existingBips = 0; //Create a bitvector to store deleted dropsets int bvec_deletedDropSets = 0; //Get the dropset int* deleteDropSet = uniqSets[maxDropSet]; //Store it into a BitVector bvec_deletedDropSets = setBit(bvec_deletedDropSets,maxDropSet); //Select all bips destroyed by removing this dropset int bvec_destroyedBips = bvecs_destroyed[maxDropSet]; //Select all bips that are now matching int bvec_matchingBips = bvecs_bips[maxDropSet]; //Filter for existent bipartitions bvec_existingBips = getExistingBips(bvec_existingBips, numberOfBips, bvec_destroyedBips); //Iterate through its taxa int iterSet = 0; while(deleteDropSet[iterSet] != -1) { //Get taxon int taxon = deleteDropSet[iterSet]; //Store the taxon into deletedTaxa BitVector bvec_deletedTaxa = setBit(bvec_deletedTaxa, taxon - 1); //Check if taxon is inside int test[2] = {taxon, -1}; int index = contains(test, uniqSets, numberOfUniqueSets); iterSet++; } 
//printBitVector(bvec_existingBips); //printBitVector(bvec_deletedTaxa); //Update the scores with now matching bips bvec_scores = bvec_scores & (~bvec_matchingBips); //printBitVector(bvec_scores); /* Short summary : bvec_existingBips - bitVector of all still existing bips bvec_deletedTaxa - bitVector to keep track of destroyed taxa */ /***********************************************************************************/ /* TODO RF-OPT Update function */ /***********************************************************************************/ /***********************************************************************************/ /* End RF-OPT Update function */ /***********************************************************************************/ //printf("Ind Bipartitions?: "); // printf("Induced Bipartitions: "); // printBitVector(ind_bips[0]); // printBitVector(ind_bips[1]); // printBitVector(ind_bips[2]); // printBitVector(ind_bips[3]); // printBitVector(ind_bips[4]); // printBitVector(ind_bips[5]); // printBitVector(ind_bips[6]); /***********************************************************************************/ /* Console Logs for debugging */ /***********************************************************************************/ //Printing if printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); //#define _PRINT_ #ifdef _PRINT_ for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("Induced Bips! 
\n"); // Now checking which dropset would destroy which bipartition for(int i = 0 ; i < numberOfBips; i++) { printf("Bip %i is %i \n",i,ind_bips[i]); } printf("Taxa Names : \n"); for(int i = 0; i < tr->mxtips + 1; i++) { printf("%s ",tr->nameList[i]); } printf("\n"); printf("Small Tree Taxa Names 0 : \n"); for(int i = 0; i < taxaPerTree[0]; i++) { int* list = smallTreeTaxaList[0]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 1 : \n"); for(int i = 0; i < taxaPerTree[1]; i++) { int* list = smallTreeTaxaList[1]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 2 : \n"); for(int i = 0; i < taxaPerTree[2]; i++) { int* list = smallTreeTaxaList[2]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Number of DropSets extracted%i \n",dropSetCount); printf("Number of Bips extracted %i \n",currentBips); //Testing ... printf("Number of Sets is %i \n",numberOfSets); printf("Number of Unique Sets is %i \n",numberOfUniqueSets); printf("==> Testing bips of unique sets \n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Testing bips of taxa \n"); for(int i = 1; i < tr->mxtips + 1; i++) { printf("Bips of Taxa %i: ", i); for(int j = 0; j < numberOfBipsPerTaxa[i]; j++) { int* bips = bipsOfTaxa[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("==> setsToUniqSets: "); for(int i = 0; i < numberOfSets; i++) { printf("%i ",setsToUniqSets[i]); } printf("\n"); //=== TREE GRAPH CONSTRUCTION ENDS === printf("Scores: "); printBitVector(bvec_scores); printf("BipsPerTree: "); for(int foo = 0; foo < 
tr->numberOfTrees; foo++) { printf("%i ",bipsPerTree[foo]); } printf("\nInduced Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",ind_bips[foo]); } printf("\nSmall Tree Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",s_bips[foo]); } printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); #endif printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed); printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed); printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed); printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime); printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName); printBothOpen("execution stats:\n\n"); printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime); printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed); printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime); fclose(treeFile); fclose(rfFile); /* free the data structure used for parsing the potentially multi-furcating tree */ freeMultifurcations(smallTree); rax_free(smallTree); rax_free(taxonToLabel); rax_free(taxonToEulerIndex); rax_free(labelToTaxon); rax_free(eulerIndexToLabel); rax_free(taxonToReduction); rax_free(taxonHasDeg); }