/**
 * Record the bipartition `bitVector` of tree `treeNumber` in hash table `h`.
 *
 * If an entry with an identical bit vector already hangs off the chain at
 * h->Items[position], only its per-tree presence bit (and, when computeWRF
 * is set, its support value) is updated.  Otherwise a new
 * pllBipartitionEntry is allocated, zero-initialised and appended via
 * pllHashAdd().
 *
 * bitVector        bipartition encoded as vectorLength unsigned ints
 * h                chained hash table of pllBipartitionEntry records
 * vectorLength     number of unsigned ints per bit vector
 * treeNumber       index of the tree the bipartition came from
 * treeVectorLength length (in unsigned ints) of each entry's treeVector
 * position         precomputed bucket index of bitVector in h
 * support          support value stored when computeWRF is true
 * computeWRF       whether weighted-RF support values are tracked
 *
 * Fix: the original asserted the treeNumber bounds AFTER writing
 * supportVector[treeNumber]; an out-of-range index had therefore already
 * corrupted memory before the assert fired.  The bounds check now
 * precedes each write.
 */
static void insertHashRF(unsigned int *bitVector, pllHashTable *h, unsigned int vectorLength,
                         int treeNumber, int treeVectorLength, hashNumberType position,
                         int support, boolean computeWRF)
{
  pllBipartitionEntry *e;
  pllHashItem *hitem;

  /* scan the collision chain for an entry holding the same bipartition */
  if(h->Items[position] != NULL)
    {
      for(hitem = h->Items[position]; hitem; hitem = hitem->next)
        {
          e = (pllBipartitionEntry *)(hitem->data);

          if(!memcmp(bitVector, e->bitVector, vectorLength * sizeof(unsigned int)))
            {
              /* mark this tree as containing the bipartition */
              e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
              if(computeWRF)
                {
                  /* validate the index BEFORE the write */
                  assert(0 <= treeNumber && treeNumber < treeVectorLength * PLL_MASK_LENGTH);
                  e->supportVector[treeNumber] = support;
                }
              return;
            }
        }
    }

  /* no matching entry: create a fresh one */
  e = initEntry();

  rax_posix_memalign((void **)&(e->bitVector), PLL_BYTE_ALIGNMENT,
                     (size_t)vectorLength * sizeof(unsigned int));
  memset(e->bitVector, 0, vectorLength * sizeof(unsigned int));

  e->treeVector = (unsigned int*)rax_calloc((size_t)treeVectorLength, sizeof(unsigned int));
  if(computeWRF)
    e->supportVector = (int*)rax_calloc((size_t)treeVectorLength * PLL_MASK_LENGTH, sizeof(int));

  e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
  if(computeWRF)
    {
      /* validate the index BEFORE the write */
      assert(0 <= treeNumber && treeNumber < treeVectorLength * PLL_MASK_LENGTH);
      e->supportVector[treeNumber] = support;
    }

  memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);

  pllHashAdd(h, position, NULL, (void *)e);
}
/**
 * Allocate a string hash table with at least n buckets.
 *
 * The bucket count is the smallest prime in initTable that is >= n;
 * using primes reduces clustering with simple modulo hashing.  The
 * table array is zero-initialised (empty chains).
 *
 * n : requested minimum number of buckets; must not exceed the largest
 *     prime in initTable (asserted).
 *
 * Returns a heap-allocated stringHashtable; ownership passes to the
 * caller.
 *
 * Fix: the original loop tested `initTable[i] < n` BEFORE checking
 * `i < primeTableLength`, reading one element past the end of initTable
 * when n exceeds the largest prime.  The bound check now comes first
 * (CERT ARR30-C).
 */
stringHashtable *initStringHashTable(hashNumberType n)
{
  /* init with primes */
  static const hashNumberType initTable[] = {53, 97, 193, 389, 769, 1543, 3079, 6151, 12289,
                                             24593, 49157, 98317, 196613, 393241, 786433,
                                             1572869, 3145739, 6291469, 12582917, 25165843,
                                             50331653, 100663319, 201326611, 402653189,
                                             805306457, 1610612741};

  stringHashtable *h = (stringHashtable*)rax_malloc(sizeof(stringHashtable));

  hashNumberType
    tableSize,
    i,
    primeTableLength = sizeof(initTable)/sizeof(initTable[0]),
    maxSize = (hashNumberType)-1;   /* largest representable hashNumberType */

  assert(n <= maxSize);

  i = 0;

  /* bound check FIRST so we never read past the end of initTable */
  while(i < primeTableLength && initTable[i] < n)
    i++;

  assert(i < primeTableLength);

  tableSize = initTable[i];

  /* calloc => all chains start out empty (NULL) */
  h->table = (stringEntry**)rax_calloc(tableSize, sizeof(stringEntry*));
  h->tableSize = tableSize;

  return h;
}
/* Parse a stream of Newick lexer tokens into `stack`, pushing one
 * pllNewickNodeInfo record per node (name, branch length, nesting depth,
 * leaf flag).  Returns 1 on success and 0 on a malformed token sequence
 * (unbalanced parentheses or a token in an illegal position).
 *
 * stack : output stack of node records
 * inp   : lexer cursor; tokens are consumed via the NEXT_TOKEN / CONSUME
 *         macros, which read from `input` and fill `token`
 *
 * NOTE(review): on the early `return (0)` error paths a partially filled
 * `item` (and its name/branch buffers) is not freed and leaks; verify
 * against the caller's cleanup.
 */
static int parse_newick (pllStack ** stack, int * inp)
{
  pllNewickNodeInfo * item = NULL;
  int item_active = 0;          /* set while a node record awaits its final push */
  pllLexToken token;
  int input;
  pllLexToken prev_token;       /* previous token drives the legality checks below */
  int nop = 0;                  /* number of open parentheses */
  int depth = 0;                /* current nesting depth in the tree */

  prev_token.tokenType = PLL_TOKEN_UNKNOWN;

  input = *inp;

  NEXT_TOKEN

  while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN)
   {
     switch (token.tokenType)
      {
        case PLL_TOKEN_OPAREN:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_OPAREN\n");
#endif
         ++nop;
         memcpy (&prev_token, &token, sizeof (pllLexToken));
         ++depth;
         break;

        case PLL_TOKEN_CPAREN:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_CPAREN\n");
#endif
         /* a ')' may only follow ')', a name, a number, or the stream start */
         if (prev_token.tokenType != PLL_TOKEN_CPAREN  &&
             prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
             prev_token.tokenType != PLL_TOKEN_STRING  &&
             prev_token.tokenType != PLL_TOKEN_NUMBER  &&
             prev_token.tokenType != PLL_TOKEN_FLOAT)
           return (0);

         if (!nop) return (0);        /* unmatched closing parenthesis */
         --nop;
         memcpy (&prev_token, &token, sizeof (pllLexToken));

         /* push to the stack, defaulting missing name / branch length */
         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

         if (item->name == NULL)
          {
            item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
            strcpy (item->name, "INTERNAL_NODE");
          }

         if (item->branch == NULL)
          {
            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
            strcpy (item->branch, "0.000000");
          }
         item->depth = depth;
         pllStackPush (stack, item);
         item_active = 1;             /* active = 1 */
         item = NULL;
         --depth;
         break;

        case PLL_TOKEN_STRING:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_STRING %.*s\n", token.len, token.lexeme);
#endif
         /* a taxon name may only follow '(', ')', ',' or the stream start */
         if (prev_token.tokenType != PLL_TOKEN_OPAREN  &&
             prev_token.tokenType != PLL_TOKEN_CPAREN  &&
             prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
             prev_token.tokenType != PLL_TOKEN_COMMA)
           return (0);

         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

         /* copy the (not NUL-terminated) lexeme into item->name */
         item->name = (char *) rax_malloc ((token.len + 1) * sizeof (char));
         strncpy (item->name, token.lexeme, token.len);
         item->name[token.len] = 0;

         item_active = 1;
         item->depth = depth;
         /* a name right after '(', ',' or the start denotes a leaf */
         if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
             prev_token.tokenType == PLL_TOKEN_OPAREN ||
             prev_token.tokenType == PLL_TOKEN_UNKNOWN)
           item->leaf = 1;
         memcpy (&prev_token, &token, sizeof (pllLexToken));
         break;

        case PLL_TOKEN_FLOAT:
        case PLL_TOKEN_NUMBER:
#ifdef PLLDEBUG
         if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n");
         else printf ("PLL_TOKEN_NUMBER\n");
#endif
         if (prev_token.tokenType != PLL_TOKEN_OPAREN  &&
             prev_token.tokenType != PLL_TOKEN_CPAREN  &&
             prev_token.tokenType != PLL_TOKEN_COLON   &&
             prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
             prev_token.tokenType != PLL_TOKEN_COMMA)
           return (0);

         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

         if (prev_token.tokenType == PLL_TOKEN_COLON)
          {
            /* a number after ':' is a branch length */
            item->branch = (char *) rax_malloc ((token.len + 1) * sizeof (char));
            strncpy (item->branch, token.lexeme, token.len);
            item->branch[token.len] = 0;
          }
         else
          {
            /* otherwise it is a numeric node label */
            if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
                prev_token.tokenType == PLL_TOKEN_OPAREN ||
                prev_token.tokenType == PLL_TOKEN_UNKNOWN)
              item->leaf = 1;
            item->name = (char *) rax_malloc ((token.len + 1) * sizeof (char));
            strncpy (item->name, token.lexeme, token.len);
            item->name[token.len] = 0;
          }
         item_active = 1;
         item->depth = depth;
         memcpy (&prev_token, &token, sizeof (pllLexToken));
         break;

        case PLL_TOKEN_COLON:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_COLON\n");
#endif
         /* ':' may only follow ')', a name, or a number */
         if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
             prev_token.tokenType != PLL_TOKEN_STRING &&
             prev_token.tokenType != PLL_TOKEN_FLOAT  &&
             prev_token.tokenType != PLL_TOKEN_NUMBER)
           return (0);
         memcpy (&prev_token, &token, sizeof (pllLexToken));
         break;

        case PLL_TOKEN_COMMA:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_COMMA\n");
#endif
         /* ',' may only follow ')', a name, or a number */
         if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
             prev_token.tokenType != PLL_TOKEN_STRING &&
             prev_token.tokenType != PLL_TOKEN_FLOAT  &&
             prev_token.tokenType != PLL_TOKEN_NUMBER)
           return (0);
         memcpy (&prev_token, &token, sizeof (pllLexToken));

         /* push the finished sibling node to the stack */
         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

         if (item->name == NULL)
          {
            item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
            strcpy (item->name, "INTERNAL_NODE");
          }
         if (item->branch == NULL)
          {
            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
            strcpy (item->branch, "0.000000");
          }
         item->depth = depth;
         pllStackPush (stack, item);
         item_active = 0;
         item = NULL;
         break;

        case PLL_TOKEN_SEMICOLON:
#ifdef PLLDEBUG
         printf ("PLL_TOKEN_SEMICOLON\n");
#endif
         /* push the root node to the stack */
         if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

         if (item->name == NULL)
          {
            item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
            strcpy (item->name, "ROOT_NODE");
          }
         if (item->branch == NULL)
          {
            item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
            strcpy (item->branch, "0.000000");
          }
         pllStackPush (stack, item);
         item_active = 0;
         item = NULL;
         break;

        default:
#ifdef __DEBUGGING_MODE
         printf ("Unknown token: %d\n", token.tokenType);
#endif
         // TODO: Finish this part and add error codes
         break;
      }
     NEXT_TOKEN
     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
   }

  /* a node record left over after EOF becomes the root */
  if (item_active)
   {
     if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));

     if (item->name == NULL)
      {
        item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
        strcpy (item->name, "ROOT_NODE");
      }
     if (item->branch == NULL)
      {
        item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
        strcpy (item->branch, "0.000000");
      }
     pllStackPush (stack, item);
     item_active = 0;
   }

  /* fail on unbalanced parentheses or an unrecognised trailing token */
  if (nop || token.tokenType == PLL_TOKEN_UNKNOWN)
   {
     return (0);
   }

  return (1);
}
/* Read a dot-bracket secondary-structure annotation file and fold it into
 * the partition model of `tr`.
 *
 * Phases (only run when adef->useSecondaryStructure is set):
 *   1. First pass over the file: count the structure characters and
 *      reject anything outside "( ) < > [ ] { } ." / whitespace; the
 *      count must equal the alignment length `sites`.
 *   2. Second pass: store the raw characters in
 *      tr->secondaryStructureInput and in `characters`.
 *   3. Per bracket class (4 classes: (), <>, [], {}), record bracket
 *      nesting depth in brackets[k][i], and validate balance; also reject
 *      brackets on non-DNA columns (checked via tr->extendedDataVector).
 *   4. Mark every bracketed column with the SECONDARY_DATA* data type
 *      matching tr->secondaryStructureModel (a column may belong to at
 *      most one bracket class - asserted via checkSum).
 *   5. If any secondary columns exist: re-point those columns to a new
 *      partition, rebuild tr->extendedPartitionData with one extra pInfo
 *      slot for the secondary-structure model, and compute
 *      tr->secondaryStructurePairs mapping each bracket column to its
 *      partner (-1 for unpaired columns).
 */
void parseSecondaryStructure(tree *tr, analdef *adef, int sites)
{
  if(adef->useSecondaryStructure)
    {
      FILE *f = myfopen(secondaryStructureFileName, "rb");

      int
        i, k,
        countCharacters = 0,
        ch,
        *characters,          /* raw structure characters, one per site */
        **brackets,           /* per bracket class: nesting depth at each site, 0 = no bracket */
        opening, closing,
        depth,
        numberOfSymbols,
        numSecondaryColumns;

      unsigned char bracketTypes[4][2] = {{'(', ')'}, {'<', '>'}, {'[', ']'}, {'{', '}'}};

      numberOfSymbols = 4;

      tr->secondaryStructureInput = (char*)rax_malloc(sizeof(char) * sites);

      /* pass 1: count and validate the structure characters */
      while((ch = fgetc(f)) != EOF)
        {
          if(ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '{' || ch == '}' || ch == '.')
            countCharacters++;
          else
            {
              if(!whitechar(ch))
                {
                  printf("Secondary Structure file %s contains character %c at position %d\n", secondaryStructureFileName, ch, countCharacters + 1);
                  printf("Allowed Characters are \"( ) < > [ ] { } \" and \".\" \n");
                  errorExit(-1);
                }
            }
        }

      if(countCharacters != sites)
        {
          printf("Error: Alignment length is: %d, secondary structure file has length %d\n", sites, countCharacters);
          errorExit(-1);
        }

      characters = (int*)rax_malloc(sizeof(int) * countCharacters);

      brackets = (int **)rax_malloc(sizeof(int*) * numberOfSymbols);
      for(k = 0; k < numberOfSymbols; k++)
        brackets[k] = (int*)rax_calloc(countCharacters, sizeof(int));

      /* pass 2: store the characters */
      rewind(f);
      countCharacters = 0;
      while((ch = fgetc(f)) != EOF)
        {
          if(!whitechar(ch))
            {
              tr->secondaryStructureInput[countCharacters] = ch;
              characters[countCharacters++] = ch;
            }
        }

      assert(countCharacters == sites);

      /* phase 3: per bracket class, record nesting depths and check balance */
      for(k = 0; k < numberOfSymbols; k++)
        {
          for(i = 0, opening = 0, closing = 0, depth = 0; i < countCharacters; i++)
            {
              /* brackets are only allowed on DNA columns */
              if((characters[i] == bracketTypes[k][0] || characters[i] == bracketTypes[k][1]) &&
                 (tr->extendedDataVector[i+1] == AA_DATA || tr->extendedDataVector[i+1] == BINARY_DATA ||
                  tr->extendedDataVector[i+1] == GENERIC_32 || tr->extendedDataVector[i+1] == GENERIC_64))
                {
                  printf("Secondary Structure only for DNA character positions \n");
                  printf("I am at position %d of the secondary structure file and this is not part of a DNA partition\n", i+1);
                  errorExit(-1);
                }

              if(characters[i] == bracketTypes[k][0])
                {
                  depth++;
                  brackets[k][i] = depth;
                  opening++;
                }

              if(characters[i] == bracketTypes[k][1])
                {
                  /* closing bracket stores the depth BEFORE the decrement,
                     so it matches its opening partner's stored depth */
                  brackets[k][i] = depth;
                  depth--;
                  closing++;
                }

              if(closing > opening)
                {
                  printf("at position %d there is a closing bracket too much\n", i+1);
                  errorExit(-1);
                }
            }

          if(depth != 0)
            {
              printf("Problem: Depth: %d\n", depth);
              printf("Your secondary structure file may be missing a closing or opening paraenthesis!\n");
            }
          assert(depth == 0);

          if(countCharacters != sites)
            {
              printf("Problem: sec chars: %d sites: %d\n",countCharacters, sites);
              printf("The number of sites in the alignment does not match the length of the secondary structure file\n");
            }
          assert(countCharacters == sites);

          if(closing != opening)
            {
              printf("Number of opening brackets %d should be equal to number of closing brackets %d\n", opening, closing);
              errorExit(-1);
            }
        }

      /* phase 4: flag bracketed columns with the secondary data type */
      for(i = 0, numSecondaryColumns = 0; i < countCharacters; i++)
        {
          int checkSum = 0;   /* how many bracket classes claim this column */

          for(k = 0; k < numberOfSymbols; k++)
            {
              if(brackets[k][i] > 0)
                {
                  checkSum++;

                  switch(tr->secondaryStructureModel)
                    {
                    case SEC_16:
                    case SEC_16_A:
                    case SEC_16_B:
                    case SEC_16_C:
                    case SEC_16_D:
                    case SEC_16_E:
                    case SEC_16_F:
                    case SEC_16_I:
                    case SEC_16_J:
                    case SEC_16_K:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA;
                      break;
                    case SEC_6_A:
                    case SEC_6_B:
                    case SEC_6_C:
                    case SEC_6_D:
                    case SEC_6_E:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA_6;
                      break;
                    case SEC_7_A:
                    case SEC_7_B:
                    case SEC_7_C:
                    case SEC_7_D:
                    case SEC_7_E:
                    case SEC_7_F:
                      tr->extendedDataVector[i+1] = SECONDARY_DATA_7;
                      break;
                    default:
                      assert(0);
                    }

                  numSecondaryColumns++;
                }
            }
          /* a column may belong to at most one bracket class */
          assert(checkSum <= 1);
        }

      /* every opening bracket has a closing partner, so the count is even */
      assert(numSecondaryColumns % 2 == 0);

      tr->numberOfSecondaryColumns = numSecondaryColumns;

      if(numSecondaryColumns > 0)
        {
          int model = tr->NumberOfModels;   /* index of the new secondary partition */
          int countPairs;
          pInfo *partBuffer = (pInfo*)rax_malloc(sizeof(pInfo) * tr->NumberOfModels);

          /* phase 5a: re-point all bracketed columns to the new partition */
          for(i = 1; i <= sites; i++)
            {
              for(k = 0; k < numberOfSymbols; k++)
                {
                  if(brackets[k][i-1] > 0)
                    tr->model[i] = model;
                }
            }

          /* now make a copy of partition data */
          for(i = 0; i < tr->NumberOfModels; i++)
            {
              partBuffer[i].partitionName = (char*)rax_malloc((strlen(tr->extendedPartitionData[i].partitionName) + 1) * sizeof(char));
              strcpy(partBuffer[i].partitionName, tr->extendedPartitionData[i].partitionName);
              strcpy(partBuffer[i].proteinSubstitutionFileName, tr->extendedPartitionData[i].proteinSubstitutionFileName);
              strcpy(partBuffer[i].ascFileName, tr->extendedPartitionData[i].ascFileName);
              partBuffer[i].dataType = tr->extendedPartitionData[i].dataType;
              partBuffer[i].protModels = tr->extendedPartitionData[i].protModels;
              partBuffer[i].usePredefinedProtFreqs = tr->extendedPartitionData[i].usePredefinedProtFreqs;
              partBuffer[i].optimizeBaseFrequencies = tr->extendedPartitionData[i].optimizeBaseFrequencies;
            }

          /* rebuild the partition array with one extra slot at the end */
          for(i = 0; i < tr->NumberOfModels; i++)
            rax_free(tr->extendedPartitionData[i].partitionName);
          rax_free(tr->extendedPartitionData);

          tr->extendedPartitionData = (pInfo*)rax_malloc(sizeof(pInfo) * (tr->NumberOfModels + 1));

          for(i = 0; i < tr->NumberOfModels; i++)
            {
              tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(partBuffer[i].partitionName) + 1) * sizeof(char));
              strcpy(tr->extendedPartitionData[i].partitionName, partBuffer[i].partitionName);
              strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, partBuffer[i].proteinSubstitutionFileName);
              strcpy(tr->extendedPartitionData[i].ascFileName, partBuffer[i].ascFileName);
              tr->extendedPartitionData[i].dataType = partBuffer[i].dataType;
              tr->extendedPartitionData[i].protModels= partBuffer[i].protModels;
              tr->extendedPartitionData[i].usePredefinedProtFreqs= partBuffer[i].usePredefinedProtFreqs;
              tr->extendedPartitionData[i].optimizeBaseFrequencies = partBuffer[i].optimizeBaseFrequencies;
              rax_free(partBuffer[i].partitionName);
            }
          rax_free(partBuffer);

          /* fill in the new (last) partition slot; i == tr->NumberOfModels here */
          tr->extendedPartitionData[i].partitionName = (char*)rax_malloc(64 * sizeof(char));

          switch(tr->secondaryStructureModel)
            {
            case SEC_16:
            case SEC_16_A:
            case SEC_16_B:
            case SEC_16_C:
            case SEC_16_D:
            case SEC_16_E:
            case SEC_16_F:
            case SEC_16_I:
            case SEC_16_J:
            case SEC_16_K:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 16 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA;
              break;
            case SEC_6_A:
            case SEC_6_B:
            case SEC_6_C:
            case SEC_6_D:
            case SEC_6_E:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 6 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA_6;
              break;
            case SEC_7_A:
            case SEC_7_B:
            case SEC_7_C:
            case SEC_7_D:
            case SEC_7_E:
            case SEC_7_F:
              strcpy(tr->extendedPartitionData[i].partitionName, "SECONDARY STRUCTURE 7 STATE MODEL");
              tr->extendedPartitionData[i].dataType = SECONDARY_DATA_7;
              break;
            default:
              assert(0);
            }

          tr->extendedPartitionData[i].protModels= -1;
          tr->extendedPartitionData[i].usePredefinedProtFreqs = FALSE;

          tr->NumberOfModels++;

          if(adef->perGeneBranchLengths)
            {
              if(tr->NumberOfModels > NUM_BRANCHES)
                {
                  printf("You are trying to use %d partitioned models for an individual per-gene branch length estimate.\n", tr->NumberOfModels);
                  printf("Currently only %d are allowed to improve efficiency.\n", NUM_BRANCHES);
                  printf("Note that the number of partitions has automatically been incremented by one to accommodate secondary structure models\n");
                  printf("\n");
                  printf("In order to change this please replace the line \"#define NUM_BRANCHES %d\" in file \"axml.h\" \n", NUM_BRANCHES);
                  printf("by \"#define NUM_BRANCHES %d\" and then re-compile RAxML.\n", tr->NumberOfModels);
                  exit(-1);
                }
              else
                {
                  tr->multiBranch = 1;
                  tr->numBranches = tr->NumberOfModels;
                }
            }

          assert(countCharacters == sites);

          /* phase 5b: match each bracket column with its partner */
          tr->secondaryStructurePairs = (int*)rax_malloc(sizeof(int) * countCharacters);
          for(i = 0; i < countCharacters; i++)
            tr->secondaryStructurePairs[i] = -1;

          countPairs = 0;

          for(k = 0; k < numberOfSymbols; k++)
            {
              i = 0;
              while(i < countCharacters)
                {
                  int j = i, bracket = -1, openBracket, closeBracket;

                  /* skip non-bracket columns of this class */
                  while(j < countCharacters && ((bracket = brackets[k][j]) == 0))
                    {
                      i++;
                      j++;
                    }

                  assert(bracket >= 0);

                  if(j == countCharacters)
                    {
                      assert(bracket == 0);
                      break;
                    }

                  openBracket = j;
                  j++;

                  /* the partner is the next column stored with the same depth;
                     NOTE(review): brackets[k][j] is read before the j bound
                     check - a malformed depth table would read one past the
                     end; the assert below would then fire after the read */
                  while(bracket != brackets[k][j] && j < countCharacters)
                    j++;
                  assert(j < countCharacters);
                  closeBracket = j;

                  assert(closeBracket < countCharacters && openBracket < countCharacters);
                  assert(brackets[k][closeBracket] > 0 && brackets[k][openBracket] > 0);

                  /* consume the pair so it is not matched again */
                  brackets[k][closeBracket] = 0;
                  brackets[k][openBracket] = 0;
                  countPairs++;

                  tr->secondaryStructurePairs[closeBracket] = openBracket;
                  tr->secondaryStructurePairs[openBracket] = closeBracket;
                }
              assert(i == countCharacters);
            }

          assert(countPairs == numSecondaryColumns / 2);

          adef->useMultipleModel = TRUE;
        }

      for(k = 0; k < numberOfSymbols; k++)
        rax_free(brackets[k]);
      rax_free(brackets);

      rax_free(characters);

      fclose(f);
    }
}
/* Draw one bootstrap replicate of the alignment in place.
 *
 * Per partition, site weights are resampled by throwing `nonzero` darts
 * (nonzero = total original weight of that partition's sites) into a
 * weight buffer and mapping the hits back onto the sites.  Sites whose
 * resampled weight is zero are then compacted out of the weight, data
 * vector, model and (when isRapid) rate-category / invariant arrays, and
 * the sequence data in y0 is compacted to match.  Finally
 * fixModelIndices() is called and the total weight is checked against
 * the original site count.
 *
 * tr                     : tree/alignment state, mutated in place
 * randomSeed             : PRNG state consumed by randum()
 * originalRateCategories : per-site rate categories (used when isRapid)
 * originalInvariant      : per-site invariant flags  (used when isRapid)
 * isRapid                : also carry rate categories / invariant flags
 * fixRates               : forwarded to fixModelIndices()
 */
void computeNextReplicate(tree *tr, long *randomSeed, int *originalRateCategories, int *originalInvariant, boolean isRapid, boolean fixRates)
{
  int j, model, w, *weightBuffer, endsite, *weights, i, l;

  /* reset all site weights before resampling */
  for(j = 0; j < tr->originalCrunchedLength; j++)
    tr->cdta->aliaswgt[j] = 0;

  for(model = 0; model < tr->NumberOfModels; model++)
    {
      int
        nonzero = 0,   /* total original weight of this partition */
        pos = 0;       /* running offset into weightBuffer */

      for (j = 0; j < tr->originalCrunchedLength; j++)
        {
          if(tr->originalModel[j] == model)
            nonzero += tr->originalWeights[j];
        }

      weightBuffer = (int *)rax_calloc(nonzero, sizeof(int));

      /* multinomial draw: nonzero samples over nonzero slots;
         assumes randum() returns a value in [0,1) - TODO confirm,
         a return of exactly 1.0 would index one past the buffer */
      for (j = 0; j < nonzero; j++)
        weightBuffer[(int) (nonzero*randum(randomSeed))]++;

      /* map the buffer slots back onto sites: site j owns
         tr->originalWeights[j] consecutive slots */
      for(j = 0; j < tr->originalCrunchedLength; j++)
        {
          if(model == tr->originalModel[j])
            {
              for(w = 0; w < tr->originalWeights[j]; w++)
                {
                  tr->cdta->aliaswgt[j] += weightBuffer[pos];
                  pos++;
                }
            }
        }

      rax_free(weightBuffer);
    }

  /* count the sites that survived the resampling */
  endsite = 0;
  for (j = 0; j < tr->originalCrunchedLength; j++)
    {
      if(tr->cdta->aliaswgt[j] > 0)
        endsite++;
    }

  weights = tr->cdta->aliaswgt;

  /* compact the sequence data: keep only columns with positive weight */
  for(i = 0; i < tr->rdta->numsp; i++)
    {
      unsigned char
        *yPos    = &(tr->rdta->y0[((size_t)tr->originalCrunchedLength) * ((size_t)i)]),
        *origSeq = &(tr->rdta->yBUF[((size_t)tr->originalCrunchedLength) * ((size_t)i)]);

      for(j = 0, l = 0; j < tr->originalCrunchedLength; j++)
        if(tr->cdta->aliaswgt[j] > 0)
          yPos[l++] = origSeq[j];
    }

  /* compact the per-site bookkeeping arrays in the same order */
  for(j = 0, l = 0; j < tr->originalCrunchedLength; j++)
    {
      if(weights[j])
        {
          tr->cdta->aliaswgt[l] = tr->cdta->aliaswgt[j];
          tr->dataVector[l]     = tr->originalDataVector[j];
          tr->model[l]          = tr->originalModel[j];

          if(isRapid)
            {
              tr->cdta->rateCategory[l] = originalRateCategories[j];
              tr->invariant[l]          = originalInvariant[j];
            }
          l++;
        }
    }

  tr->cdta->endsite = endsite;
  fixModelIndices(tr, endsite, fixRates);

  /* sanity check: resampled weights must still sum to the site count */
  {
    int count = 0;

    for(j = 0; j < tr->cdta->endsite; j++)
      count += tr->cdta->aliaswgt[j];

    if(count != tr->rdta->sites)
      printf("count=%d\ttr->rdta->sites=%d\n",count, tr->rdta->sites );
    assert(count == tr->rdta->sites);
  }
}
//Use the plausibility checker overhead void plausibilityChecker(tree *tr, analdef *adef) { FILE *treeFile, *treeFile2, *rfFile; tree *smallTree = (tree *)rax_malloc(sizeof(tree)); char rfFileName[1024]; int numberOfTreesAnalyzed = 0, i; double avgRF = 0.0, sumEffectivetime = 0.0; /* set up an output file name */ strcpy(rfFileName, workdir); strcat(rfFileName, "RAxML_RF-Distances."); strcat(rfFileName, run_id); rfFile = myfopen(rfFileName, "wb"); assert(adef->mode == PLAUSIBILITY_CHECKER); /* open the big reference tree file and parse it */ treeFile = myfopen(tree_file, "r"); printBothOpen("Parsing reference tree %s\n", tree_file); treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE); assert(tr->mxtips == tr->ntips); /*************************************************************************************/ /* Preprocessing Step */ double preprocesstime = gettime(); /* taxonToLabel[2*tr->mxtips - 2]; Array storing all 2n-2 labels from the preordertraversal: (Taxonnumber - 1) -> (Preorderlabel) */ int *taxonToLabel = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)), /* taxonHasDeg[2*tr->mxtips - 2] Array used to store the degree of every taxon, is needed to extract Bipartitions from multifurcating trees (Taxonnumber - 1) -> (degree of node(Taxonnumber)) */ *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)), /* taxonToReduction[2*tr->mxtips - 2]; Array used for reducing bitvector and speeding up extraction: (Taxonnumber - 1) -> Index in smallTreeTaxa (starting from 0) which is also: (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree)) (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */ *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); int newcount = 0; //counter used for correct traversals /* labelToTaxon[2*tr->mxtips - 2]; is used to translate between Perorderlabel and p->number: (Preorderlabel) -> (Taxonnumber) */ int *labelToTaxon = (int 
*)rax_malloc((2*tr->mxtips - 2) * sizeof(int)); /* Preorder-Traversal of the large tree */ preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount); newcount = 0; //counter set to 0 to be now used for Eulertraversal /* eulerIndexToLabel[4*tr->mxtips - 5]; Array storing all 4n-5 PreOrderlabels created during eulertour: (Eulerindex) -> (Preorderlabel) */ int* eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int)); /* taxonToEulerIndex[tr->mxtips]; Stores all indices of the first appearance of a taxa in the eulerTour: (Taxonnumber - 1) -> (Index of the Eulertour where Taxonnumber first appears) is used for efficient computation of the Lowest Common Ancestor during Reconstruction Step */ int* taxonToEulerIndex = (int *)rax_malloc((tr->mxtips) * sizeof(int)); /* Init taxonToEulerIndex and taxonToReduction */ int ix; for(ix = 0; ix < tr->mxtips; ++ix) taxonToEulerIndex[ix] = -1; for(ix = 0; ix < (2*tr->mxtips - 2); ++ix) taxonToReduction[ix] = -1; /* Eulertraversal of the large tree*/ unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex); /* Creating RMQ Datastructure for efficient retrieval of LCAs, using Johannes Fischers Library rewritten in C Following Files: rmq.h,rmqs.c,rmqs.h are included in Makefile.RMQ.gcc */ RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5); double preprocessendtime = gettime() - preprocesstime; /* Proprocessing Step End */ /*************************************************************************************/ printBothOpen("The reference tree has %d tips\n", tr->ntips); fclose(treeFile); /***********************************************************************************/ /* RF-OPT Preprocessing Step */ /***********************************************************************************/ /* now see how many small trees we have */ treeFile = getNumberOfTrees(tr, bootStrapFile, adef); treeFile2 = getNumberOfTrees(tr, 
bootStrapFile, adef); checkTreeNumber(tr->numberOfTrees, bootStrapFile); /* allocate a data structure for parsing the potentially mult-furcating tree */ allocateMultifurcations(tr, smallTree); /* Start Additional preprocessing step */ int numberOfBips = 0, numberOfSets = 0; //Stores the number of bips of each tree int *bipsPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //Stores the number of taxa for each tree int *taxaPerTree = (int *)rax_malloc(tr->numberOfTrees * sizeof(int)); //To calculate all bipartitions, I created a new treeFile2 and a new getNumberOfTrees method!! for(i = 0; i < tr->numberOfTrees; i++) { int this_treeBips = readMultifurcatingTree(treeFile2, smallTree, adef, TRUE); numberOfBips = numberOfBips + this_treeBips; numberOfSets = numberOfSets + this_treeBips * this_treeBips; bipsPerTree[i] = this_treeBips; } printf("numberOfBips: %i , numberOfSets: %i \n \n", numberOfBips, numberOfSets); //stores induced bips (OLD?) unsigned int *ind_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores smalltree bips (OLD?) 
unsigned int *s_bips = (unsigned int *)rax_malloc(numberOfBips * sizeof(unsigned int)); //stores small bips per tree unsigned int ***sBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores induced bips per tree unsigned int ***indBipsPerTree = (unsigned int ***)rax_malloc(tr->numberOfTrees * sizeof(unsigned int**)); //stores vLength of each tree for processing bitVectors unsigned int *vectorLengthPerTree = (unsigned int *)rax_malloc(tr->numberOfTrees * sizeof(unsigned int*)); //stores the corresponding tree number for each bip int *treenumberOfBip = (int *)rax_malloc(numberOfBips * sizeof(int)); //Stores all dropsets of all trees int **sets = (int **)rax_malloc(numberOfSets * sizeof(int*)); //int **sets = NULL; //For each tree, stores a translation array from taxanumber smalltree->largetree int **smallTreeTaxaList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //For each tree, store a translation array from taxanumber largetree->smalltree int **taxonToReductionList = (int **)rax_malloc(tr->numberOfTrees * sizeof(int*)); //I use these variables as global variables for all trees to determine the max number of possible sets to generate a static array int currentBips = 0; int currentSmallBips = 0; int currentSets = 0; //int currentTree = 0; already there in number of trees analyzed //Prefill sets with -1s for(int it = 0;it < (numberOfSets);it++){ int fill[1] = {-1}; sets[it] = fill; } /***********************************************************************************/ /* RF-OPT Preprocessing Step End */ /***********************************************************************************/ /* loop over all small trees */ for(i = 0; i < tr->numberOfTrees; i++) { int numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE); if(numberOfSplits > 0) { int firstTaxon; double rec_rf, maxRF; if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, 
numberOfSplits); /* compute the maximum RF distance for computing the relative RF distance later-on */ /* note that here we need to pay attention, since the RF distance is not normalized by 2 * (n-3) but we need to account for the fact that the multifurcating small tree will potentially contain less bipartitions. Hence the normalization factor is obtained as n-3 + numberOfSplits, where n-3 is the number of bipartitions of the pruned down large reference tree for which we know that it is bifurcating/strictly binary */ maxRF = (double)(2 * numberOfSplits); /* now get the index of the first taxon of the small tree. we will use this to unambiguously store the bipartitions */ firstTaxon = smallTree->start->number; //Saves the number of taxa in the tree (for RF-OPT) taxaPerTree[numberOfTreesAnalyzed] = smallTree->ntips; /***********************************************************************************/ /* Reconstruction Step */ double time_start = gettime(); /* Init hashtable to store Bipartitions of the induced subtree T|t_i */ /* using smallTree->ntips instead of smallTree->mxtips yields faster code e.g. 
120 versus 128 seconds for 20,000 small trees on my laptop */ hashtable *s_hash = initHashTable(smallTree->ntips * 4); /* Init hashtable to store Bipartitions of the reference tree t_i*/ hashtable *ind_hash = initHashTable(smallTree->ntips * 4); /* smallTreeTaxa[smallTree->ntips]; Stores all taxa numbers from smallTree into an array called smallTreeTaxa: (Index) -> (Taxonnumber) */ int* smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* counter is set to 0 for correctly extracting taxa of the small tree */ newcount = 0; int newcount2 = 0; /* seq2[2*smallTree->ntips - 2]; stores PreorderSequence of the reference smalltree: (Preorderindex) -> (Taxonnumber) */ int* seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int)); /* used to store the vectorLength of the bitvector */ unsigned int vectorLength; /* extract all taxa of the smalltree and store it into an array, also store all counts of taxa and nontaxa in taxonToReduction */ rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2); rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2); /* counter is set to 0 to correctly preorder traverse the small tree */ newcount = 0; /* Preordertraversal of the small reference tree and save its sequence into seq2 for later extracting the bipartitions, it also stores information about the degree of every node */ rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount); /* calculate the bitvector length */ if(smallTree->ntips % MASK_LENGTH == 0) vectorLength = smallTree->ntips / MASK_LENGTH; else vectorLength = 1 + (smallTree->ntips / MASK_LENGTH); /***********************************************************************************/ /* RF-OPT Additional Preprocessing storing Bipartitions */ 
/***********************************************************************************/ vectorLengthPerTree[numberOfTreesAnalyzed] = vectorLength; unsigned int **bitVectors = rec_initBitVector(smallTree, vectorLength); unsigned int **sBips; /* store all non trivial bitvectors using an subtree approach for the reference subtree and store it into a hashtable, this method was changed for multifurcation */ sBips = RFOPT_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); sBipsPerTree[numberOfTreesAnalyzed] = sBips; /***********************************************************************************/ /* End RF-OPT Additional Preprocessing storing Bipartitions */ /***********************************************************************************/ /* counter is set to 0 to be used for correctly storing all EulerIndices */ newcount = 0; /* smallTreeTaxonToEulerIndex[smallTree->ntips]; Saves all first Euler indices for all Taxons appearing in small Tree: (Index) -> (Index of the Eulertour where the taxonnumber of the small tree first appears) */ int* smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int)); /* seq[(smallTree->ntips*2) - 1] Stores the Preordersequence of the induced small tree */ int* seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int)); /* iterate through all small tree taxa */ for(ix = 0; ix < smallTree->ntips; ix++) { int taxanumber = smallTreeTaxa[ix]; /* To create smallTreeTaxonToEulerIndex we filter taxonToEulerIndex for taxa in the small tree*/ smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1]; /* Saves all Preorderlabel of the smalltree taxa in seq*/ seq[newcount] = taxonToLabel[taxanumber-1]; newcount++; } /* sort the euler indices to correctly calculate LCA */ //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1); qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), 
sortIntegers); //printf("newcount2 %i \n", newcount2); /* Iterate through all small tree taxa */ for(ix = 1; ix < newcount; ix++) { /* query LCAs using RMQ Datastructure */ seq[newcount - 1 + ix] = eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])]; /* Used for dynamic programming. We save an index for every inner node: For example the reference tree has 3 inner nodes which we saves them as 0,1,2. Now we calculate for example 5 LCA to construct the induced subtree, which are also inner nodes. Therefore we mark them as 3,4,5,6,7 */ taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2; newcount2 += 1; } /* sort to construct the Preordersequence of the induced subtree */ //quicksort(seq,0,(2*smallTree->ntips - 2)); qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers); /* calculates all bipartitions of the reference small tree and count how many bipartition it shares with the induced small tree and stores those bipartitions in a additional hashtable called ind_hash */ int rec_bips = 0; unsigned int **indBips; indBips = RFOPT_findAddBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips, vectorLength, smallTree->ntips, firstTaxon, s_hash, ind_hash, taxonToReduction); indBipsPerTree[numberOfTreesAnalyzed] = indBips; /* calculates all bipartitions of the reference small tree and put them into ind_hash*/ // rec_extractBipartitionsMulti(bitVectors, seq2, (2*smallTree->ntips - 1),tr->mxtips, vectorLength, smallTree->ntips, // firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits); /* Reconstruction Step End */ /***********************************************************************************/ double effectivetime = gettime() - time_start; /* if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime); */ /* compute the relative RF */ /***********************************************************************************/ 
/* RF-OPT Save Translation Vectors */ /***********************************************************************************/ //copy array taxonToReduction because it is originally defined in preprocessing step int * taxonToReductionCopy = (int *)rax_malloc((tr->mxtips)*sizeof(int)); memcpy(taxonToReductionCopy,taxonToReduction,(tr->mxtips)*sizeof(int)); //storing smallTree and taxonToReduction Arrays for further usage smallTreeTaxaList[numberOfTreesAnalyzed] = smallTreeTaxa; taxonToReductionList[numberOfTreesAnalyzed] = taxonToReductionCopy; int this_currentSmallBips = 0; //Variable resets everytime for each tree analyzed /***********************************************************************************/ /* End RF-OPT Save Translation Vectors */ /***********************************************************************************/ rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF; assert(numberOfSplits >= rec_bips); avgRF += rec_rf; sumEffectivetime += effectivetime; //if(numberOfTreesAnalyzed % 100 == 0) printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf); fprintf(rfFile, "%d %f\n", i, rec_rf); //rax_free(smallTreeTaxa); //Need it for calculating the SmallTreeTaxaList after all iterations! 
rax_free(seq); rax_free(seq2); rax_free(smallTreeTaxonToEulerIndex); numberOfTreesAnalyzed++; //Counting the number of trees analyzed } }// End of Small Tree Iterations /***********************************************************************************/ /* RF-OPT DropSet Calculation using BitVectors */ /***********************************************************************************/ log_info("===> Create DropSet Datastructure \n"); static Hashmap* map = NULL; //Set a hashmap for dropsets with a dropset comparision and standard hash map = Hashmap_create(compareDropSet, NULL); static Hashmap** mapArray = NULL; //Set an array to store the pointers to bitvector hashtables for each tree mapArray = rax_malloc(tr->numberOfTrees * sizeof(Hashmap*)); printf("===> BitVector Set Calculation \n"); //Calculate dropsets of two given bips lists and extract all sets into array sets and into a hashmap. Each set has following format //dropset = {taxa_1,taxa_2,...,taxa_n,-1}; //Furtheremore calculate Dropset generates two data structures from type bips and dropsets which are pointing to each other in hashtables calculateDropSets(mapArray, map, indBipsPerTree, sBipsPerTree, sets, smallTreeTaxaList, bipsPerTree, taxaPerTree, vectorLengthPerTree, tr->numberOfTrees); /***********************************************************************************/ /* RF-OPT Graph Construction */ /***********************************************************************************/ // printf("\n == Sets == \n"); // for(int fooo = 0; fooo < numberOfSets; fooo++){ // printf("Set %i: ", fooo); // int i = 0; // while(sets[fooo][i] > -1) { // printf("%i ",sets[fooo][i]); // i++; // } // printf("\n"); // } // printf("\n"); /* Filter for unique sets */ log_info("===> Hashmap tests...\n"); Hashmap_traverse(map, traverse_cb); // int key[2] = {0,-1}; // Dropset* drop = Hashmap_get(map,key); // DArray* bips = drop->bipartitions; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = 
DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } // Bipartition* bipFromHash = DArray_first(bips); // Bipartition* testBip = Hashmap_get(mapArray[0],bipFromHash->bitvector); // printf("matching before: %i",testBip->matching); // testBip->matching = 999; // for(int i = 0; i < DArray_count(bips); i++) { // Bipartition* bip = DArray_get(bips,i); // printBitVector(bip->bitvector[0]); // printf("matching: %i \n", bip->matching); // printf("tree: %i \n", bip->treenumber); // } printf("===> Filter for unique sets (naive)...\n"); /* unique sets array data structures */ int** uniqSets = (int **) rax_malloc(sizeof(int*) * numberOfSets); int* setsToUniqSets = (int*) rax_malloc(sizeof(int) * numberOfSets); int numberOfUniqueSets = 0; int dropSetCount = 0; //stores the scores for each bips, we are using a bitvector approach (need to scale) //Legacy Code int bvec_scores = 0; numberOfUniqueSets = getUniqueDropSets(sets, uniqSets, setsToUniqSets, numberOfSets); printf("number of unique sets: %i \n", numberOfUniqueSets); /* Detect initial matchings, we calculate them using bitvectors to represent our bipartitions */ printf("===> Detect initial matchings...\n"); int vLengthBip = 0; //determine the bitVector Length of our bitVector if(numberOfBips % MASK_LENGTH == 0) vLengthBip = numberOfBips / MASK_LENGTH; else vLengthBip = numberOfBips / MASK_LENGTH + 1; //Initialize a bvecScore vector with 0s int* bvecScores = (int*)rax_calloc(vLengthBip,sizeof(int)); //Calculate Initial Matchings and save the result in bvecScores detectInitialMatchings(sets, bvecScores, bipsPerTree, numberOfTreesAnalyzed, vLengthBip); //Short summary until now: // - bipsPerTree consists of all bipartitions per tree // - bvecScores is the bitvector setting 1 to all bipartition indices which can score // - taxaPerTree number of taxa per tree // - smallTreeTaxaList list of all smalltree->largetree translation arrays 
/* Generate useful data structures for calculating and updating scores */ printf("===> Create data structures...\n"); //Stores the number of bips per Set and initialize it with 0s int* numberOfBipsPerSet = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); //Stores all sets which includes this taxa int **setsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) *sizeof(int*)); //Now calculate number of bipartitions affected by each unique set for(int i = 0; i < numberOfSets; i++) { int setindex = setsToUniqSets[i]; numberOfBipsPerSet[setindex]++; } //Now using the knowledge of how many bips there are per set, generate an array for each unique dropset containing all bips int** bipsOfDropSet = (int**)rax_malloc(sizeof(int*)*numberOfUniqueSets); //Allocate the space needed for storing all bips for(int i = 0; i < numberOfUniqueSets; i++) { bipsOfDropSet[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerSet[i]); } printf("==> Initialize the Bips Of Taxa \n"); //Stores the number of bips each taxa is included (ABC|DE is stored by A,B,C,D and E) //It can be calculated by iterating through all trees and adding the taxa int **bipsOfTaxa = (int**)rax_malloc((tr->mxtips + 1) * sizeof(int*)); int *numberOfBipsPerTaxa = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); int *taxaBipsCounter = (int*)rax_calloc((tr->mxtips + 1), sizeof(int)); //Now add up all for (int tree = 0; tree < tr->numberOfTrees; tree++) { int* list = smallTreeTaxaList[tree]; for (int j = 0; j < taxaPerTree[tree]; j++) { int taxa = list[j]; numberOfBipsPerTaxa[taxa] = numberOfBipsPerTaxa[taxa] + bipsPerTree[tree]; } } //Now create dummy arrays inside bipsOfTaxa for(int i = 1; i < tr->mxtips+1; i++) { bipsOfTaxa[i] = (int*)rax_malloc(sizeof(int)*numberOfBipsPerTaxa[i]); } printf("==> Storing all bip indices of a certain dropset into an array \n"); //For checking if all dropsets are iterated dropSetCount = 0; //Arrays of counter to keep in track int* counterOfSet = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); for(int 
i = 0; i < numberOfUniqueSets; i++) { counterOfSet[i] = 0; } currentBips = 0; //Need to keep in track of the number of bips //First iterate through all trees for(int i = 0; i < numberOfTreesAnalyzed; i++ ) { //get the correct smallTreeTaxa List int* list = smallTreeTaxaList[i]; //For each bipartition in the tree for(int j = 0; j < bipsPerTree[i]; j++) { //Look at all bips it is compared too int dropSetsPerBip = bipsPerTree[i]; for(int k = 0; k < dropSetsPerBip; k++){ int indexOfUniqDropSet = setsToUniqSets[dropSetCount + k]; int* bips_array = bipsOfDropSet[indexOfUniqDropSet]; //add bipartition j into the bips array of its dropset bips_array[counterOfSet[indexOfUniqDropSet]] = currentBips; //increment the internal array index counterOfSet[indexOfUniqDropSet]++; } //Jump to the next correct dropSetCount! dropSetCount = dropSetCount + dropSetsPerBip; //now insert the bip into bipsOfTaxa Array for(int ix = 0; ix < taxaPerTree[i]; ix++) { //get the taxa number int stree_Taxa = list[ix]; //get the bips list of this taxa number int* bipsList = bipsOfTaxa[stree_Taxa]; //now get the position of the biplist and put in our bip index bipsList[taxaBipsCounter[stree_Taxa]] = currentBips; //increment the counter taxaBipsCounter[stree_Taxa]++; } //increment currentBips currentBips++; } } /***********************************************************************************/ /* End RF-OPT Graph Construction */ /***********************************************************************************/ /* Short summary : sets - array of all dropsets uniqSets - array of all unique dropsets bipsPerTree - bips per tree setsToUniqSets - translates the index of sets to the index of its unique dropset index bipsOfDropSets - all bips which disappear when dropset i is pruned scores - has all scores between 0 and 1 for the bips (however 0s can be found out by looking at all dropsets with link to dropset 0 (because we sort and it will always be the lowest)) */ 
/***********************************************************************************/ /* RF-OPT Initial Score Calculation */ /***********************************************************************************/ unsigned int bipsVectorLength; /* calculate the bitvector length for bips bitvector */ if(numberOfBips % MASK_LENGTH == 0) bipsVectorLength = numberOfBips / MASK_LENGTH; else bipsVectorLength = 1 + (numberOfBips / MASK_LENGTH); //Starting from index 1 (because 0 stands for all who already matches) //We need a score array saving the scores for each uniqset int* rf_score = (int*)rax_calloc(numberOfUniqueSets,sizeof(int)); printf("==> Calculating the score for the first iteration \n \n"); //Store all bvecs of all merged and destroyed bipartitions per DropSet int* bvecs_bips = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); int* bvecs_destroyed = (int*)rax_malloc(sizeof(int)*numberOfUniqueSets); //Iterate through all sets for(int i = 0; i < numberOfUniqueSets; i++) { //Bitvectors of merged and destroyed int bvec_destroyed = 0; int* set = uniqSets[i]; //Get the dropset, first dropset is 0 (if something is matching) //printf(" ==> Analyze Unique DropSet %i \n", i); //We use this data structure to keep track of the to toggled bits int* toggleBits = (int*)rax_calloc(numberOfBips, sizeof(int)); //Now iterate through the set int j = 0; //Stores the affected bips into a bitvector int bvec_bips = 0; while(set[j] != -1) { int taxa = set[j]; //Get the taxa //printf(" Taxa number is %i \n",taxa); //Check if set[j] is itself already a set int test[2] = {taxa,-1}; //0 if it is not a set, index + 1 otherwise int test_index = contains(test, uniqSets, numberOfUniqueSets); if(test_index){ //printf(" It also is in uniqSet %i \n", test_index - 1); bvec_bips = getBipsOfDropSet(bvec_bips, (test_index - 1), numberOfBipsPerSet, bipsOfDropSet); } //Get all bips of this taxa to detect which one will be destroyed int* listOfBips = bipsOfTaxa[taxa]; //Go through all bipartitions 
containing this taxa for(int k = 0; k < numberOfBipsPerTaxa[taxa]; k++){ int bipindex = listOfBips[k]; //Get the index of the Bipartition int bip = ind_bips[bipindex]; //Now analyze this Bipartition //Which tree does this bipartition belongs too? int treenumber = treenumberOfBip[bipindex]; //Get the taxonToSmallTree Array of this tree int* stTaxa = taxonToReductionList[treenumber]; //Translate the global taxon number it into the local index used by our bips int translated_index = stTaxa[taxa - 1]; //We use taxa - 1 because we start counting at taxa 1 = 0 ! //Save the to toggle index into toggleBits vector toggleBits[bipindex] |= 1 << translated_index; //Sort for bits set on one side of the bip and on the other side int leftBits = __builtin_popcount(toggleBits[bipindex] & bip); int rightBits = __builtin_popcount(toggleBits[bipindex]) - leftBits; //Check for the number of bits set in the original bip int leftBip = __builtin_popcount(bip); int rightBip = taxaPerTree[treenumber] - leftBip; //Subtract the total number of bits set on one side of the bip with the bits we have to toggle int leftBip_after = leftBip - leftBits; int rightBip_after = rightBip - rightBits; //Check if bipartition gets trivial/destroyed due to pruning the taxa and set the bit (representing the bip) which is destroyed if((leftBip_after <= 1) | (rightBip_after <=1)) { //Add bips to the bits which represent destroyed bipartitions bvec_destroyed = setBit(bvec_destroyed,bipindex); } } j++; }//End iterate through the set int penality = 0; int score = 0; int bvec_mask = 0; bvec_mask = setOffSet(bvec_mask, numberOfBips); //Bitvector of already matching bips int bvec_tmp = 0; bvec_tmp = ~bvec_scores & bvec_mask; //Penality score are all bitvectors who were matching but is destroyed penality = __builtin_popcount(bvec_destroyed & bvec_tmp); //Now iterate through bipsOfDropSet list and extract all bips which will merge into a bitVector bvec_bips = getBipsOfDropSet(bvec_bips, i, numberOfBipsPerSet, 
bipsOfDropSet); //Calculate the bitvectors which remains bvec_tmp = ~bvec_destroyed & bvec_mask; bvec_tmp = bvec_bips & bvec_tmp; score = __builtin_popcount(bvec_scores & bvec_tmp); rf_score[i] = score - penality; //Save our results for convenience into an array bvecs_bips[i] = bvec_bips; bvecs_destroyed[i] = bvec_destroyed; }//End Score Calculation printf("======> Scores:\n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("RF Score for %i : %i \n", i, rf_score[i]); //printBitVector(bvecs_bips[i]); //printBitVector(bvecs_destroyed[i]); } int maxDropSet = getMax(rf_score, numberOfUniqueSets); printf("Max Element is %i \n", maxDropSet); /***********************************************************************************/ /* RF-OPT Create Update Data Structures */ /***********************************************************************************/ printf("====> Delete DropSet from all bips and update numbers \n"); //Create a bitVector to store all deleted taxa int bvec_deletedTaxa = 0; //Create a bitVector to store all still existing bips int bvec_existingBips = 0; //Create a bitvector to store deleted dropsets int bvec_deletedDropSets = 0; //Get the dropset int* deleteDropSet = uniqSets[maxDropSet]; //Store it into a BitVector bvec_deletedDropSets = setBit(bvec_deletedDropSets,maxDropSet); //Select all bips destroyed by removing this dropset int bvec_destroyedBips = bvecs_destroyed[maxDropSet]; //Select all bips that are now matching int bvec_matchingBips = bvecs_bips[maxDropSet]; //Filter for existent bipartitions bvec_existingBips = getExistingBips(bvec_existingBips, numberOfBips, bvec_destroyedBips); //Iterate through its taxa int iterSet = 0; while(deleteDropSet[iterSet] != -1) { //Get taxon int taxon = deleteDropSet[iterSet]; //Store the taxon into deletedTaxa BitVector bvec_deletedTaxa = setBit(bvec_deletedTaxa, taxon - 1); //Check if taxon is inside int test[2] = {taxon, -1}; int index = contains(test, uniqSets, numberOfUniqueSets); iterSet++; } 
//printBitVector(bvec_existingBips); //printBitVector(bvec_deletedTaxa); //Update the scores with now matching bips bvec_scores = bvec_scores & (~bvec_matchingBips); //printBitVector(bvec_scores); /* Short summary : bvec_existingBips - bitVector of all still existing bips bvec_deletedTaxa - bitVector to keep track of destroyed taxa */ /***********************************************************************************/ /* TODO RF-OPT Update function */ /***********************************************************************************/ /***********************************************************************************/ /* End RF-OPT Update function */ /***********************************************************************************/ //printf("Ind Bipartitions?: "); // printf("Induced Bipartitions: "); // printBitVector(ind_bips[0]); // printBitVector(ind_bips[1]); // printBitVector(ind_bips[2]); // printBitVector(ind_bips[3]); // printBitVector(ind_bips[4]); // printBitVector(ind_bips[5]); // printBitVector(ind_bips[6]); /***********************************************************************************/ /* Console Logs for debugging */ /***********************************************************************************/ //Printing if printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); //#define _PRINT_ #ifdef _PRINT_ for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("Induced Bips! 
\n"); // Now checking which dropset would destroy which bipartition for(int i = 0 ; i < numberOfBips; i++) { printf("Bip %i is %i \n",i,ind_bips[i]); } printf("Taxa Names : \n"); for(int i = 0; i < tr->mxtips + 1; i++) { printf("%s ",tr->nameList[i]); } printf("\n"); printf("Small Tree Taxa Names 0 : \n"); for(int i = 0; i < taxaPerTree[0]; i++) { int* list = smallTreeTaxaList[0]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 1 : \n"); for(int i = 0; i < taxaPerTree[1]; i++) { int* list = smallTreeTaxaList[1]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Small Tree Taxa Names 2 : \n"); for(int i = 0; i < taxaPerTree[2]; i++) { int* list = smallTreeTaxaList[2]; int taxa = list[i]; printf("%s ",tr->nameList[taxa]); } printf("\n"); printf("Number of DropSets extracted%i \n",dropSetCount); printf("Number of Bips extracted %i \n",currentBips); //Testing ... printf("Number of Sets is %i \n",numberOfSets); printf("Number of Unique Sets is %i \n",numberOfUniqueSets); printf("==> Testing bips of unique sets \n"); for(int i = 0; i < numberOfUniqueSets; i++) { printf("Bips of Set %i: ", i); for(int j = 0; j < numberOfBipsPerSet[i]; j++) { int* bips = bipsOfDropSet[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Testing bips of taxa \n"); for(int i = 1; i < tr->mxtips + 1; i++) { printf("Bips of Taxa %i: ", i); for(int j = 0; j < numberOfBipsPerTaxa[i]; j++) { int* bips = bipsOfTaxa[i]; printf("%i ", bips[j]); } printf("\n"); } printf("==> Unique Sets: "); for(int i = 0; i < numberOfUniqueSets; i++) { int j = 0; int* set = uniqSets[i]; while(set[j] > -1) { printf("%i ",set[j]); j++; } printf("; "); } printf("\n"); printf("==> setsToUniqSets: "); for(int i = 0; i < numberOfSets; i++) { printf("%i ",setsToUniqSets[i]); } printf("\n"); //=== TREE GRAPH CONSTRUCTION ENDS === printf("Scores: "); printBitVector(bvec_scores); printf("BipsPerTree: "); for(int foo = 0; foo < 
tr->numberOfTrees; foo++) { printf("%i ",bipsPerTree[foo]); } printf("\nInduced Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",ind_bips[foo]); } printf("\nSmall Tree Bips: "); for(int foo = 0;foo < numberOfBips; foo++) { printf("%u ",s_bips[foo]); } printf("\n == Sets == \n"); for(int fooo = 0; fooo < numberOfSets; fooo++){ printf("Set %i: ", fooo); int i = 0; while(sets[fooo][i] > -1) { printf("%i ",sets[fooo][i]); i++; } printf("\n"); } printf("\n"); #endif printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed); printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed); printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed); printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime); printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName); printBothOpen("execution stats:\n\n"); printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime); printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed); printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime); fclose(treeFile); fclose(rfFile); /* free the data structure used for parsing the potentially multi-furcating tree */ freeMultifurcations(smallTree); rax_free(smallTree); rax_free(taxonToLabel); rax_free(taxonToEulerIndex); rax_free(labelToTaxon); rax_free(eulerIndexToLabel); rax_free(taxonToReduction); rax_free(taxonHasDeg); }
/* permutationSH
 *
 * Draws nBootstrap bootstrap-resampled per-site weight vectors (used by the
 * SH-like test machinery).  Within every partition (model) the original site
 * weights are resampled with replacement from a uniform distribution over the
 * partition's expanded (weight-repeated) site positions, using the
 * reproducible RNG stream seeded by _randomSeed.
 *
 * @param tr           tree holding alignment meta data (endsite, models,
 *                     original per-site weights)
 * @param nBootstrap   number of bootstrap replicates to generate
 * @param _randomSeed  seed for the randum() stream (consumed locally; the
 *                     caller's seed variable is not modified)
 *
 * @return freshly allocated matrix of endsite * nBootstrap ints, stored
 *         replicate after replicate; ownership passes to the caller, who
 *         must release it with rax_free().
 */
int *permutationSH(tree *tr, int nBootstrap, long _randomSeed)
{
  int
    maxNonZero = 0;

  long
    randomSeed = _randomSeed;

  /* one weight column per replicate, zero-initialized up front */
  int
    *weightMatrix = (int*)rax_calloc(((size_t)tr->cdta->endsite) * ((size_t)nBootstrap), sizeof(int));

  /* accumulated original weight of each partition */
  int
    *partitionWeight = (int*)rax_calloc(tr->NumberOfModels, sizeof(int));

  /* sum up the weighted site count of every partition and track the largest
     one, so that a single scratch buffer can serve all partitions */
  for(int model = 0; model < tr->NumberOfModels; model++)
    {
      for(int j = 0; j < tr->cdta->endsite; j++)
        {
          if(tr->originalModel[j] == model)
            partitionWeight[model] += tr->originalWeights[j];
        }

      if(partitionWeight[model] > maxNonZero)
        maxNonZero = partitionWeight[model];
    }

  size_t
    bufferSize = ((size_t)maxNonZero) * sizeof(int);

  int
    *drawCounts = (int*)rax_malloc(bufferSize);

  for(int replicate = 0; replicate < nBootstrap; replicate++)
    {
      /* column of per-site weights belonging to this replicate */
      int
        *column = &weightMatrix[((size_t)tr->cdta->endsite) * ((size_t)replicate)];

      for(int model = 0; model < tr->NumberOfModels; model++)
        {
          int
            total = partitionWeight[model];

          /* draw `total` uniform samples over the expanded site positions
             of this partition */
          memset(drawCounts, 0, bufferSize);

          for(int j = 0; j < total; j++)
            drawCounts[(int)(total * randum(&randomSeed))]++;

          /* fold the expanded draw counts back onto the compressed sites:
             a site of original weight w consumes w consecutive slots */
          int
            pos = 0;

          for(int j = 0; j < tr->cdta->endsite; j++)
            {
              if(model == tr->originalModel[j])
                {
                  for(int w = 0; w < tr->originalWeights[j]; w++)
                    {
                      column[j] += drawCounts[pos];
                      pos++;
                    }
                }
            }
        }
    }

  rax_free(drawCounts);
  rax_free(partitionWeight);

  return weightMatrix;
}
/** @ingroup alignmentGroup
    @brief Parse the PHYLIP file body

    Reads the interleaved/sequential body of a PHYLIP alignment: on the first
    pass over the sequences (i < sequenceCount) a label token is consumed and
    stored, then the sequence fragments on the rest of the line are appended
    to the per-sequence buffers.  Subsequent passes (interleaved blocks) wrap
    around via j = i %% sequenceCount and append more fragments.  Parsing ends
    at EOF, where parsedOk() validates that every sequence reached exactly
    alignmentData->sequenceLength characters.

    NOTE(review): control flow runs through the project macros NEXT_TOKEN and
    CONSUME, which advance/skip tokens and presumably reference the `input`
    lexer handle — confirm against their definitions.

    @param alignmentData  destination alignment (labels and sequence buffers
                          pre-allocated; entries are 1-based)
    @param input          lexer input handle consumed by the token macros
    @return 1 on a fully parsed, size-consistent alignment, 0 on any
            syntax/size error (sequenceLength scratch array is freed on
            every exit path)
*/
static int parse_phylip (pllAlignmentData * alignmentData, int input)
{
  int i,j;
  pllLexToken token;
  /* per-sequence count of characters read so far (1-based, like the
     alignment's own arrays) */
  int * sequenceLength;
  int rc;

  sequenceLength = (int *) rax_calloc (alignmentData->sequenceCount + 1, sizeof (int));

  NEXT_TOKEN

  for (i = 0; ; ++i)
  {
    /* j wraps around so that interleaved blocks append to the right row */
    j = i % alignmentData->sequenceCount;

    /* first block only: read and store the sequence label */
    if (i < alignmentData->sequenceCount)
    {
      if (token.tokenType == PLL_TOKEN_EOF)
      {
        rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
        rax_free (sequenceLength);
        return (rc);
      }

      if (token.tokenType == PLL_TOKEN_UNKNOWN)
      {
        rax_free (sequenceLength);
        return (0);
      }

      CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)

      /* labels may be plain strings or look like numbers */
      if (token.tokenType != PLL_TOKEN_STRING && token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_FLOAT)
      {
        rax_free (sequenceLength);
        return (0);
      }

      alignmentData->sequenceLabels[i + 1] = strndup (token.lexeme, token.len);

      NEXT_TOKEN

      CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
    }

    /* consume sequence fragments until the end of the current line */
    while (1)
    {
      if (token.tokenType == PLL_TOKEN_EOF)
      {
        rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
        rax_free (sequenceLength);
        return (rc);
      }

      if (token.tokenType == PLL_TOKEN_UNKNOWN)
      {
        rax_free (sequenceLength);
        return (0);
      }

      if (token.tokenType == PLL_TOKEN_NEWLINE)
        break;

      if (token.tokenType != PLL_TOKEN_STRING)
      {
        rax_free (sequenceLength);
        return (0);
      }

      /* guard against sequences longer than the declared alignment width */
      if (sequenceLength[j + 1] + token.len > alignmentData->sequenceLength)
      {
        fprintf (stderr, "Sequence %d is larger than specified\n", j + 1);
        rax_free (sequenceLength);
        return (0);
      }

      memmove (alignmentData->sequenceData[j + 1] + sequenceLength[j + 1], token.lexeme, token.len);
      sequenceLength[j + 1] += token.len;

      NEXT_TOKEN

      CONSUME (PLL_TOKEN_WHITESPACE)
    }

    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
  }
}
/* plausibilityChecker
 *
 * Computes the relative Robinson-Foulds distance between one large,
 * strictly bifurcating reference tree and a collection of smaller,
 * potentially multi-furcating trees whose taxa are subsets of the
 * reference taxon set.  For every small tree, the reference tree's
 * bipartitions are pruned down (via bit masking and re-hashing) to the
 * small tree's taxa, then the shared bipartitions are counted.  One
 * distance per small tree is written to RAxML_RF-Distances.<run_id>;
 * summary statistics are printed at the end.
 *
 * NOTE(review): uses globals workdir, run_id, tree_file, bootStrapFile
 * and masterTime defined elsewhere in the project.
 */
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE
    *treeFile,
    *rfFile;

  tree
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char
    rfFileName[1024];

  /* init hash table for the big reference tree's bipartitions */
  hashtable
    *h = initHashTable(tr->mxtips * 2 * 2);

  /* init the bit vectors we need for computing and storing bipartitions during
     the tree traversal */
  unsigned int
    vLength,
    **bitVectors = initBitVector(tr, &vLength);

  int
    numberOfTreesAnalyzed = 0,
    branchCounter = 0,
    i;

  double
    avgRF = 0.0;

  /* set up an output file name */
  strcpy(rfFileName, workdir);
  strcat(rfFileName, "RAxML_RF-Distances.");
  strcat(rfFileName, run_id);

  rfFile = myfopen(rfFileName, "wb");

  assert(adef->mode == PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */
  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  /* the reference tree must be comprehensive: every taxon present */
  assert(tr->mxtips == tr->ntips);

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);

  /* extract all induced bipartitions from the big tree and store them in the
     hash table */
  bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips, vLength, h, 0, BIPARTITIONS_RF, (branchInfo *)NULL, &branchCounter, 1, FALSE, FALSE);

  /* a bifurcating unrooted tree with n tips has n - 3 non-trivial splits */
  assert(branchCounter == tr->mxtips - 3);

  /* now see how many small trees we have */
  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially multi-furcating tree */
  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */
  for(i = 0; i < tr->numberOfTrees; i++)
    {
      int
        numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      /* trees without any non-trivial split are skipped entirely */
      if(numberOfSplits > 0)
        {
          unsigned int
            entryCount = 0,
            k,
            j,
            /* scratch vector for the masked (pruned) bipartition */
            *masked = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int)),
            /* bit mask of the taxa actually present in the small tree */
            *smallTreeMask = (unsigned int *)rax_calloc(vLength, sizeof(unsigned int));

          /* hash table for the reference bipartitions pruned down to the
             small tree's taxon set */
          hashtable
            *rehash = initHashTable(tr->mxtips * 2 * 2);

          double
            rf,
            maxRF;

          int
            bCounter = 0,
            bips,
            firstTaxon,
            taxa = 0;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);

          /* compute the maximum RF distance for computing the relative RF distance
             later-on */

          /* note that here we need to pay attention, since the RF distance is not
             normalized by 2 * (n-3) but we need to account for the fact that the
             multifurcating small tree will potentially contain fewer bipartitions.
             Hence the normalization factor is obtained as 2 * numberOfSplits, where
             numberOfSplits is the number of bipartitions in the small tree. */
          maxRF = (double)(2 * numberOfSplits);

          /* now set up a bit mask where only the bits are set to one for those taxa
             that are actually present in the small tree we just read */

          /* note that I had to apply some small changes to this function to make it
             work for multi-furcating trees ! */
          setupMask(smallTreeMask, smallTree->start, smallTree->mxtips);
          setupMask(smallTreeMask, smallTree->start->back, smallTree->mxtips);

          /* now get the index of the first taxon of the small tree.
             we will use this to unambiguously store the bipartitions */
          firstTaxon = smallTree->start->number;

          /* make sure that this bit vector is set up correctly, i.e., that it
             contains as many non-zero bits as there are taxa in this small tree */
          for(j = 0; j < vLength; j++)
            taxa += BIT_COUNT(smallTreeMask[j]);
          assert(taxa == smallTree->ntips);

          /* now re-hash the big tree by applying the above bit mask */

          /* loop over hash table */
          for(k = 0, entryCount = 0; k < h->tableSize; k++)
            {
              if(h->table[k] != NULL)
                {
                  entry *e = h->table[k];

                  /* we resolve collisions by chaining, hence the loop here */
                  do
                    {
                      unsigned int
                        *bitVector = e->bitVector;

                      hashNumberType
                        position;

                      int
                        count = 0;

                      /* double check that our tree mask contains the first taxon
                         of the small tree */
                      assert(smallTreeMask[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]);

                      /* if the first taxon is set then we will re-hash the bit-wise
                         complement of the bit vector.
                         The count variable is used for a small optimization */
                      if(bitVector[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH])
                        {
                          //hash complement
                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = (~bitVector[j]) & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }
                      else
                        {
                          //hash this vector
                          for(j = 0; j < vLength; j++)
                            {
                              masked[j] = bitVector[j] & smallTreeMask[j];
                              count += BIT_COUNT(masked[j]);
                            }
                        }

                      /* note that padding the last bits is not required because they
                         are set to 0 automatically by smallTreeMask */

                      /* make sure that we will re-hash the canonic representation of
                         the bipartition where the bit for firstTaxon is set to 0! */
                      assert(!(masked[(firstTaxon - 1) / MASK_LENGTH] & mask32[(firstTaxon - 1) % MASK_LENGTH]));

                      /* only if the masked bipartition of the large tree is a
                         non-trivial bipartition (two or more bits set to 1) will we
                         re-hash it */
                      if(count > 1)
                        {
                          /* compute hash */
                          position = oat_hash((unsigned char *)masked, sizeof(unsigned int) * vLength);
                          position = position % rehash->tableSize;

                          /* re-hash to the new hash table that contains the bips of
                             the large tree, pruned down to the taxa contained in the
                             small tree */
                          insertHashPlausibility(masked, rehash, vLength, position);
                        }

                      entryCount++;

                      e = e->next;
                    }
                  while(e != NULL);
                }
            }

          /* make sure that we tried to re-hash all bipartitions of the original tree */
          assert(entryCount == (unsigned int)(tr->mxtips - 3));

          /* now traverse the small tree and count how many bipartitions it shares
             with the corresponding induced tree from the large tree */

          /* the following function also had to be modified to account for
             multi-furcating trees ! */
          bips = bitVectorTraversePlausibility(bitVectors, smallTree->start->back, smallTree->mxtips, vLength, rehash, &bCounter, firstTaxon, smallTree, TRUE);

          /* compute the relative RF */
          rf = (double)(2 * (numberOfSplits - bips)) / maxRF;

          assert(numberOfSplits >= bips);
          assert(rf <= 1.0);

          avgRF += rf;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Relative RF tree %d: %f\n\n", i, rf);

          fprintf(rfFile, "%d %f\n", i, rf);

          /* I also modified this assertion, we need to make sure here that we
             checked all non-trivial splits/bipartitions in the multi-furcating
             tree which can be less than n - 3 ! */
          assert(bCounter == numberOfSplits);

          /* free masks and hash table for this iteration */
          rax_free(smallTreeMask);
          rax_free(masked);
          freeHashTable(rehash);
          rax_free(rehash);

          numberOfTreesAnalyzed++;
        }
    }

  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);

  printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("\nFile containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  fclose(treeFile);
  fclose(rfFile);

  /* free the data structure used for parsing the potentially multi-furcating tree */
  freeMultifurcations(smallTree);
  rax_free(smallTree);

  freeBitVectors(bitVectors, 2 * tr->mxtips);
  rax_free(bitVectors);

  freeHashTable(h);
  rax_free(h);
}
/* Plausibility checker: computes the relative Robinson-Foulds (RF) distance
   between one large, strictly bifurcating reference tree (tr) and every
   (potentially multi-furcating) small tree in the bootstrap file.

   For each small tree, the reference tree is implicitly "induced" down to the
   small tree's taxon set using an Euler tour + succinct RMQ structure for fast
   LCA queries, the induced tree's bipartitions are hashed, and the number of
   shared non-trivial splits is counted.  Per-tree relative RF values are
   written to RAxML_RF-Distances.<run_id> and summary statistics are printed.

   Side effects: reads tree_file and bootStrapFile, writes the RF output file,
   prints via printBothOpen().  All temporary structures are freed on exit. */
void plausibilityChecker(tree *tr, analdef *adef)
{
  FILE
    *treeFile,
    *rfFile;

  tree
    *smallTree = (tree *)rax_malloc(sizeof(tree));

  char
    rfFileName[1024];

  int
    numberOfTreesAnalyzed = 0,   /* small trees with > 0 non-trivial splits */
    i;

  double
    avgRF = 0.0,                 /* accumulated relative RF over all analyzed trees */
    sumEffectivetime = 0.0;      /* accumulated reconstruction-step wall time */

  /* set up an output file name */

  strcpy(rfFileName, workdir);
  strcat(rfFileName, "RAxML_RF-Distances.");
  strcat(rfFileName, run_id);

  rfFile = myfopen(rfFileName, "wb");

  assert(adef->mode == PLAUSIBILITY_CHECKER);

  /* open the big reference tree file and parse it */

  treeFile = myfopen(tree_file, "r");

  printBothOpen("Parsing reference tree %s\n", tree_file);

  treeReadLen(treeFile, tr, FALSE, TRUE, TRUE, adef, TRUE, FALSE);

  /* the reference tree must be comprehensive: every taxon present */
  assert(tr->mxtips == tr->ntips);

  /*************************************************************************************/
  /* Preprocessing Step */

  double
    preprocesstime = gettime();

  /* taxonToLabel[2*tr->mxtips - 2];
     stores all 2n-2 labels from the preorder traversal:
     (Taxonnumber - 1) -> (Preorderlabel) */
  int
    *taxonToLabel  = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int)),

    /* taxonHasDeg[2*tr->mxtips - 2]
       stores the degree of every taxon; needed to extract bipartitions from
       multifurcating trees: (Taxonnumber - 1) -> (degree of node(Taxonnumber)).
       NOTE(review): calloc'd once here and re-filled per small tree by
       rec_preOrderTraversalMulti() — assumes that call overwrites every entry
       it later reads; confirm before reusing this buffer elsewhere. */
    *taxonHasDeg = (int *)rax_calloc((2*tr->mxtips - 2),sizeof(int)),

    /* taxonToReduction[2*tr->mxtips - 2];
       used for reducing the bitvector and speeding up extraction:
       (Taxonnumber - 1) -> (0..1 (increment count of taxa appearing in small tree))
       (Taxonnumber - 1) -> (0..1 (increment count of inner nodes appearing in small tree)) */
    *taxonToReduction = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));

  int
    newcount = 0; /* counter used for correct traversals (reset before each reuse) */

  /* labelToTaxon[2*tr->mxtips - 2];
     translates between Preorderlabel and p->number:
     (Preorderlabel) -> (Taxonnumber) */
  int
    *labelToTaxon = (int *)rax_malloc((2*tr->mxtips - 2) * sizeof(int));

  /* preorder traversal of the large tree */
  preOrderTraversal(tr->start->back,tr->mxtips, tr->start->number, taxonToLabel, labelToTaxon, &newcount);

  newcount = 0; /* counter set to 0 to be now used for the Euler traversal */

  /* eulerIndexToLabel[4*tr->mxtips - 5];
     stores all 4n-5 preorder labels created during the Euler tour:
     (Eulerindex) -> (Preorderlabel) */
  int*
    eulerIndexToLabel = (int *)rax_malloc((4*tr->mxtips - 5) * sizeof(int));

  /* taxonToEulerIndex[tr->mxtips];
     stores the index of the first appearance of each taxon in the Euler tour:
     (Taxonnumber - 1) -> (index of the Euler tour where Taxonnumber first appears);
     used for efficient Lowest Common Ancestor queries during the reconstruction step */
  int*
    taxonToEulerIndex  = (int *)rax_malloc((tr->mxtips) * sizeof(int));

  /* init taxonToEulerIndex and taxonToReduction to the sentinel -1 */
  int
    ix;

  for(ix = 0; ix < tr->mxtips; ++ix)
    taxonToEulerIndex[ix] = -1;

  for(ix = 0; ix < (2*tr->mxtips - 2); ++ix)
    taxonToReduction[ix] = -1;

  /* Euler traversal of the large tree */
  unrootedEulerTour(tr->start->back,tr->mxtips, eulerIndexToLabel, taxonToLabel, &newcount, taxonToEulerIndex);

  /* create the RMQ data structure for efficient retrieval of LCAs, using
     Johannes Fischer's succinct RMQ library rewritten in C
     (files rmq.h, rmqs.c, rmqs.h; included in Makefile.RMQ.gcc) */
  RMQ_succinct(eulerIndexToLabel,4*tr->mxtips - 5);

  double
    preprocessendtime = gettime() - preprocesstime;

  /* Preprocessing Step End */
  /*************************************************************************************/

  printBothOpen("The reference tree has %d tips\n", tr->ntips);

  fclose(treeFile);

  /* now see how many small trees we have */

  treeFile = getNumberOfTrees(tr, bootStrapFile, adef);

  checkTreeNumber(tr->numberOfTrees, bootStrapFile);

  /* allocate a data structure for parsing the potentially multi-furcating tree */

  allocateMultifurcations(tr, smallTree);

  /* loop over all small trees */

  for(i = 0; i < tr->numberOfTrees;  i++)
    {
      /* number of non-trivial splits in this small tree; trees with none are skipped */
      int
        numberOfSplits = readMultifurcatingTree(treeFile, smallTree, adef, TRUE);

      if(numberOfSplits > 0)
        {
          int
            firstTaxon;

          double
            rec_rf,
            maxRF;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Small tree %d has %d tips and %d bipartitions\n", i, smallTree->ntips, numberOfSplits);

          /* compute the maximum RF distance for computing the relative RF distance later on.
             Note that the RF distance is NOT normalized by 2 * (n-3) here: a multifurcating
             small tree will potentially contain fewer bipartitions, so the normalization
             factor uses numberOfSplits (the induced reference tree contributes the matching
             count, being strictly binary) */
          maxRF = (double)(2 * numberOfSplits);

          /* get the index of the first taxon of the small tree;
             used to unambiguously orient/store the bipartitions */
          firstTaxon = smallTree->start->number;

          /***********************************************************************************/
          /* Reconstruction Step */

          double
            time_start = gettime();

          /* init hashtable to store bipartitions of the induced subtree.
             Using smallTree->ntips instead of smallTree->mxtips yields faster code,
             e.g. 120 versus 128 seconds for 20,000 small trees on my laptop */
          hashtable
            *s_hash = initHashTable(smallTree->ntips * 4);

          /* smallTreeTaxa[smallTree->ntips];
             stores all taxon numbers of smallTree: (Index) -> (Taxonnumber) */
          int*
            smallTreeTaxa = (int *)rax_malloc((smallTree->ntips) * sizeof(int));

          /* counter is set to 0 for correctly extracting taxa of the small tree */
          newcount = 0;

          int
            newcount2 = 0; /* counts inner nodes; continued below when labeling LCAs */

          /* seq2[2*smallTree->ntips - 2];
             stores the preorder sequence of the reference small tree:
             (Preorderindex) -> (Taxonnumber) */
          int*
            seq2 = (int *)rax_malloc((2*smallTree->ntips - 2) * sizeof(int));

          /* length of the bitvector, in unsigned ints */
          unsigned int
            vectorLength;

          /* extract all taxa of the small tree into an array; also store counts of
             taxa and non-taxa in taxonToReduction (both directions from start node) */
          rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start, smallTree->mxtips, &newcount, &newcount2);

          rec_extractTaxa(smallTreeTaxa, taxonToReduction, smallTree->start->back, smallTree->mxtips, &newcount, &newcount2);

          /* counter is set to 0 to correctly preorder-traverse the small tree */
          newcount = 0;

          /* preorder traversal of the small tree; saves its sequence into seq2 for
             later extraction of the bipartitions, and records each node's degree */
          rec_preOrderTraversalMulti(smallTree->start->back,smallTree->mxtips, smallTree->start->number, seq2, taxonHasDeg, &newcount);

          /* calculate the bitvector length (ceil(ntips / MASK_LENGTH)) */
          if(smallTree->ntips % MASK_LENGTH == 0)
            vectorLength = smallTree->ntips / MASK_LENGTH;
          else
            vectorLength = 1 + (smallTree->ntips / MASK_LENGTH);

          unsigned int
            **bitVectors = rec_initBitVector(smallTree, vectorLength);

          /* store all non-trivial bitvectors using a subtree approach for the induced
             subtree and insert them into the hashtable; this method was changed to
             support multifurcations */
          rec_extractBipartitionsMulti(bitVectors, seq2, newcount,tr->mxtips, vectorLength, smallTree->ntips,
                                       firstTaxon, s_hash, taxonToReduction, taxonHasDeg, numberOfSplits);

          /* counter is set to 0 to be used for correctly storing all Euler indices */
          newcount = 0;

          /* smallTreeTaxonToEulerIndex[smallTree->ntips];
             first Euler index of every taxon appearing in the small tree:
             (Index) -> (index of the Euler tour where the taxon first appears) */
          int*
            smallTreeTaxonToEulerIndex = (int *)rax_malloc((smallTree->ntips) * sizeof(int));

          /* seq[(smallTree->ntips*2) - 1];
             stores the preorder sequence of the induced small tree
             (ntips leaf labels followed by ntips-1 LCA labels, then sorted) */
          int*
            seq = (int *)rax_malloc((2*smallTree->ntips - 1) * sizeof(int));

          /* iterate through all small tree taxa */
          for(ix = 0; ix < smallTree->ntips; ix++)
            {
              int
                taxanumber = smallTreeTaxa[ix];

              /* filter taxonToEulerIndex for taxa present in the small tree */
              smallTreeTaxonToEulerIndex[newcount] = taxonToEulerIndex[taxanumber-1];

              /* save the preorder label of each small-tree taxon in seq */
              seq[newcount] = taxonToLabel[taxanumber-1];

              newcount++;
            }

          /* sort the Euler indices to correctly calculate the LCAs */
          //quicksort(smallTreeTaxonToEulerIndex,0,newcount - 1);

          qsort(smallTreeTaxonToEulerIndex, newcount, sizeof(int), sortIntegers);

          //printf("newcount2 %i \n", newcount2);

          /* iterate through consecutive pairs of small-tree taxa */
          for(ix = 1; ix < newcount; ix++)
            {
              /* query the LCA of each adjacent pair via the RMQ data structure;
                 these LCAs are exactly the inner nodes of the induced subtree */
              seq[newcount - 1 + ix] = eulerIndexToLabel[query(smallTreeTaxonToEulerIndex[ix - 1],smallTreeTaxonToEulerIndex[ix])];

              /* used for dynamic programming: assign every inner node an index.
                 E.g. if the reference tree has 3 inner nodes saved as 0,1,2 and we
                 compute 5 LCAs (also inner nodes), we mark them as 3,4,5,6,7 */
              taxonToReduction[labelToTaxon[seq[newcount - 1 + ix]] - 1] = newcount2;

              newcount2 += 1;
            }

          /* sort to construct the preorder sequence of the induced subtree */
          //quicksort(seq,0,(2*smallTree->ntips - 2));

          qsort(seq, (2 * smallTree->ntips - 2) + 1, sizeof(int), sortIntegers);

          /* calculate all bipartitions of the reference small tree and count how many
             it shares with the induced small tree */
          int
            rec_bips = rec_findBipartitions(bitVectors, seq,(2*smallTree->ntips - 1), labelToTaxon, tr->mxtips,
                                            vectorLength, smallTree->ntips, firstTaxon, s_hash, taxonToReduction);

          /* Reconstruction Step End */
          /***********************************************************************************/

          double
            effectivetime = gettime() - time_start;

          /*
            if(numberOfTreesAnalyzed % 100 == 0)
              printBothOpen("Reconstruction time: %.10f secs\n\n", effectivetime);
          */

          /* compute the relative RF */

          rec_rf = (double)(2 * (numberOfSplits - rec_bips)) / maxRF;

          assert(numberOfSplits >= rec_bips);

          avgRF += rec_rf;
          sumEffectivetime += effectivetime;

          if(numberOfTreesAnalyzed % 100 == 0)
            printBothOpen("Relative RF tree %d: %f\n\n", i, rec_rf);

          fprintf(rfFile, "%d %f\n", i, rec_rf);

          /* free bitvectors and hash table for this iteration */

          rec_freeBitVector(smallTree, bitVectors);
          rax_free(bitVectors);

          freeHashTable(s_hash);
          rax_free(s_hash);

          rax_free(smallTreeTaxa);
          rax_free(seq);
          rax_free(seq2);
          rax_free(smallTreeTaxonToEulerIndex);

          numberOfTreesAnalyzed++;
        }
    }

  /* NOTE(review): if every small tree is skipped, numberOfTreesAnalyzed is 0 and
     the averages below divide by zero (printing inf/nan) — confirm callers always
     supply at least one usable tree */
  printBothOpen("Number of small trees skipped: %d\n\n", tr->numberOfTrees - numberOfTreesAnalyzed);

  printBothOpen("Average RF distance %f\n\n", avgRF / (double)numberOfTreesAnalyzed);

  printBothOpen("Large Tree: %i, Number of SmallTrees analyzed: %i \n\n", tr->mxtips, numberOfTreesAnalyzed);

  printBothOpen("Total execution time: %f secs\n\n", gettime() - masterTime);

  printBothOpen("File containing all %d pair-wise RF distances written to file %s\n\n", numberOfTreesAnalyzed, rfFileName);

  printBothOpen("execution stats:\n\n");
  printBothOpen("Accumulated time Effective algorithm: %.5f sec \n", sumEffectivetime);
  printBothOpen("Average time for effective: %.10f sec \n",sumEffectivetime / (double)numberOfTreesAnalyzed);
  printBothOpen("Preprocessingtime: %0.5f sec \n\n", preprocessendtime);

  fclose(treeFile);
  fclose(rfFile);

  /* free the data structure used for parsing the potentially multi-furcating tree */

  freeMultifurcations(smallTree);
  rax_free(smallTree);

  rax_free(taxonToLabel);
  rax_free(taxonToEulerIndex);
  rax_free(labelToTaxon);
  rax_free(eulerIndexToLabel);
  rax_free(taxonToReduction);
  rax_free(taxonHasDeg);
}