/** @ingroup alignmentGroup @brief Parse a PHYLIP file Parses the PHYLIP file \a filename and returns a ::pllAlignmentData structure with the alignment. @param filename Name of file to be parsed @return Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL in case of failure. */ static pllAlignmentData * pllParsePHYLIP (const char * filename) { int i, input, sequenceCount, sequenceLength; char * rawdata; long filesize; pllAlignmentData * alignmentData; rawdata = pllReadFile (filename, &filesize); if (!rawdata) { errno = PLL_ERROR_FILE_OPEN; return (NULL); } init_lexan (rawdata, filesize); input = get_next_symbol(); /* parse the header to obtain the number of taxa and sequence length */ if (!read_phylip_header (&input, &sequenceCount, &sequenceLength)) { rax_free (rawdata); fprintf (stderr, "Error while parsing PHYLIP header (number of taxa and sequence length)\n"); errno = PLL_ERROR_PHYLIP_HEADER_SYNTAX; return (NULL); } lex_table_amend_phylip(); /* allocate alignment structure */ alignmentData = pllInitAlignmentData (sequenceCount, sequenceLength); if (! parse_phylip (alignmentData, input)) { errno = PLL_ERROR_PHYLIP_BODY_SYNTAX; pllAlignmentDataDestroy (alignmentData); lex_table_restore(); rax_free (rawdata); return (NULL); } lex_table_restore(); rax_free (rawdata); alignmentData->siteWeights = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int)); for (i = 0; i < alignmentData->sequenceLength; ++ i) alignmentData->siteWeights[i] = 1; return (alignmentData); }
/*
 * Evaluate all models of the given model space one after another.
 *
 * The alignment is read once from dataset_file. For every model delivered by
 * next_model() a fresh partition list and PLL instance are created, the model
 * parameters are optimized (timed with the TIME_* macros), the information
 * criteria are computed and merged into the overall result, and one row is
 * printed to the statistics output. Afterwards the summary is printed and the
 * overall result is handed to evaluate_result().
 *
 * dataset_file: path handed to read_alignment_data()
 * config:       supplies base_freq_kind and attr_model_eval and is forwarded
 *               to calculate_model_ICs() and evaluate_result()
 * model_space:  iterated with next_model(); matrix_count sizes the per-model
 *               statistics array, matrix_index selects the slot to fill
 *
 * Always returns 0.
 */
int run_sequential( char *dataset_file, pltb_config_t *config, model_space_t *model_space )
{
	FILE *out = DEBUG_PROCESS_STATISTICS_OPEN_OUTPUT;
	pltb_model_stat_t stats[model_space->matrix_count];
	TIME_STRUCT_INIT(timer);

	fprint_eval_header(out);

	pllAlignmentData *alignment = read_alignment_data(dataset_file);

	/* Start every information criterion at the largest float value. */
	pltb_result_t result;
	unsigned criterion = 0;
	while (criterion < IC_MAX) {
		result.ic[criterion] = FLT_MAX;
		criterion++;
	}

	for (;;) {
		if (!next_model(model_space)) {
			break;
		}

		partitionList *partitions = init_partitions(alignment, config->base_freq_kind);
		pllInstance *instance = setup_instance(model_space->matrix_repr, &config->attr_model_eval, alignment, partitions);

		/* Statistics slot that belongs to the current model. */
		pltb_model_stat_t *current = &stats[model_space->matrix_index];
		current->matrix_index = model_space->matrix_index;

		/* Only the parameter optimization itself is timed. */
		TIME_START(timer);
		optimize_model_parameters(instance, partitions);
		TIME_END(timer);

		current->time_cpu = TIME_CPU(timer);
		current->time_real = TIME_REAL(timer);
		current->likelihood = instance->likelihood;

		calculate_model_ICs(current, alignment, instance, model_space->free_parameter_count, config);
		merge_into_result(&result, current, model_space->matrix_index);

		fprint_eval_row(out, model_space, current);

		/* Partitions and instance are per-model and released every round. */
		pllPartitionsDestroy(instance, &partitions);
		pllDestroyInstance(instance);
	}

	fprint_eval_summary(out, model_space, &stats, &result);
	DEBUG_PROCESS_STATISTICS_CLOSE_OUTPUT(out);

	evaluate_result(model_space, &result, alignment, config);

	pllAlignmentDataDestroy(alignment);
	return 0;
}
static void testProteinStuff() { pllAlignmentData * alignmentData; pllInstance * tr; pllNewickTree * newick; partitionList * partitions; struct pllQueue * parts; int i; for(i = 0; i < 5; i++) { //write a simple partition file with 3 partitions //for dataset dna.phy.dat contained //in this source directory FILE *f = fopen("proteinPartitions", "w"); switch(i) { case 0: fprintf(f, "WAG, p1 = 1-200\n"); fprintf(f, "WAG, p2 = 201-600\n"); fprintf(f, "WAG, p3 = 601-1104\n"); break; case 1: fprintf(f, "LG, p1 = 1-200\n"); fprintf(f, "LG, p2 = 201-600\n"); fprintf(f, "LG, p3 = 601-1104\n"); break; case 2: fprintf(f, "JTT, p1 = 1-200\n"); fprintf(f, "JTT, p2 = 201-600\n"); fprintf(f, "JTT, p3 = 601-1104\n"); break; case 3: case 4: fprintf(f, "GTR, p1 = 1-200\n"); fprintf(f, "GTR, p2 = 201-600\n"); fprintf(f, "GTR, p3 = 601-1104\n"); break; default: assert(0); } fclose(f); tr = pllCreateInstance (GAMMA, PLL_FALSE, PLL_FALSE, PLL_FALSE, 12345); alignmentData = pllParsePHYLIP ("prot.phy"); /* or alternatively, parse a FASTA file */ // alignmentData = pllParseFASTA ("prot.phy"); newick = pllNewickParseFile("parsimonyTree"); parts = pllPartitionParse ("proteinPartitions"); /* Validate the partitions */ if (!pllPartitionsValidate (parts, alignmentData)) { fprintf (stderr, "Error: Partitions do not cover all sites\n"); return (EXIT_FAILURE); } /* commit the partitions and build a partitions structure */ partitions = pllPartitionsCommit (parts, alignmentData); /* destroy the intermedia partition queue structure */ pllQueuePartitionsDestroy (&parts); /* eliminate duplicate sites from the alignment and update weights vector */ pllPhylipRemoveDuplicate (alignmentData, partitions); pllTreeInitTopologyNewick (tr, newick, PLL_TRUE); if (!pllLoadAlignment (tr, alignmentData, partitions, PLL_DEEP_COPY)) { fprintf (stderr, "Incompatible tree/alignment combination\n"); return (EXIT_FAILURE); } //pllInitModel(tr, PLL_TRUE, alignmentData, partitions); pllInitModel(tr, alignmentData, 
partitions); switch(i) { case 0: //all params unlinked pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); break; case 1: //link params in another way pllLinkAlphaParameters("0,0,0", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); break; case 2: //link params in yet another way pllLinkAlphaParameters("0,0,0", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,0", partitions); break; case 3: //also fiddle around with the Q matrices, make them to be non-GTR, but simpler pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); //these are GTR models //pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,5", partitions, 0); //pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,5", partitions, 1); //this is a simpler model with 5 parameters, parameter a and f have //the same value //pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,0", partitions, 2); break; case 4: { //test case to show how the model parameters can be set to fixed values // set up arrays of user-defined base frequencies // and a user defined q matrix double f[4] = {0.25, 0.25, 0.25, 0.25}, q[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 0.5}; //unlink alpha parameters base frequencies and Q matrices //across all partitions pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,0,0", partitions); //set alpha to a fixed value of 1.0 for partition 0 and //parition 1 //pllSetFixedAlpha(1.0, 0, partitions, tr); //pllSetFixedAlpha(1.0, 1, partitions, tr); //fix the base frequencies to 0.25 for //partitions 0 and 1 //pllSetFixedBaseFrequencies(f, 4, 0, partitions, tr); //pllSetFixedBaseFrequencies(f, 4, 1, partitions, tr); //set the Q matrix to fixed values for partition //0 //pllSetFixedSubstitutionMatrix(q, 6, 0, partitions, tr); } break; default: assert(0); } evaluateGeneric(tr, 
partitions, tr->start, PLL_TRUE, PLL_FALSE); printf("%f \n", tr->likelihood); pllOptimizeModelParameters(tr, partitions, 1.0); //print the model parameters //printModelParameters(partitions); printf("%f \n", tr->likelihood); //cleanup pllAlignmentDataDestroy (alignmentData); pllNewickParseDestroy (&newick); pllPartitionsDestroy (tr, &partitions); pllTreeDestroy (tr); } }
int main (int argc, char * argv[]) { pllAlignmentData *alignmentData1, *alignmentData2; pllInstance * tr, *tr2; pllNewickTree * newick; partitionList * partitions, *partitions2; struct pllQueue * parts; int i; if (argc != 4) { fprintf (stderr, "usage: %s [phylip-file] [newick-file] [partition-file]\n", argv[0]); return (EXIT_FAILURE); } /* Create a PLL tree */ tr = pllCreateInstance (GAMMA, PLL_FALSE, PLL_FALSE, PLL_FALSE, 12345); tr2 = pllCreateInstance (GAMMA, PLL_FALSE, PLL_FALSE, PLL_FALSE, 12345); /* Parse a PHYLIP file */ alignmentData1= pllParsePHYLIP (argv[1]); alignmentData2 = pllParsePHYLIP (argv[1]); if (!alignmentData1) { fprintf (stderr, "Error while parsing %s\n", argv[1]); return (EXIT_FAILURE); } /* Parse a NEWICK file */ newick = pllNewickParseFile (argv[2]); if (!newick) { fprintf (stderr, "Error while parsing newick file %s\n", argv[2]); return (EXIT_FAILURE); } if (!pllValidateNewick (newick)) /* check whether the valid newick tree is also a tree that can be processed with our nodeptr structure */ { fprintf (stderr, "Invalid phylogenetic tree\n"); return (EXIT_FAILURE); } /* Parse the partitions file into a partition queue structure */ parts = pllPartitionParse (argv[3]); /* Validate the partitions */ if (!pllPartitionsValidate (parts, alignmentData1)) { fprintf (stderr, "Error: Partitions do not cover all sites\n"); return (EXIT_FAILURE); } /* commit the partitions and build a partitions structure */ partitions = pllPartitionsCommit (parts, alignmentData1); partitions2 = pllPartitionsCommit (parts, alignmentData2); /* destroy the intermedia partition queue structure */ pllQueuePartitionsDestroy (&parts); /* eliminate duplicate sites from the alignment and update weights vector */ pllPhylipRemoveDuplicate (alignmentData1, partitions); pllPhylipRemoveDuplicate (alignmentData2, partitions2); /* Set the topology of the PLL tree from a parsed newick tree */ //pllTreeInitTopologyNewick (tr, newick, PLL_TRUE); /* Or instead of the previous function 
use the next commented line to create a random tree topology pllTreeInitTopologyRandom (tr, phylip->nTaxa, phylip->label); */ pllTreeInitTopologyForAlignment(tr, alignmentData1); /* Connect the alignment with the tree structure */ if (!pllLoadAlignment (tr, alignmentData1, partitions, PLL_DEEP_COPY)) { fprintf (stderr, "Incompatible tree/alignment combination\n"); return (EXIT_FAILURE); } /* Initialize the model TODO: Put the parameters in a logical order and change the TRUE to flags */ pllInitModel(tr, alignmentData1, partitions); /* TODO transform into pll functions !*/ /* allocateParsimonyDataStructures(tr, partitions); pllMakeParsimonyTreeFast(tr, partitions); pllFreeParsimonyDataStructures(tr, partitions); */ pllComputeRandomizedStepwiseAdditionParsimonyTree(tr, partitions); pllTreeToNewick (tr->tree_string, tr, partitions, tr->start->back, PLL_TRUE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE); printf ("Tree: %s %d\n", tr->tree_string, tr->start->number); evaluateGeneric(tr, partitions, tr->start, PLL_TRUE, PLL_FALSE); double firstTree = tr->likelihood; printf("%f \n", tr->likelihood); //computeBIGRAPID_Test(tr, partitions, PLL_TRUE); printf("final like %f\n", tr->likelihood); //pllInitModel(tr, PLL_TRUE, phylip, partitions); pllTreeInitTopologyNewick (tr2, newick, PLL_TRUE); if (!pllLoadAlignment (tr2, alignmentData2, partitions2, PLL_DEEP_COPY)) { fprintf (stderr, "Incompatible tree/alignment combination\n"); return (EXIT_FAILURE); } pllInitModel(tr2, alignmentData2, partitions2); pllTreeToNewick (tr2->tree_string, tr2, partitions2, tr2->start->back, PLL_TRUE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE); printf ("Tree: %s %d\n", tr2->tree_string, tr2->start->number); evaluateGeneric(tr2, partitions2, tr2->start, PLL_TRUE, PLL_FALSE); printf("%f \n", tr2->likelihood); double secondTree = tr2->likelihood; assert(firstTree == secondTree); pllOptimizeModelParameters(tr2, partitions2, 
10.0); printf("%f \n", tr2->likelihood); pllAlignmentDataDestroy (alignmentData1); pllNewickParseDestroy (&newick); pllPartitionsDestroy (tr, &partitions); pllTreeDestroy (tr); pllAlignmentDataDestroy (alignmentData2); pllPartitionsDestroy (&partitions2, tr2->mxtips); pllTreeDestroy (tr2); for(i = 0; i < 5; i++) { //write a simple partition file with 3 partitions //for dataset dna.phy.dat contained //in this source directory FILE *f = fopen("dummy", "w"); fprintf(f, "DNA, p1 = 1-200\n"); fprintf(f, "DNA, p1 = 201-400\n"); fprintf(f, "DNA, p1 = 401-705\n"); fclose(f); tr = pllCreateInstance (GAMMA, PLL_FALSE, PLL_FALSE, PLL_FALSE, 12345); alignmentData1= pllParsePHYLIP (argv[1]); newick = pllNewickParseFile (argv[2]); parts = pllPartitionParse ("dummy"); /* Validate the partitions */ if (!pllPartitionsValidate (parts, alignmentData1)) { fprintf (stderr, "Error: Partitions do not cover all sites\n"); return (EXIT_FAILURE); } /* commit the partitions and build a partitions structure */ partitions = pllPartitionsCommit (parts, alignmentData1); /* destroy the intermedia partition queue structure */ pllQueuePartitionsDestroy (&parts); /* eliminate duplicate sites from the alignment and update weights vector */ pllPhylipRemoveDuplicate (alignmentData1, partitions); pllTreeInitTopologyNewick (tr, newick, PLL_TRUE); if (!pllLoadAlignment (tr, alignmentData1, partitions, PLL_DEEP_COPY)) { fprintf (stderr, "Incompatible tree/alignment combination\n"); return (EXIT_FAILURE); } pllInitModel(tr, alignmentData1, partitions); switch(i) { case 0: //link params in one way pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); break; case 1: //link params in another way pllLinkAlphaParameters("0,0,0", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); break; case 2: //link params in yet another way pllLinkAlphaParameters("0,0,0", partitions); pllLinkFrequencies("0,1,2", 
partitions); pllLinkRates("0,1,0", partitions); break; case 3: //also fiddle around with the Q matrices, make them to be non-GTR, but simpler pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,1,2", partitions); pllLinkRates("0,1,2", partitions); //these are GTR models pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,5", partitions, 0); pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,5", partitions, 1); //this is a simpler model with 5 parameters, parameter a and f have //the same value pllSetSubstitutionRateMatrixSymmetries("0,1,2,3,4,0", partitions, 2); break; case 4: { //test case to show how the model parameters can be set to fixed values // set up arrays of user-defined base frequencies // and a user defined q matrix double f[4] = {0.25, 0.25, 0.25, 0.25}, q[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 0.5}; //unlink alpha parameters base frequencies and Q matrices //across all partitions pllLinkAlphaParameters("0,1,2", partitions); pllLinkFrequencies("0,0,1", partitions); pllLinkRates("0,1,2", partitions); //set alpha to a fixed value of 1.0 for partition 0 and //parition 1 pllSetFixedAlpha(1.0, 0, partitions, tr); pllSetFixedAlpha(1.0, 1, partitions, tr); //fix the base frequencies to 0.25 for //partitions 0 and 1 pllSetFixedBaseFrequencies(f, 4, 0, partitions, tr); pllSetFixedBaseFrequencies(f, 4, 1, partitions, tr); //set the Q matrix to fixed values for partition //0 pllSetFixedSubstitutionMatrix(q, 6, 0, partitions, tr); } break; default: assert(0); } evaluateGeneric(tr, partitions, tr->start, PLL_TRUE, PLL_FALSE); printf("%f \n", tr->likelihood); pllOptimizeModelParameters(tr, partitions, 10.0); //print the model parameters printModelParameters(partitions); printf("%f \n", tr->likelihood); //cleanup pllAlignmentDataDestroy (alignmentData1); pllNewickParseDestroy (&newick); pllPartitionsDestroy (&partitions, tr->mxtips); pllTreeDestroy (tr); } testProteinStuff(); return (EXIT_SUCCESS); }
int main (int argc, char * argv[]) { pllAlignmentData * alignmentData; pllInstance * tr; pllNewickTree * newick; partitionList * partitions; pllQueue * partitionInfo; int i; pllInstanceAttr attr; pllRearrangeList * rearrangeList; #ifdef _FINE_GRAIN_MPI pllInitMPI (&argc, &argv); #endif if (argc < 4) { fprintf (stderr, "usage: %s [phylip-file] [newick-file] [partition-file] [threads]\n", argv[0]); return (EXIT_FAILURE); } /* Set the PLL instance attributes */ attr.rateHetModel = PLL_GAMMA; attr.fastScaling = PLL_FALSE; attr.saveMemory = PLL_FALSE; attr.useRecom = PLL_FALSE; attr.randomNumberSeed = 0xDEADBEEF; attr.numberOfThreads = (argc > 4) ? (atoi(argv[4]) > 0 ? atoi(argv[4]) : 8) : 8; /* This only affects the pthreads version */ /* Create a PLL tree */ tr = pllCreateInstance (&attr); /* Parse a PHYLIP file */ alignmentData = pllParseAlignmentFile (PLL_FORMAT_PHYLIP, argv[1]); if (!alignmentData) { fprintf (stderr, "Error while parsing %s\n", argv[1]); return (EXIT_FAILURE); } /* Parse a NEWICK file */ newick = pllNewickParseFile (argv[2]); if (!newick) { fprintf (stderr, "Error while parsing newick file %s\n", argv[2]); return (EXIT_FAILURE); } if (!pllValidateNewick (newick)) /* check whether the valid newick tree is also a tree that can be processed with our nodeptr structure */ { fprintf (stderr, "Invalid phylogenetic tree\n"); printf ("%d\n", errno); //return (EXIT_FAILURE); } /* Parse the partitions file into a partition queue structure */ partitionInfo = pllPartitionParse (argv[3]); /* Validate the partitions */ if (!pllPartitionsValidate (partitionInfo, alignmentData)) { fprintf (stderr, "Error: Partitions do not cover all sites\n"); return (EXIT_FAILURE); } /* Commit the partitions and build a partitions structure */ partitions = pllPartitionsCommit (partitionInfo, alignmentData); /* We don't need the the intermedia partition queue structure anymore */ pllQueuePartitionsDestroy (&partitionInfo); /* eliminate duplicate sites from the alignment and update 
weights vector */ pllAlignmentRemoveDups (alignmentData, partitions); /* Set the topology of the PLL tree from a parsed newick tree */ pllTreeInitTopologyNewick (tr, newick, PLL_TRUE); /* Or instead of the previous function use the next commented line to create a random tree topology pllTreeInitTopologyRandom (tr, alignmentData->sequenceCount, alignmentData->sequenceLabels); */ /* Connect the alignment and partition structure with the tree structure */ if (!pllLoadAlignment (tr, alignmentData, partitions)) { fprintf (stderr, "Incompatible tree/alignment combination\n"); return (EXIT_FAILURE); } /* Initialize the model. Note that this function will also perform a full tree traversal and evaluate the likelihood of the tree. Therefore, you have the guarantee that tr->likelihood the valid likelihood */ pllInitModel(tr, partitions); pllOptimizeBranchLengths (tr, partitions, 64); printf ("Log-likelihood of topology: %f\n", tr->likelihood); /* Create a list that will hold information for at most 20 rearrangement moves */ rearrangeList = pllCreateRearrangeList (20); /* The next flag specifies that PLL optimizes the length of the new branch that is created by an SPR move */ tr->thoroughInsertion = PLL_FALSE; /* Note that the following commands will fill the list with at most 20 SPR and NNI rearrangement moves, i.e. the best 20 will appear in the list */ printf ("Computing the best 20 SPR and NNI rearrangements in radius (1,20)\n"); pllRearrangeSearch (tr, partitions, PLL_REARRANGE_SPR, tr->nodep[tr->mxtips + 1], 1, 20, rearrangeList); pllRearrangeSearch (tr, partitions, PLL_REARRANGE_NNI, tr->nodep[tr->mxtips + 1], 1, 20, rearrangeList); printf ("Number of computed rearrangements: %d\n", rearrangeList->entries); printf ("------------------------------------\n"); for (i = 0; i < rearrangeList->entries; ++ i) { printf ("%2d Type: %s Log-likelihood: %f\n", i, rearrangeList->rearr[i].rearrangeType == PLL_REARRANGE_SPR ? 
"SPR" : "NNI", rearrangeList->rearr[i].likelihood); } printf ("Committing move 0\n"); pllRearrangeCommit(tr, partitions, &(rearrangeList->rearr[0]), PLL_TRUE); pllEvaluateLikelihood (tr, partitions, tr->start, PLL_TRUE, PLL_FALSE); printf ("New log-likelihood: %f\n\n", tr->likelihood); /* We don't need the rearrange list anymore */ pllDestroyRearrangeList (&rearrangeList); /* Now let's create another list and compute 30 rearrangement moves */ rearrangeList = pllCreateRearrangeList (30); /* The next flag specifies that the length of the new branch that is created by an SPR move need not be optimized */ tr->thoroughInsertion = PLL_TRUE; printf ("Computing the best 30 SPR in radius (1,30)\n"); pllRearrangeSearch (tr, partitions, PLL_REARRANGE_SPR, tr->nodep[tr->mxtips + 1], 1, 30, rearrangeList); printf ("Number of computed rearrangements: %d\n", rearrangeList->entries); printf ("------------------------------------\n"); for (i = 0; i < rearrangeList->entries; ++ i) { printf ("%2d Type: SPR Likelihood: %f\n", i, rearrangeList->rearr[i].likelihood); } printf ("Committing rearrangeList->rearr[0]\n"); pllRearrangeCommit (tr, partitions, &(rearrangeList->rearr[0]), PLL_TRUE); pllEvaluateLikelihood (tr, partitions, tr->start, PLL_FALSE, PLL_FALSE); printf ("New log-likelihood: %f\n\n", tr->likelihood); /* Rolling back to the previous topology. Note that if we evaluate the likelihood with a partial traversal we might get an invalid log likelihood. This is due to the fact that the likelihood vectors no longer correspond to the old topology, hence we need to do full traversal. 
I left the partial traversal here as an example */ printf ("Rolling back...\n"); pllRearrangeRollback (tr, partitions); pllEvaluateLikelihood (tr, partitions, tr->start, PLL_FALSE, PLL_FALSE); printf ("New log-likelihood: %f\n\n", tr->likelihood); /* We do one more rollback to get to the original topology, but this time we do a full traversal to fix the log-likelihood to the correct value plus we do branch-length optimization */ printf ("Rolling back...\n"); pllRearrangeRollback (tr, partitions); pllEvaluateLikelihood (tr, partitions, tr->start, PLL_TRUE, PLL_FALSE); pllOptimizeBranchLengths (tr, partitions, 64); printf ("New log-likelihood: %f\n\n", tr->likelihood); /* DEallocate the rearrange list */ pllDestroyRearrangeList (&rearrangeList); /* Do some cleanup */ pllAlignmentDataDestroy (alignmentData); pllNewickParseDestroy (&newick); pllPartitionsDestroy (tr, &partitions); pllDestroyInstance (tr); return (EXIT_SUCCESS); }