static void test_penalize_0(CuTest *testCase) {
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1      0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG\n",
                                                     observedList);
    stHash *expectedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca-----\n"
                                                     "s name.chr1      0 15 +       100 ATGT---ATGCCGNNNNN\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG-----\n",
                                                     expectedList);
    penalize(observedHash, "name.chr1", 5);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
}
Example #2
/* write a malnSet to a MAF file  */
void malnSet_writeMaf(struct malnSet *malnSet, char *mafFileName) {
    malnSet_assert(malnSet);
    stList *sorted = buildRootSorted(malnSet);

    FILE *mafFh = mustOpen(mafFileName, "w");
    mafWriteStart(mafFh, NULL);
    stListIterator *iter = stList_getIterator(sorted);
    struct malnBlk *blk;
    while ((blk = stList_getNext(iter)) != NULL) {
        writeBlkToMaf(blk, mafFh);
    }
    stList_destructIterator(iter);
    mafWriteEnd(mafFh);
    carefulClose(&mafFh);
    stList_destruct(sorted);
}
Example #3
/* clone the root. */
static stTree *subrangeCloneRoot(stTree *srcRoot, struct malnCompCompMap *srcDestCompMap) {
    // Clone the root; if it was deleted, there can be only one child (due to the way
    // the trees are constructed).
    stList *pendingSubtrees = stList_construct();
    stTree *destRoot = subrangeCloneNode(srcRoot, srcDestCompMap, pendingSubtrees);
    if (destRoot == NULL) {
        if (stList_length(pendingSubtrees) > 1) {
            struct mafTreeNodeCompLink *srcNcLink = getNodeCompLink(srcRoot);
            errAbort("deleted tree root %s (component: %s:%d-%d/%c)) has more that one child", stTree_getLabel(srcRoot), srcNcLink->comp->seq->orgSeqName, srcNcLink->comp->start, srcNcLink->comp->end, srcNcLink->comp->strand);
        } else if (stList_length(pendingSubtrees) == 1) {
            destRoot = stList_pop(pendingSubtrees);
        }
    }
    stList_destruct(pendingSubtrees);
    return destRoot;
}
Example #4
static void test_stSet_removeAndFreeKey(CuTest* testCase) {
    stSet *set2 = stSet_construct2(free);
    stList *keys = stList_construct();
    int64_t keyNumber = 1000;
    for (int64_t i = 0; i < keyNumber; i++) {
        int64_t *key = st_malloc(sizeof(*key));
        stList_append(keys, key);
        stSet_insert(set2, key);
    }
    for (int64_t i = 0; i < keyNumber; i++) {
        int64_t *key = stList_get(keys, i);
        CuAssertPtrEquals(testCase, key, stSet_removeAndFreeKey(set2, key));
    }
    CuAssertIntEquals(testCase, 0, stSet_size(set2));
    stSet_destruct(set2);
    stList_destruct(keys);
}
/*
 * A thread is trivial if all the segments it contains come from blocks containing only a reference segment;
 * these reference-only segments represent scaffold gaps. As a side effect, this function also processes the thread
 * string to remove the boolean flags used to indicate whether each segment is trivial.
 */
static bool isTrivialString(char **threadString) {
    stList *strings = stString_split(*threadString); //Split the thread string into its individual segment strings.
    bool trivialString = 1;
    for(int64_t i=0; i<stList_length(strings); i++) {
        char *segmentString = stList_get(strings, i);
        int64_t j = strlen(segmentString)-1; //Location of the boolean value within a segment.
        assert(j > 0);
        assert(segmentString[j] == '0' || segmentString[j] == '1');
        if(segmentString[j] == '1') { //Found a non-trivial segment, hence the thread is non-trivial.
            trivialString = 0;
        }
        segmentString[j] = '\0';
    }
    free(*threadString); //Free old thread string. Doing it this way is a bit more memory efficient, as we don't keep two copies of the string around.
    *threadString = stString_join2("", strings); //Concatenation makes one sequence, now without the booleans.
    stList_destruct(strings);
    return trivialString;
}
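A minimal usage sketch of isTrivialString follows (the thread string is hypothetical, and since the function is static the call would have to live in the same file). Each whitespace-separated segment carries a trailing '0'/'1' flag, and the function strips the flags while reporting whether every segment was trivial:

char *threadString = stString_copy("ACGT0 GGCC1 TTAA0"); //Hypothetical encoded thread.
bool trivial = isTrivialString(&threadString);
//trivial is now false, because the middle segment carried a '1' flag, and
//threadString now reads "ACGTGGCCTTAA"; the caller still owns and frees it.
free(threadString);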
Example #6
void cactusDisk_destruct(CactusDisk *cactusDisk) {
    Flower *flower;
    MetaSequence *metaSequence;

    while ((flower = stSortedSet_getFirst(cactusDisk->flowers)) != NULL) {
        flower_destruct(flower, FALSE);
    }
    stSortedSet_destruct(cactusDisk->flowers);

    stSortedSet_destruct(cactusDisk->flowerNamesMarkedForDeletion);

    while ((metaSequence = stSortedSet_getFirst(cactusDisk->metaSequences)) != NULL) {
        metaSequence_destruct(metaSequence);
    }
    stSortedSet_destruct(cactusDisk->metaSequences);

    //close DB
    stKVDatabase_destruct(cactusDisk->database);

    //Close the sequences files.
    if (cactusDisk->storeSequencesInAFile) {
        free(cactusDisk->sequencesFileName);
        free(cactusDisk->absSequencesFileName);
        if (cactusDisk->sequencesReadFileHandle != NULL) {
            fclose(cactusDisk->sequencesReadFileHandle);
        }
        if (cactusDisk->sequencesWriteFileHandle != NULL) {
            fsync(fileno(cactusDisk->sequencesWriteFileHandle));
            fclose(cactusDisk->sequencesWriteFileHandle);
        }
    } else {
        assert(cactusDisk->sequencesFileName == NULL);
        assert(cactusDisk->sequencesReadFileHandle == NULL);
        assert(cactusDisk->absSequencesFileName == NULL);
    }

    stCache_destruct(cactusDisk->cache); //Get rid of the cache
    stCache_destruct(cactusDisk->stringCache);

    stList_destruct(cactusDisk->updateRequests);

    free(cactusDisk);
}
stList *getComponents2(stList *adjacencyEdges, stList *stubEdges,
        stList *chainEdges) {
    /*
     * Gets a list of connected components for a set of adjacency, stub and chain edges.
     * If adjacencyEdges, stubEdges or chainEdges are NULL then they are ignored.
     */
    stList *allEdges = stList_construct(); //Build a concatenated list of all the chain, stub and adjacency edges.
    if (adjacencyEdges != NULL) {
        stList_appendAll(allEdges, adjacencyEdges);
    }
    if (stubEdges != NULL) {
        stList_appendAll(allEdges, stubEdges);
    }
    if (chainEdges != NULL) {
        stList_appendAll(allEdges, chainEdges);
    }
    stList *components = getComponents(allEdges); //Gets the graph components.
    stList_destruct(allEdges); //Cleanup the all edges.
    return components;
}
Example #8
static void testBulkRemoveRecords(CuTest *testCase) {
    /*
     * Tests doing a bulk update of a set of records.
     */
    setup();
    int64_t i = 100, j = 110, k = 120, l = 130;
    stKVDatabase_insertRecord(database, 1, &i, sizeof(int64_t));
    stKVDatabase_insertRecord(database, 2, &j, sizeof(int64_t));
    stKVDatabase_insertRecord(database, 3, &k, sizeof(int64_t));
    stKVDatabase_insertRecord(database, 4, &l, sizeof(int64_t));
    stKVDatabase_insertRecord(database, 5, &i, 0); //Test null record addition
    CuAssertTrue(testCase, stKVDatabase_containsRecord(database, 1));
    CuAssertTrue(testCase, stKVDatabase_containsRecord(database, 2));
    CuAssertTrue(testCase, stKVDatabase_containsRecord(database, 3));
    CuAssertTrue(testCase, stKVDatabase_containsRecord(database, 4));
    CuAssertTrue(testCase, stKVDatabase_containsRecord(database, 5));
    CuAssertTrue(testCase, stKVDatabase_getNumberOfRecords(database) == 5);

    stList *requests = stList_construct3(0, (void(*)(void *)) stInt64Tuple_destruct);

    // test empty request list
    stKVDatabase_bulkRemoveRecords(database, requests);

    stList_append(requests, stInt64Tuple_construct(1, (int64_t)1));
    stList_append(requests, stInt64Tuple_construct(1, (int64_t)2));
    stList_append(requests, stInt64Tuple_construct(1, (int64_t)3));
    stList_append(requests, stInt64Tuple_construct(1, (int64_t)4));
    stList_append(requests, stInt64Tuple_construct(1, (int64_t)5));
    stKVDatabase_bulkRemoveRecords(database, requests);

    CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, 1));
    CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, 2));
    CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, 3));
    CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, 4));
    CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, 5));
    CuAssertTrue(testCase, stKVDatabase_getNumberOfRecords(database) == 0);

    stList_destruct(requests);

    teardown();
}
Example #9
static void testPinchIteratorFromFile(CuTest *testCase) {
    for (int64_t test = 0; test < 100; test++) {
        stList *pairwiseAlignments = getRandomPairwiseAlignments();
        st_logInfo("Doing a random pinch iterator from file test %" PRIi64 " with %" PRIi64 " alignments\n", test, stList_length(pairwiseAlignments));
        //Put alignments in a file
        char *tempFile = "tempFileForPinchIteratorTest.cig";
        FILE *fileHandle = fopen(tempFile, "w");
        for (int64_t i = 0; i < stList_length(pairwiseAlignments); i++) {
            cigarWrite(fileHandle, stList_get(pairwiseAlignments, i), 0);
        }
        fclose(fileHandle);
        //Get an iterator
        stPinchIterator *pinchIterator = stPinchIterator_constructFromFile(tempFile);
        //Now test it
        testIterator(testCase, pinchIterator, pairwiseAlignments);
        //Cleanup
        stPinchIterator_destruct(pinchIterator);
        stFile_rmrf(tempFile);
        stList_destruct(pairwiseAlignments);
    }
}
void checkInputs(stSortedSet *nodes, stList *adjacencyEdges,
        stList *stubEdges, stList *chainEdges) {
    /*
     * Checks the inputs to the algorithm are as expected.
     */
    int64_t nodeNumber = stSortedSet_size(nodes);
    assert(nodeNumber % 2 == 0);
    if (nodeNumber > 0) {
        assert(stList_length(stubEdges) > 0);
    }
    assert(
            stList_length(stubEdges) + stList_length(chainEdges) == (nodeNumber
                    / 2));
    checkEdges(stubEdges, nodes, 0, 0);
    checkEdges(chainEdges, nodes, 0, 0);
    stList *stubsAndChainEdges = stList_copy(stubEdges, NULL);
    stList_appendAll(stubsAndChainEdges, chainEdges);
    checkEdges(stubsAndChainEdges, nodes, 1, 0);
    stList_destruct(stubsAndChainEdges);
    checkEdges(adjacencyEdges, nodes, 1, 1);
}
Example #11
static void test_st_randomChoice(CuTest *testCase) {
    /*
     * Exercises the st_randomChoice function.
     */
    stList *list = stList_construct3(0, (void (*)(void *))stIntTuple_destruct);

    stTry {
        st_randomChoice(list);
    } stCatch(except) {
        CuAssertTrue(testCase, stExcept_getId(except) == RANDOM_EXCEPTION_ID);
    }
    stTryEnd

    for(int32_t i = 0; i < 10; i++) {
        stList_append(list, stIntTuple_construct(1, i));
    }
    for(int32_t i = 0; i < 100; i++) {
        CuAssertTrue(testCase, stList_contains(list, st_randomChoice(list)));
    }
    stList_destruct(list);
}
stList *makeMatchingObeyCyclicConstraints(stSortedSet *nodes,
        stList *chosenEdges,
        stSortedSet *allAdjacencyEdges, stList *nonZeroWeightAdjacencyEdges,
        stList *stubEdges, stList *chainEdges,
        bool makeStubCyclesDisjoint) {
    if (stSortedSet_size(nodes) == 0) { //Some of the following functions assume there are at least 2 nodes.
        return stList_construct();
    }

    /*
     * Merge in the stub free components.
     */
    chosenEdges = mergeSimpleCycles2(chosenEdges,
            nonZeroWeightAdjacencyEdges, allAdjacencyEdges, stubEdges,
            chainEdges);

    st_logDebug(
            "After merging in chain only cycles the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
            stList_length(chosenEdges), matchingCardinality(chosenEdges),
            matchingWeight(chosenEdges));

    /*
     * Split stub components.
     */
    if (makeStubCyclesDisjoint) {
        stList *updatedChosenEdges = splitMultipleStubCycles(chosenEdges,
                nonZeroWeightAdjacencyEdges, allAdjacencyEdges, stubEdges,
                chainEdges);
        stList_destruct(chosenEdges);
        chosenEdges = updatedChosenEdges;
        st_logDebug(
                "After making stub cycles disjoint the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
                stList_length(chosenEdges), matchingCardinality(chosenEdges),
                matchingWeight(chosenEdges));
    } else {
        st_logDebug("Not making stub cycles disjoint\n");
    }

    return chosenEdges;
}
Example #13
char *cactusDisk_getString(CactusDisk *cactusDisk, Name name, int64_t start, int64_t length, int64_t strand,
        int64_t totalSequenceLength) {
    /*
     * Gets a string from the database.
     *
     */
    assert(length >= 0);
    if (length == 0) {
        return stString_copy("");
    }
    //First try getting it from the cache
    char *string = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    if (string == NULL) { //If not in the cache, add it to the cache and then get it from the cache.
        stList *list = stList_construct3(0, (void (*)(void *)) substring_destruct);
        stList_append(list, substring_construct(name, start, length));
        cacheSubstringsFromDB(cactusDisk, list);
        stList_destruct(list);
        string = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    }
    assert(string != NULL);
    return string;
}
/*
 * Recursive function which fills a given list with the
 * connected nodes within a module and collects their lifted
 * edges in the same pass.
 */
static void buildFaces_fillTopNodeList2(Cap * cap, stList *list,
        stHash *liftedEdgesTable) {
    stList *liftedEdges;
    int64_t index;

    // Orientation check
    cap = cap_getPositiveOrientation(cap);

    // Limit of recursion
    if (stList_contains(list, cap))
        return;

    // Actual filling
    st_logInfo("Adding cap %p to face\n", cap);
    stList_append(list, cap);

    // Compute lifted edges (the list is allocated only once we know this cap
    // is new, so the early return above cannot leak it)
    liftedEdges = stList_construct3(2, buildFaces_stList_destructElem);
    for (index = 0; index < cap_getChildNumber(cap); index++)
        buildFaces_computeLiftedEdgesAtTopNode(cap_getChild(cap, index), liftedEdges);

    // If the list is empty, discard it...
    if (stList_length(liftedEdges) == 0)
        stList_destruct(liftedEdges);
    // ...otherwise recurse through the lifted edges
    else {
        stHash_insert(liftedEdgesTable, cap, liftedEdges);
        for (index = 0; index < stList_length(liftedEdges); index++)
            buildFaces_fillTopNodeList2(
                    ((LiftedEdge *) stList_get(liftedEdges, index))->destination,
                    list, liftedEdgesTable);
    }

    // Recursion through adjacency
    if (cap_getAdjacency(cap))
        buildFaces_fillTopNodeList2(cap_getAdjacency(cap), list,
                liftedEdgesTable);
}
stList *getPerfectMatching(stSortedSet *nodes,
        stList *adjacencyEdges,
        stList *(*matchingAlgorithm)(stList *edges, int64_t nodeNumber)) {

    checkEdges(adjacencyEdges, nodes, 1, 1); //Checks edges are clique

    if (stSortedSet_size(nodes) == 0) { //Some of the following functions assume there are at least 2 nodes.
        return stList_construct();
    }

    stList *nonZeroWeightAdjacencyEdges = getEdgesWithGreaterThanZeroWeight(
                adjacencyEdges);
    stList *chosenEdges = getSparseMatching(nodes, nonZeroWeightAdjacencyEdges, matchingAlgorithm);
    stList_destruct(nonZeroWeightAdjacencyEdges);
    makeMatchingPerfect(chosenEdges, adjacencyEdges, nodes);

    st_logDebug(
                "Chosen a perfect matching with %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
                stList_length(chosenEdges), matchingCardinality(chosenEdges),
                matchingWeight(chosenEdges));

    return chosenEdges;
}
static void getComponentsP(stHash *nodesToEdges, int64_t node,
        stSortedSet *component) {
    stIntTuple *key = stIntTuple_construct1( node);
    stList *edges = stHash_search(nodesToEdges, key);
    if (edges != NULL) {
        stHash_remove(nodesToEdges, key);
        for (int64_t i = 0; i < stList_length(edges); i++) {
            stIntTuple *edge = stList_get(edges, i);
            if (stSortedSet_search(component, edge) == NULL) {
                stSortedSet_insert(component, edge);
            }
            /*
             * Recursion on stack could equal the total number of nodes.
             */
            getComponentsP(nodesToEdges, stIntTuple_get(edge, 0),
                    component);
            getComponentsP(nodesToEdges, stIntTuple_get(edge, 1),
                    component);
        }
        stList_destruct(edges);
    }
    stIntTuple_destruct(key);
}
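The comment above notes that the recursion depth can equal the total number of nodes. As a hedged alternative, here is a sketch that replaces the recursion with an explicit work list, reusing only the sonLib calls already visible in this example; the name getComponentsIterative is hypothetical:

static void getComponentsIterative(stHash *nodesToEdges, int64_t startNode,
        stSortedSet *component) {
    stList *stack = stList_construct(); //Pending node keys, stored as stIntTuples.
    stList_append(stack, stIntTuple_construct1(startNode));
    while (stList_length(stack) > 0) {
        stIntTuple *key = stList_pop(stack);
        stList *edges = stHash_search(nodesToEdges, key);
        if (edges != NULL) {
            stHash_remove(nodesToEdges, key);
            for (int64_t i = 0; i < stList_length(edges); i++) {
                stIntTuple *edge = stList_get(edges, i);
                if (stSortedSet_search(component, edge) == NULL) {
                    stSortedSet_insert(component, edge);
                }
                //Push both endpoints instead of recursing on them.
                stList_append(stack, stIntTuple_construct1(stIntTuple_get(edge, 0)));
                stList_append(stack, stIntTuple_construct1(stIntTuple_get(edge, 1)));
            }
            stList_destruct(edges);
        }
        stIntTuple_destruct(key);
    }
    stList_destruct(stack);
}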
Example #17
static void test_stSortedSet_searchLessThan(CuTest* testCase) {
    sonLibSortedSetTestSetup();
    for(int32_t i=0; i<size; i++) {
        stSortedSet_insert(sortedSet, stIntTuple_construct(1, input[i]));
    }
    //static int32_t sortedInput[] = { -10, -1, 1, 3, 5, 10, 12 };
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, -11)) == NULL);
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, -10)) == NULL);
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, -5)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, -10)));
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, 13)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 12)));
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, 8)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 5)));
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, 1)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, -1)));
    CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet,
                 stIntTuple_construct(1, 10)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 5)));

    for(int32_t i=0; i<100; i++) {
        stSortedSet_insert(sortedSet, stIntTuple_construct(1, st_randomInt(-1000, 1000)));
    }
    stList *list = stSortedSet_getList(sortedSet);
    for(int32_t i=1; i<stList_length(list); i++) {
        stIntTuple *p = stList_get(list, i-1);
        stIntTuple *j = stList_get(list, i);
        stIntTuple *k = stIntTuple_construct(1, st_randomInt(stIntTuple_getPosition(p, 0)+1, stIntTuple_getPosition(j, 0)+1));
        CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, k) == p);
        stIntTuple_destruct(k);
    }
    stList_destruct(list);

    sonLibSortedSetTestTeardown();
}
Example #18
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    //Make an alignment of the sequences in the ends

    //Get the adjacency sequences to be aligned.
    Cap *cap;
    End_InstanceIterator *it = end_getInstanceIterator(end);
    stList *sequences = stList_construct3(0, (void (*)(void *))adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *))seqFrag_destruct);
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);
    while((cap = end_getNext(it)) != NULL) {
        if(cap_getSide(cap)) {
            cap = cap_getReverse(cap);
        }
        AdjacencySequence *adjacencySequence = adjacencySequence_construct(cap, maxSequenceLength);
        stList_append(sequences, adjacencySequence);
        assert(cap_getAdjacency(cap) != NULL);
        End *otherEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(cap)));
        stList_append(seqFrags, seqFrag_construct(adjacencySequence->string, 0, end_getName(otherEnd)));
        //Increase count of seqfrags with a given end.
        int64_t *c = stHash_search(endInstanceNumbers, otherEnd);
        if(c == NULL) {
            c = st_calloc(1, sizeof(int64_t));
            assert(*c == 0);
            stHash_insert(endInstanceNumbers, otherEnd, c);
        }
        (*c)++;
    }
    end_destructInstanceIterator(it);

    //Get the alignment.
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters);

    //Build an array of weights to reweight pairs in the alignment.
    int64_t *pairwiseAlignmentsPerSequenceNonCommonEnds = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    int64_t *pairwiseAlignmentsPerSequenceCommonEnds = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    //First build array on number of pairwise alignments to each sequence, distinguishing alignments between sequences sharing
    //common ends.
    for(int64_t i=0; i<stList_length(mA->chosenPairwiseAlignments); i++) {
        stIntTuple *pairwiseAlignment = stList_get(mA->chosenPairwiseAlignments, i);
        int64_t seq1 = stIntTuple_get(pairwiseAlignment, 1);
        int64_t seq2 = stIntTuple_get(pairwiseAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seq1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seq2);
        int64_t *pairwiseAlignmentsPerSequence = seqFrag1->rightEndId == seqFrag2->rightEndId
                ? pairwiseAlignmentsPerSequenceCommonEnds : pairwiseAlignmentsPerSequenceNonCommonEnds;
        pairwiseAlignmentsPerSequence[seq1]++;
        pairwiseAlignmentsPerSequence[seq2]++;
    }
    //Now calculate score adjustments.
    double *scoreAdjustmentsNonCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    double *scoreAdjustmentsCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    for(int64_t i=0; i<stList_length(seqFrags); i++) {
        SeqFrag *seqFrag = stList_get(seqFrags, i);
        End *otherEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(otherEnd != NULL);
        assert(stHash_search(endInstanceNumbers, otherEnd) != NULL);
        int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd);
        int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber;

        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[i] <= nonCommonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[i] >= 0);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[i] < commonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[i] >= 0);

        //scoreAdjustmentsNonCommonEnds[i] = ((double)nonCommonInstanceNumber + commonInstanceNumber - 1)/(pairwiseAlignmentsPerSequenceNonCommonEnds[i] + pairwiseAlignmentsPerSequenceCommonEnds[i]);
        //scoreAdjustmentsCommonEnds[i] = scoreAdjustmentsNonCommonEnds[i];
        if(pairwiseAlignmentsPerSequenceNonCommonEnds[i] > 0) {
            scoreAdjustmentsNonCommonEnds[i] = ((double)nonCommonInstanceNumber)/pairwiseAlignmentsPerSequenceNonCommonEnds[i];
            assert(scoreAdjustmentsNonCommonEnds[i] >= 1.0);
            assert(scoreAdjustmentsNonCommonEnds[i] <= nonCommonInstanceNumber);
        }
        else {
            scoreAdjustmentsNonCommonEnds[i] = INT64_MIN;
        }
        if(pairwiseAlignmentsPerSequenceCommonEnds[i] > 0) {
            scoreAdjustmentsCommonEnds[i] = ((double)commonInstanceNumber-1)/pairwiseAlignmentsPerSequenceCommonEnds[i];
            assert(scoreAdjustmentsCommonEnds[i] >= 1.0);
            assert(scoreAdjustmentsCommonEnds[i] <= commonInstanceNumber-1);
        }
        else {
            scoreAdjustmentsCommonEnds[i] = INT64_MIN;
        }
    }

    //Convert the alignment pairs to an alignment of the caps.
    stSortedSet *sortedAlignment =
                stSortedSet_construct3((int (*)(const void *, const void *))alignedPair_cmpFn,
                (void (*)(void *))alignedPair_destruct);
    while(stList_length(mA->alignedPairs) > 0) {
        stIntTuple *alignedPair = stList_pop(mA->alignedPairs);
        assert(stIntTuple_length(alignedPair) == 5);
        int64_t seqIndex1 = stIntTuple_get(alignedPair, 1);
        int64_t seqIndex2 = stIntTuple_get(alignedPair, 3);
        AdjacencySequence *i = stList_get(sequences, seqIndex1);
        AdjacencySequence *j = stList_get(sequences, seqIndex2);
        assert(i != j);
        int64_t offset1 = stIntTuple_get(alignedPair, 2);
        int64_t offset2 = stIntTuple_get(alignedPair, 4);
        int64_t score = stIntTuple_get(alignedPair, 0);
        if(score <= 0) { //Happens when indel probs are included
            score = 1; //This is the minimum
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seqIndex2);
        assert(seqFrag1 != seqFrag2);
        double *scoreAdjustments = seqFrag1->rightEndId == seqFrag2->rightEndId ? scoreAdjustmentsCommonEnds : scoreAdjustmentsNonCommonEnds;
        assert(scoreAdjustments[seqIndex1] != INT64_MIN);
        assert(scoreAdjustments[seqIndex2] != INT64_MIN);
        AlignedPair *alignedPair2 = alignedPair_construct(
                i->subsequenceIdentifier, i->start + (i->strand ? offset1 : -offset1), i->strand,
                j->subsequenceIdentifier, j->start + (j->strand ? offset2 : -offset2), j->strand,
                score*scoreAdjustments[seqIndex1], score*scoreAdjustments[seqIndex2]); //Do the reweighting here.
        assert(stSortedSet_search(sortedAlignment, alignedPair2) == NULL);
        assert(stSortedSet_search(sortedAlignment, alignedPair2->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, alignedPair2);
        stSortedSet_insert(sortedAlignment, alignedPair2->reverse);
        stIntTuple_destruct(alignedPair);
    }

    //Cleanup
    stList_destruct(seqFrags);
    stList_destruct(sequences);
    free(pairwiseAlignmentsPerSequenceNonCommonEnds);
    free(pairwiseAlignmentsPerSequenceCommonEnds);
    free(scoreAdjustmentsNonCommonEnds);
    free(scoreAdjustmentsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);

    return sortedAlignment;
}
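To make the reweighting step in makeEndAlignment concrete, here is a small worked example with hypothetical counts (not taken from the code above):

//Illustration only: suppose there are seven adjacency sequences in total and three
//of them (sequence i included) share i's right end.
int64_t commonInstanceNumber = 3, nonCommonInstanceNumber = 7 - 3;
int64_t nonCommonAlignments = 2, commonAlignments = 1; //pairwise alignments chosen for sequence i
double adjNonCommon = ((double) nonCommonInstanceNumber) / nonCommonAlignments; //== 2.0
double adjCommon = ((double) commonInstanceNumber - 1) / commonAlignments;      //== 2.0
//Each pairwise score involving sequence i is then multiplied by the matching adjustment,
//approximating the weight it would carry in an all-against-all alignment.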
static void test_addBlockToHash_4(CuTest *testCase) {
    // concatenation with sequence breakpoint due to *strand* alone
    // note that name3 is well within the interstitial boundary, the two blocks
    // essentially looking like >>>>>>>>>>>>> <<<<< (strand diffs)
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1      0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name3.chr1     0 13 +       100 GCAGCTGAAAACA\n",
                                                     observedList
                                                     );
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13  5 + 158545518 ACGTA\n"
                                                   "s name.chr1      12  5 +       100 gtcGG\n"
                                                   "s name2.chr1     10  5 +       100 ATGTg\n"
                                                   "s name3.chr1     82  5 -       100 GGGGG\n"
                                                   , 3);
    stHash *expectedHash = NULL;
    expectedHash = createBlockHashFromString("a score=0\n"
                                             "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
                                             "s name.chr1      0 17 +       100 ATGT---ATGCCGac----------gtcGG\n"
                                             "s name2.chr1     0 15 +       100 ATGT---ATGCCG------------ATGTg\n"
                                             "s name3.chr1     0 28 +       100 GCAGCTGAAAACA--NNNNNNNNNNGGGGG\n",
                                             expectedList
                                             );
    row_t *r = stHash_search(expectedHash, "name3");
    r->prevRightPos = 86;
    r->strand = '*';
    r->prevStrand = '-';
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgtc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("GCAGCTGAAAACACCCCCgggggggggggggggggggggggggggggggg"
                               "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr1"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
static void test_addBlockToHash_6(CuTest *testCase) {
    // concatenation with sequence breakpoint due to sequence name
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1      0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name3.chr1     0 13 +       100 GCAGCTGAAAACA\n",
                                                     observedList
                                                     );
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13  5 + 158545518 ACGTA\n"
                                                   "s name.chr1      12  5 +       100 gtcGG\n"
                                                   "s name2.chr1     10  5 +       100 ATGTg\n"
                                                   "s name3.chr2     13  5 +       100 aaaaa\n"
                                                   , 3);
    stHash *expectedHash = NULL;
    expectedHash = createBlockHashFromString("a score=0\n"
                                             "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
                                             "s name.chr1      0 17 +       100 ATGT---ATGCCGac----------gtcGG\n"
                                             "s name2.chr1     0 15 +       100 ATGT---ATGCCG------------ATGTg\n"
                                             "s name3          0 28 +        28 GCAGCTGAAAACA--NNNNNNNNNNaaaaa\n",
                                             expectedList
                                             );
    row_t *r = stHash_search(expectedHash, "name3");
    r->prevRightPos = 17;
    r->strand = '*';
    r->prevStrand = '+';
    r->multipleNames = true;
    free(r->prevName);
    r->prevName = stString_copy("name3.chr2");
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgtc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
                               "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr1"), mtfs);
    mtfs = newMtfseqFromString("TTTTTTTTTTTTTaaaaaGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                               "CCCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr2"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
Example #21
Name cactusDisk_addString(CactusDisk *cactusDisk, const char *string) {
    /*
     * Adds a string to the database.
     */
    if (cactusDisk->storeSequencesInAFile) {
        if (cactusDisk->sequencesWriteFileHandle == NULL) {
            //We do not allow the read file handle to be open at the same time.
            if (cactusDisk->sequencesReadFileHandle != NULL) {
                fclose(cactusDisk->sequencesReadFileHandle);
                cactusDisk->sequencesReadFileHandle = NULL;
            }
            cactusDisk->sequencesWriteFileHandle = fopen(cactusDisk->absSequencesFileName, "a");
            assert(cactusDisk->sequencesWriteFileHandle != NULL);
        }
        else {
            //The read file handle should not be open at the same time.
            assert(cactusDisk->sequencesReadFileHandle == NULL);
        }
        Name name = ftell(cactusDisk->sequencesWriteFileHandle) + 1;

        //Extra temporary cheesy code to avoid potential overflow in fprintf
        int64_t chunkSize = 1000000000; //1 gig approx chunks, to avoid a possible overflow issue with fprintf
        int64_t length = strlen(string);
        if (length > chunkSize) {
            fprintf(cactusDisk->sequencesWriteFileHandle, ">");
            for (int64_t i = 0; i < length;) {
                int64_t j = i + chunkSize <= length ? chunkSize : length - i;
                char *string2 = memcpy(st_malloc(sizeof(char) * (j + 1)), string + i, sizeof(char) * j);
                string2[j] = '\0';
                int64_t k = fprintf(cactusDisk->sequencesWriteFileHandle, "%s", string2);
                (void) k;
                assert(k == j);
                free(string2);
                i += j;
            }
        } else {
            //Short enough to write with a single fprintf call.
            int64_t k = fprintf(cactusDisk->sequencesWriteFileHandle, ">%s", string);
            (void) k;
            assert(k == length + 1);
        }

#ifndef NDEBUG
        // Extra fsync may not be necessary.
        fsync(fileno(cactusDisk->sequencesWriteFileHandle));
        fclose(cactusDisk->sequencesWriteFileHandle);
        cactusDisk->sequencesWriteFileHandle = NULL;
        cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "r");
        char *string2 = getStringFromDisk(cactusDisk->sequencesReadFileHandle, name, 0, length);
        for (int64_t i = 0; i < length; i++) {
            assert(string[i] == string2[i]);
        }
        free(string2);
#endif
        return name;
    } else {
        int64_t stringSize = strlen(string);
        int64_t intervalSize = ceil((double) stringSize / CACTUS_DISK_SEQUENCE_CHUNK_SIZE);
        Name name = cactusDisk_getUniqueIDInterval(cactusDisk, intervalSize);
        stList *insertRequests = stList_construct3(0, (void (*)(void *)) stKVDatabaseBulkRequest_destruct);
        for (int64_t i = 0; i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE < stringSize; i++) {
            int64_t j =
                    (i + 1) * CACTUS_DISK_SEQUENCE_CHUNK_SIZE < stringSize ?
                            CACTUS_DISK_SEQUENCE_CHUNK_SIZE : stringSize - i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE;
            char *subString = stString_getSubString(string, i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE, j);
            stList_append(insertRequests, stKVDatabaseBulkRequest_constructInsertRequest(name + i, subString, j + 1));
            free(subString);
        }
        stTry {
            stKVDatabase_bulkSetRecords(cactusDisk->database, insertRequests);
        } stCatch(except) {
            stThrowNewCause(except, ST_KV_DATABASE_EXCEPTION_ID,
                    "An unknown database error occurred when we tried to add a string to the cactus disk");
        } stTryEnd;
        stList_destruct(insertRequests);
        return name;
    }
}
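For the key/value branch of cactusDisk_addString, the chunk accounting can be illustrated with hypothetical numbers (a chunk size of 100 stands in for CACTUS_DISK_SEQUENCE_CHUNK_SIZE purely for this sketch):

//Illustration only: a 250-character string split into 100-character chunks
//reserves ceil(250 / 100) = 3 consecutive names; the last record holds the
//remaining 50 characters plus a null terminator (hence the j + 1 record size).
int64_t chunkSize = 100;
int64_t stringSize = 250;
int64_t intervalSize = (int64_t) ceil((double) stringSize / chunkSize); //== 3
int64_t lastChunkLength = stringSize - (intervalSize - 1) * chunkSize;  //== 50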
int main(int argc, char *argv[]) {
    /*
     * Script for adding a reference genome to a flower.
     */

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * secondaryDatabaseString = NULL;
    char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader();
    bool bottomUpPhase = 0;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "secondaryDisk", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument,
                0, 'h' }, { "bottomUpPhase", no_argument, 0, 'j' }, { 0, 0, 0, 0 } };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:b:c:d:e:g:hi:j", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'b':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                secondaryDatabaseString = stString_copy(optarg);
                break;
            case 'g':
                referenceEventString = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'j':
                bottomUpPhase = 1;
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    st_logInfo("referenceEventString = %s\n", referenceEventString);
    st_logInfo("bottomUpPhase = %i\n", bottomUpPhase);

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true);
    stKVDatabaseConf_destruct(kvDatabaseConf);
    st_logInfo("Set up the flower disk\n");

    stKVDatabase *sequenceDatabase = NULL;
    if (secondaryDatabaseString != NULL) {
        kvDatabaseConf = stKVDatabaseConf_constructFromString(secondaryDatabaseString);
        sequenceDatabase = stKVDatabase_construct(kvDatabaseConf, 0);
        stKVDatabaseConf_destruct(kvDatabaseConf);
    }

    FlowerStream *flowerStream = flowerWriter_getFlowerStream(cactusDisk, stdin);
    Flower *flower;
    while ((flower = flowerStream_getNext(flowerStream)) != NULL) {
        st_logDebug("Processing flower %" PRIi64 "\n", flower_getName(flower));

        ///////////////////////////////////////////////////////////////////////////
        // Get the appropriate event names
        ///////////////////////////////////////////////////////////////////////////

        st_logInfo("%s\n", eventTree_makeNewickString(flower_getEventTree(flower)));
        Event *referenceEvent = eventTree_getEventByHeader(flower_getEventTree(flower), referenceEventString);
        if (referenceEvent == NULL) {
            st_errAbort("Reference event %s not found in tree. Check your "
                        "--referenceEventString option", referenceEventString);
        }
        Name referenceEventName = event_getName(referenceEvent);

        ///////////////////////////////////////////////////////////////////////////
        // Now do bottom up or top down, depending
        ///////////////////////////////////////////////////////////////////////////
        stList *flowers = stList_construct();
        stList_append(flowers, flower);
        preCacheNestedFlowers(cactusDisk, flowers);
        if (bottomUpPhase) {
            assert(sequenceDatabase != NULL);

            cactusDisk_preCacheSegmentStrings(cactusDisk, flowers);
            bottomUp(flowers, sequenceDatabase, referenceEventName, !flower_hasParentGroup(flower), generateJukesCantorMatrix);

            // Unload the nested flowers to save memory. They haven't
            // been changed, so we don't write them to the cactus
            // disk.
            Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
            Group *group;
            while ((group = flower_getNextGroup(groupIt)) != NULL) {
                if (!group_isLeaf(group)) {
                    flower_unload(group_getNestedFlower(group));
                }
            }
            flower_destructGroupIterator(groupIt);
            assert(!flower_isParentLoaded(flower));

            // Write this flower to disk.
            cactusDisk_addUpdateRequest(cactusDisk, flower);
        } else {
            topDown(flower, referenceEventName);

            // We've changed the nested flowers, but not this
            // flower. We write the nested flowers to disk, then
            // unload them to save memory. This flower will be
            // unloaded by the flower-stream code.
            Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
            Group *group;
            while ((group = flower_getNextGroup(groupIt)) != NULL) {
                if (!group_isLeaf(group)) {
                    cactusDisk_addUpdateRequest(cactusDisk, group_getNestedFlower(group));
                    flower_unload(group_getNestedFlower(group));
                }
            }
            flower_destructGroupIterator(groupIt);
        }
        stList_destruct(flowers);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower(s) back to disk.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk\n");

    ///////////////////////////////////////////////////////////////////////////
    //Clean up.
    ///////////////////////////////////////////////////////////////////////////

    if (sequenceDatabase != NULL) {
        stKVDatabase_destruct(sequenceDatabase);
    }

    cactusDisk_destruct(cactusDisk);

    return 0; //Exiting without cleanup is quicker; remove this return to run the cleanup below when doing memory-leak detection.

    free(cactusDiskDatabaseString);
    free(referenceEventString);
    free(logLevelString);

    st_logInfo("Cleaned stuff up and am finished\n");

    return 0;
}
Example #23
static void cacheSubstringsFromDB(CactusDisk *cactusDisk, stList *substrings) {
    /*
     * Caches the given set of substrings in the cactusDisk cache.
     */
    if (cactusDisk->storeSequencesInAFile) {
        if (cactusDisk->sequencesReadFileHandle == NULL) {
            if(cactusDisk->sequencesWriteFileHandle != NULL) {
                fsync(fileno(cactusDisk->sequencesWriteFileHandle));
                fclose(cactusDisk->sequencesWriteFileHandle);
                cactusDisk->sequencesWriteFileHandle = NULL;
            }
            cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "r");
            assert(cactusDisk->sequencesReadFileHandle != NULL);
        }
        else {
            assert(cactusDisk->sequencesWriteFileHandle == NULL);
        }
        for (int64_t i = 0; i < stList_length(substrings); i++) {
            Substring *substring = stList_get(substrings, i);
            char *string = getStringFromDisk(cactusDisk->sequencesReadFileHandle, substring->name, substring->start,
                    substring->length);
            stCache_setRecord(cactusDisk->stringCache, substring->name, substring->start, substring->length, string);
#ifndef NDEBUG
            int64_t bytesRead;
            char *string2 = stCache_getRecord(cactusDisk->stringCache, substring->name, substring->start,
                    substring->length, &bytesRead);
            assert(bytesRead == substring->length);
            for (int64_t j = 0; j < substring->length; j++) {
                assert(string2[j] == string[j]);
            }
            free(string2);
#endif
            free(string);
        }
    } else {
        stList *getRequests = stList_construct3(0, free);
        for (int64_t i = 0; i < stList_length(substrings); i++) {
            Substring *substring = stList_get(substrings, i);
            int64_t intervalSize = (substring->length + substring->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE
                    - substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1;
            Name shiftedName = substring->name + substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE;
            for (int64_t j = 0; j < intervalSize; j++) {
                int64_t *k = st_malloc(sizeof(int64_t));
                k[0] = shiftedName + j;
                stList_append(getRequests, k);
            }
        }
        if (stList_length(getRequests) == 0) {
            stList_destruct(getRequests);
            return;
        }
        stList *records = NULL;
        stTry {
            records = stKVDatabase_bulkGetRecords(cactusDisk->database, getRequests);
        } stCatch(except) {
            stThrowNewCause(except, ST_KV_DATABASE_EXCEPTION_ID,
                    "An unknown database error occurred when getting a sequence string");
        } stTryEnd;
        assert(records != NULL);
        assert(stList_length(records) == stList_length(getRequests));
        stList_destruct(getRequests);
        stListIterator *recordsIt = stList_getIterator(records);
        for (int64_t i = 0; i < stList_length(substrings); i++) {
            Substring *substring = stList_get(substrings, i);
            int64_t intervalSize = (substring->length + substring->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE
                    - substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1;
            stList *strings = stList_construct();
            while (intervalSize-- > 0) {
                int64_t recordSize;
                stKVDatabaseBulkResult *result = stList_getNext(recordsIt);
                assert(result != NULL);
                char *string = stKVDatabaseBulkResult_getRecord(result, &recordSize);
                assert(string != NULL);
                assert(strlen(string) == recordSize - 1);
                stList_append(strings, string);
                assert(recordSize <= CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1);
            }
            assert(stList_length(strings) > 0);
            char *joinedString = stString_join2("", strings);
            stCache_setRecord(cactusDisk->stringCache, substring->name,
                    (substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE) * CACTUS_DISK_SEQUENCE_CHUNK_SIZE,
                    strlen(joinedString), joinedString);
            free(joinedString);
            stList_destruct(strings);
        }
        assert(stList_getNext(recordsIt) == NULL);
        stList_destructIterator(recordsIt);
        stList_destruct(records);
    }
}
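The interval arithmetic in the bulk-get branch above can be sanity-checked with a small worked example; the chunk size of 100 is hypothetical and only illustrates the formula:

//Illustration only: with a chunk size of 100, a substring starting at offset 250
//with length 120 covers bytes 250..369 of its sequence, i.e. chunk records 2 and 3.
int64_t chunkSize = 100;
int64_t start = 250, length = 120;
int64_t firstChunkIndex = start / chunkSize;                                    //== 2
int64_t intervalSize = (length + start - 1) / chunkSize - firstChunkIndex + 1;  //== 2 records to request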
Example #24
static void teardown() {
    if(list != NULL) {
        stList_destruct(list);
        list = NULL;
    }
}
Example #25
int main(int argc, char *argv[]) {
    /*
     * Script for adding alignments to the cactus tree.
     */
    int64_t startTime;
    stKVDatabaseConf *kvDatabaseConf;
    CactusDisk *cactusDisk;
    int key, k;

    bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL;
    stSet *outgroupThreads = NULL;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * alignmentsFile = NULL;
    char * constraintsFile = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * lastzArguments = "";
    int64_t minimumSequenceLengthForBlast = 1;

    //Parameters for annealing/melting rounds
    int64_t *annealingRounds = NULL;
    int64_t annealingRoundsLength = 0;
    int64_t *meltingRounds = NULL;
    int64_t meltingRoundsLength = 0;

    //Parameters for melting
    float maximumAdjacencyComponentSizeRatio = 10;
    int64_t blockTrim = 0;
    int64_t alignmentTrimLength = 0;
    int64_t *alignmentTrims = NULL;
    int64_t chainLengthForBigFlower = 1000000;
    int64_t longChain = 2;
    int64_t minLengthForChromosome = 1000000;
    float proportionOfUnalignedBasesForNewChromosome = 0.8;
    bool breakChainsAtReverseTandems = 1;
    int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX;
    bool realign = 0;
    char *realignArguments = "";
    bool removeRecoverableChains = false;
    bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL;
    int64_t maxRecoverableChainsIterations = 1;
    int64_t maxRecoverableChainLength = INT64_MAX;

    //Parameters for removing ancient homologies
    bool doPhylogeny = false;
    int64_t phylogenyNumTrees = 1;
    enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON;
    enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD;
    double breakpointScalingFactor = 1.0;
    bool phylogenySkipSingleCopyBlocks = 0;
    int64_t phylogenyMaxBaseDistance = 1000;
    int64_t phylogenyMaxBlockDistance = 100;
    bool phylogenyKeepSingleDegreeBlocks = 0;
    stList *phylogenyTreeBuildingMethods = stList_construct();
    enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING;
    stList_append(phylogenyTreeBuildingMethods, &defaultMethod);
    double phylogenyCostPerDupPerBase = 0.2;
    double phylogenyCostPerLossPerBase = 0.2;
    const char *debugFileName = NULL;
    const char *referenceEventHeader = NULL;
    double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0;
    int64_t numTreeBuildingThreads = 2;
    int64_t minimumBlockDegreeToCheckSupport = 10;
    double minimumBlockHomologySupport = 0.7;
    double nucleotideScalingFactor = 1.0;
    HomologyUnitType phylogenyHomologyUnitType = BLOCK;
    enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
    bool sortAlignments = false;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, {
                "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' },
                { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, {
                        "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim",
                        required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree",
                        required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, {
                        "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, {
                        "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio",
                        required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome",
                        required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' },
                        { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' },
                        { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' },
                        { "phylogenyNumTrees", required_argument, 0, 'D' },
                        { "phylogenyRootingMethod", required_argument, 0, 'E' },
                        { "phylogenyScoringMethod", required_argument, 0, 'F' },
                        { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' },
                        { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' },
                        { "phylogenyMaxBaseDistance", required_argument, 0, 'I' },
                        { "phylogenyMaxBlockDistance", required_argument, 0, 'J' },
                        { "phylogenyDebugFile", required_argument, 0, 'K' },
                        { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' },
                        { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' },
                        { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' },
                        { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' },
                        { "referenceEventHeader", required_argument, 0, 'P' },
                        { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' },
                        { "numTreeBuildingThreads", required_argument, 0, 'R' },
                        { "phylogeny", no_argument, 0, 'S' },
                        { "minimumBlockHomologySupport", required_argument, 0, 'T' },
                        { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' },
                        { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' },
                        { "removeRecoverableChains", required_argument, 0, 'W' },
                        { "minimumNumberOfSpecies", required_argument, 0, 'X' },
                        { "phylogenyHomologyUnitType", required_argument, 0, 'Y' },
                        { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' },
                        { "maxRecoverableChainsIterations", required_argument, 0, '1' },
                        { "maxRecoverableChainLength", required_argument, 0, '2' },
                        { 0, 0, 0, 0 } };

        int option_index = 0;

        key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                st_setLogLevelFromString(logLevelString);
                break;
            case 'b':
                alignmentsFile = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                lastzArguments = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                annealingRounds = getInts(optarg, &annealingRoundsLength);
                break;
            case 'o':
                meltingRounds = getInts(optarg, &meltingRoundsLength);
                break;
            case 'k':
                alignmentTrims = getInts(optarg, &alignmentTrimLength);
                break;
            case 'm':
                k = sscanf(optarg, "%f", &minimumTreeCoverage);
                assert(k == 1);
                break;
            case 'n':
                k = sscanf(optarg, "%" PRIi64 "", &blockTrim);
                assert(k == 1);
                break;
            case 'p':
                k = sscanf(optarg, "%" PRIi64 "", &minimumDegree);
                assert(k == 1);
                break;
            case 'q':
                k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree);
                assert(k == 1);
                break;
            case 'r':
                k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree);
                assert(k == 1);
                break;
            case 't':
                if (strcmp(optarg, "singleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByOutgroup;
                } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByOutgroup;
                } else if (strcmp(optarg, "singleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByRepeatSpecies;
                } else if (strcmp(optarg, "relaxedSingleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByRepeatSpecies;
                } else if (strcmp(optarg, "singleCopyChr") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyChr;
                } else if (strcmp(optarg, "singleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyIngroup;
                } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedSingleCopyIngroup;
                } else if (strcmp(optarg, "none") == 0) {
                    sortAlignments = false;
                    filterFn = NULL;
                } else {
                    st_errAbort("Could not recognize alignmentFilter option %s", optarg);
                }
                break;
            case 'v':
                k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast);
                assert(k == 1);
                break;
            case 'w':
                k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio);
                assert(k == 1);
                break;
            case 'x':
                constraintsFile = stString_copy(optarg);
                break;
            case 'y':
                k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome);
                assert(k == 1);
                break;
            case 'z':
                k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome);
                assert(k == 1);
                break;
            case 'A':
                k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds);
                assert(k == 1);
                break;
            case 'B':
                realign = 1;
                break;
            case 'C':
                realignArguments = stString_copy(optarg);
                break;
            case 'D':
                k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees);
                assert(k == 1);
                break;
            case 'E':
                if (!strcmp(optarg, "outgroupBranch")) {
                    phylogenyRootingMethod = OUTGROUP_BRANCH;
                } else if (!strcmp(optarg, "longestBranch")) {
                    phylogenyRootingMethod = LONGEST_BRANCH;
                } else if (!strcmp(optarg, "bestRecon")) {
                    phylogenyRootingMethod = BEST_RECON;
                } else {
                    st_errAbort("Invalid tree rooting method: %s", optarg);
                }
                break;
            case 'F':
                if (!strcmp(optarg, "reconCost")) {
                    phylogenyScoringMethod = RECON_COST;
                } else if (!strcmp(optarg, "nucLikelihood")) {
                    phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD;
                } else if (!strcmp(optarg, "reconLikelihood")) {
                    phylogenyScoringMethod = RECON_LIKELIHOOD;
                } else if (!strcmp(optarg, "combinedLikelihood")) {
                    phylogenyScoringMethod = COMBINED_LIKELIHOOD;
                } else {
                    st_errAbort("Invalid tree scoring method: %s", optarg);
                }
                break;
            case 'G':
                k = sscanf(optarg, "%lf", &breakpointScalingFactor);
                assert(k == 1);
                break;
            case 'H':
                phylogenySkipSingleCopyBlocks = true;
                break;
            case 'I':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance);
                assert(k == 1);
                break;
            case 'J':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance);
                assert(k == 1);
                break;
            case 'K':
                debugFileName = stString_copy(optarg);
                break;
            case 'L':
                phylogenyKeepSingleDegreeBlocks = true;
                break;
            case 'M':
                // clear the default setting of the list
                stList_destruct(phylogenyTreeBuildingMethods);
                phylogenyTreeBuildingMethods = stList_construct();
                stList *methodStrings = stString_splitByString(optarg, ",");

                for (int64_t i = 0; i < stList_length(methodStrings); i++) {
                    char *methodString = stList_get(methodStrings, i);
                    enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod));
                    if (strcmp(methodString, "neighborJoining") == 0) {
                        *method = NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "guidedNeighborJoining") == 0) {
                        *method = GUIDED_NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "splitDecomposition") == 0) {
                        *method = SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "strictSplitDecomposition") == 0) {
                        *method = STRICT_SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "removeBadChains") == 0) {
                        *method = REMOVE_BAD_CHAINS;
                    } else {
                        st_errAbort("Unknown tree building method: %s", methodString);
                    }
                    stList_append(phylogenyTreeBuildingMethods, method);
                }
                stList_destruct(methodStrings);
                break;
            case 'N':
                k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase);
                assert(k == 1);
                break;
            case 'O':
                k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase);
                assert(k == 1);
                break;
            case 'P':
                referenceEventHeader = stString_copy(optarg);
                break;
            case 'Q':
                k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce);
                assert(k == 1);
                break;
            case 'R':
                k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads);
                assert(k == 1);
                break;
            case 'S':
                doPhylogeny = true;
                break;
            case 'T':
                k = sscanf(optarg, "%lf", &minimumBlockHomologySupport);
                assert(k == 1);
                assert(minimumBlockHomologySupport <= 1.0);
                assert(minimumBlockHomologySupport >= 0.0);
                break;
            case 'U':
                k = sscanf(optarg, "%lf", &nucleotideScalingFactor);
                assert(k == 1);
                break;
            case 'V':
                k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport);
                assert(k == 1);
                break;
            case 'W':
                if (strcmp(optarg, "1") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = NULL;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup;
                } else if (strcmp(optarg, "0") == 0) {
                    removeRecoverableChains = false;
                } else {
                    st_errAbort("Could not parse removeRecoverableChains argument");
                }
                break;
            case 'X':
                k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies);
                if (k != 1) {
                    st_errAbort("Error parsing the minimumNumberOfSpecies argument");
                }
                break;
            case 'Y':
                if (strcmp(optarg, "chain") == 0) {
                    phylogenyHomologyUnitType = CHAIN;
                } else if (strcmp(optarg, "block") == 0) {
                    phylogenyHomologyUnitType = BLOCK;
                } else {
                    st_errAbort("Could not parse the phylogenyHomologyUnitType argument");
                }
                break;
            case 'Z':
                if (strcmp(optarg, "jukesCantor") == 0) {
                    phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
                } else if (strcmp(optarg, "none") == 0 ) {
                    phylogenyDistanceCorrectionMethod = NONE;
                } else {
                    st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument");
                }
                break;
            case '1':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainsIterations argument");
                }
                break;
            case '2':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainLength argument");
                }
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);
    assert(minimumTreeCoverage >= 0.0);
    assert(minimumTreeCoverage <= 1.0);
    assert(blockTrim >= 0);
    assert(annealingRoundsLength >= 0);
    for (int64_t i = 0; i < annealingRoundsLength; i++) {
        assert(annealingRounds[i] >= 0);
    }
    assert(meltingRoundsLength >= 0);
    for (int64_t i = 1; i < meltingRoundsLength; i++) {
        assert(meltingRounds[i - 1] < meltingRounds[i]);
        assert(meltingRounds[i - 1] >= 1);
    }
    assert(alignmentTrimLength >= 0);
    for (int64_t i = 0; i < alignmentTrimLength; i++) {
        assert(alignmentTrims[i] >= 0);
    }
    assert(minimumOutgroupDegree >= 0);
    assert(minimumIngroupDegree >= 0);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    st_logInfo("Set up the flower disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Sort the constraints
    ///////////////////////////////////////////////////////////////////////////

    stPinchIterator *pinchIteratorForConstraints = NULL;
    if (constraintsFile != NULL) {
        pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile);
        st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Do the alignment
    ///////////////////////////////////////////////////////////////////////////

    startTime = time(NULL);

    stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk);
    if (alignmentsFile == NULL) {
        cactusDisk_preCacheStrings(cactusDisk, flowers);
    }
    char *tempFile1 = NULL;
    for (int64_t i = 0; i < stList_length(flowers); i++) {
        flower = stList_get(flowers, i);
        if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks
            st_logDebug("Processing flower: %lli\n", flower_getName(flower));

            stCaf_setFlowerForAlignmentFiltering(flower);

            //Set up the graph and add the initial alignments
            stPinchThreadSet *threadSet = stCaf_setup(flower);

            //Build the set of outgroup threads
            outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet);

            //Setup the alignments
            stPinchIterator *pinchIterator;
            stList *alignmentsList = NULL;
            if (alignmentsFile != NULL) {
                assert(i == 0);
                assert(stList_length(flowers) == 1);
                if (sortAlignments) {
                    tempFile1 = getTempFile();
                    stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1);
                    pinchIterator = stPinchIterator_constructFromFile(tempFile1);
                } else {
                    pinchIterator = stPinchIterator_constructFromFile(alignmentsFile);
                }
            } else {
                if (tempFile1 == NULL) {
                    tempFile1 = getTempFile();
                }
                alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1);
                if (sortAlignments) {
                    stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList);
                }
                st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList));
                pinchIterator = stPinchIterator_constructFromList(alignmentsList);
            }

            for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) {
                int64_t minimumChainLength = annealingRounds[annealingRound];
                int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0;
                st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim);
                stPinchIterator_setTrim(pinchIterator, alignmentTrim);

                //Add back in the constraints
                if (pinchIteratorForConstraints != NULL) {
                    stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn);
                }

                //Do the annealing
                if (annealingRound == 0) {
                    stCaf_anneal(threadSet, pinchIterator, filterFn);
                } else {
                    stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn);
                }

                // Dump the block degree and length distribution to a file
                if (debugFileName != NULL) {
                    dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName));
                }

                printf("Sequence graph statistics after annealing:\n");
                printThreadSetStatistics(threadSet, flower, stdout);

                // Check for poorly-supported blocks--those that have
                // been transitively aligned together but with very
                // few homologies supporting the transitive
                // alignment. These "megablocks" can snarl up the
                // graph so that a lot of extra sequence gets thrown
                // away in the first melting step.
                stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet);
                stPinchBlock *block;
                while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) {
                    if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) {
                        uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block);
                        uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower);
                        double support = ((double) supportingHomologies) / possibleSupportingHomologies;
                        if (support < minimumBlockHomologySupport) {
                            fprintf(stdout, "Destroyed a megablock with degree %" PRIi64
                                    " and %" PRIi64 " supporting homologies out of a maximum "
                                    "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block),
                                    supportingHomologies, possibleSupportingHomologies, support);
                            stPinchBlock_destruct(block);
                        }
                    }
                }

                //Do the melting rounds
                for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) {
                    int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound];
                    st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound);
                    if (minimumChainLengthForMeltingRound >= minimumChainLength) {
                        break;
                    }
                    stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX);
                } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength);
                stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds);
                //This does the filtering of blocks that do not have the required species/tree-coverage/degree.
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            }

            if (removeRecoverableChains) {
                stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
            }
            if (debugFileName != NULL) {
                dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName));
            }

            printf("Sequence graph statistics after melting:\n");
            printThreadSetStatistics(threadSet, flower, stdout);

            // Build a tree for each block, then use each tree to
            // partition the homologies between the ingroup sequences
            // into those that occur before the speciation with the
            // outgroup and those that occur after it.

            if (stSet_size(outgroupThreads) > 0 && doPhylogeny) {
                st_logDebug("Starting to build trees and partition ingroup homologies\n");
                stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet);
                st_logDebug("Got sets of thread strings and set of threads that are outgroups\n");
                stCaf_PhylogenyParameters params;
                params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod;
                params.treeBuildingMethods = phylogenyTreeBuildingMethods;
                params.rootingMethod = phylogenyRootingMethod;
                params.scoringMethod = phylogenyScoringMethod;
                params.breakpointScalingFactor = breakpointScalingFactor;
                params.nucleotideScalingFactor = nucleotideScalingFactor;
                params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks;
                params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks;
                params.costPerDupPerBase = phylogenyCostPerDupPerBase;
                params.costPerLossPerBase = phylogenyCostPerLossPerBase;
                params.maxBaseDistance = phylogenyMaxBaseDistance;
                params.maxBlockDistance = phylogenyMaxBlockDistance;
                params.numTrees = phylogenyNumTrees;
                params.ignoreUnalignedBases = 1;
                params.onlyIncludeCompleteFeatureBlocks = 0;
                params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce;
                params.numTreeBuildingThreads = numTreeBuildingThreads;

                assert(params.numTreeBuildingThreads >= 1);

                stCaf_buildTreesToRemoveAncientHomologies(
                    threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, &params,
                    debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader);
                stHash_destruct(threadStrings);
                st_logDebug("Finished building trees\n");

                if (removeRecoverableChains) {
                    // We melt recoverable chains after splitting, as
                    // well as before, to alleviate coverage loss
                    // caused by bad splits.
                    stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
                }

                // Enforce the block constraints on minimum degree,
                // etc. after splitting.
                stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX);
            }

            //Sort out case when we allow blocks of degree 1
            if (minimumDegree < 2) {
                st_logDebug("Creating degree 1 blocks\n");
                stCaf_makeDegreeOneBlocks(threadSet);
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components
                st_logDebug("Breaking up components greedily\n");
                stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio);
            }

            //Finish up
            stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome,
                    proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point.
            st_logInfo("Ran the cactus core script\n");

            //Cleanup
            stPinchThreadSet_destruct(threadSet);
            stPinchIterator_destruct(pinchIterator);
            stSet_destruct(outgroupThreads);

            if (alignmentsList != NULL) {
                stList_destruct(alignmentsList);
            }
            st_logInfo("Cleaned up from main loop\n");
        } else {
            st_logInfo("We've already built blocks / alignments for this flower\n");
        }
    }
    stList_destruct(flowers);
    if (tempFile1 != NULL) {
        st_system("rm %s", tempFile1);
    }

    if (constraintsFile != NULL) {
        stPinchIterator_destruct(pinchIteratorForConstraints);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////
    st_logDebug("Writing the flowers to disk\n");
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime);

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);
}
Example #26
0
int main(int argc, char *argv[]) {
    // Parse arguments
    if (argc != 3) {
        usage(argv);
        return 1;
    }

    // You would load a custom HMM here if you wanted using
    // hmm_getStateMachine (see the realign code)
    StateMachine *stateMachine  = stateMachine5_construct(fiveState);
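    // A hypothetical sketch (not in the original example) of what loading a
    // custom HMM could look like; the function names and the hmmFile variable
    // are assumptions based on the comment above and the realign code, and
    // may differ from the real API:
    //
    //     Hmm *hmm = hmm_loadFromFile(hmmFile);               // hmmFile is hypothetical
    //     StateMachine *stateMachine = hmm_getStateMachine(hmm);
    //     hmm_destruct(hmm);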

    PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct();

    stHash *targetSequences = readFastaFile(argv[1]);
    stHash *querySequences = readFastaFile(argv[2]);

    // For each query sequence, align it against all target sequences.
    stHashIterator *queryIt = stHash_getIterator(querySequences);
    char *queryHeader;
    while ((queryHeader = stHash_getNext(queryIt)) != NULL) {
        char *querySeq = stHash_search(querySequences, queryHeader);
        stHashIterator *targetIt = stHash_getIterator(targetSequences);
        char *targetHeader;
        while ((targetHeader = stHash_getNext(targetIt)) != NULL) {
            char *targetSeq = stHash_search(targetSequences, targetHeader);
            // Here we should try both the target sequence and its
            // reverse-complemented version
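            // A minimal sketch (not part of the original example) of what the
            // reverse-complement pass could look like; it assumes plain
            // upper-case A/C/G/T characters and maps anything else to N:
            //
            //     int64_t targetLength = strlen(targetSeq);
            //     char *revCompSeq = st_malloc(targetLength + 1);
            //     for (int64_t p = 0; p < targetLength; p++) {
            //         switch (targetSeq[targetLength - 1 - p]) {
            //             case 'A': revCompSeq[p] = 'T'; break;
            //             case 'C': revCompSeq[p] = 'G'; break;
            //             case 'G': revCompSeq[p] = 'C'; break;
            //             case 'T': revCompSeq[p] = 'A'; break;
            //             default:  revCompSeq[p] = 'N'; break;
            //         }
            //     }
            //     revCompSeq[targetLength] = '\0';
            //     // ...then repeat the alignment below with revCompSeq in
            //     // place of targetSeq, and free(revCompSeq) afterwards.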


            // Aligns the sequences.
            // If you have alignment constraints (anchors) you should
            // replace this with getAlignedPairsUsingAnchors.
            stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq,
                                                   querySeq, parameters,
                                                   true, true);
            // Takes into account the probability of aligning to a
            // gap, by transforming the posterior probability into the
            // AMAP objective function (see Schwartz & Pachter, 2007).
            alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq),
                                                 strlen(querySeq),
                                                 parameters->gapGamma);
            // I think this calculates the optimal ordered set of
            // alignments from the unordered set of aligned pairs;
            // I'm not completely sure.
            alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs,
                                                                     targetSeq,
                                                                     querySeq,
                                                                     // This parameter says that the minimum posterior probability we will accept has to be at least 0.9.
                                                                     0.9);

            // After this the "aligned pairs" data structure changes,
            // which is a little sketchy. It's just so that the
            // alignment can be printed properly.
            stList_mapReplace(alignedPairs, convertToAnchorPair, NULL);
            stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn);
            struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader,
                                                                                  0, strlen(targetSeq), strlen(querySeq), alignedPairs);
            // Output the cigar string
            cigarWrite(stdout, alignment, 0);

            stList_destruct(alignedPairs);
            destructPairwiseAlignment(alignment);
        }
        stHash_destructIterator(targetIt);
    }
    stHash_destructIterator(queryIt);

    // Clean up
    stHash_destruct(targetSequences);
    stHash_destruct(querySequences);

    pairwiseAlignmentBandingParameters_destruct(parameters);
    stateMachine_destruct(stateMachine);
}
Example #27
0
static void partialRecordRetrieval(CuTest *testCase) {
    setup();

    //Make some number of large records
    stList *records = stList_construct3(0, free);
    stList *recordSizes = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct);
    for (int32_t i = 0; i < 300; i++) {
        int32_t size = st_randomInt(0, 80);
        size = size * size * size; //Use cubic size distribution
        char *randomRecord = st_malloc(size * sizeof(char));
        for (int32_t j = 0; j < size; j++) {
            randomRecord[j] = (char) st_randomInt(0, 100);
        }
        stList_append(records, randomRecord);
        stList_append(recordSizes, stIntTuple_construct(1, size));
        CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, i));
        stKVDatabase_insertRecord(database, i, randomRecord, size * sizeof(char));
        CuAssertTrue(testCase, stKVDatabase_containsRecord(database, i));
        //st_uglyf("I am creating the record %i %i\n", i, size);
    }

    while (st_random() > 0.001) {
        int32_t recordKey = st_randomInt(0, stList_length(records));
        CuAssertTrue(testCase, stKVDatabase_containsRecord(database, recordKey));

        char *record = stList_get(records, recordKey);
        int32_t size = stIntTuple_getPosition(stList_get(recordSizes, recordKey), 0);

        //Get partial record
        int32_t start = size > 0 ? st_randomInt(0, size) : 0;
        int32_t partialSize = size - start > 0 ? st_randomInt(start, size) - start : 0;
        assert(start >= 0);
        assert(partialSize >= 0);
        assert(partialSize + start <= size);
        //st_uglyf("I am getting record %i %i %i %i\n", recordKey, start, partialSize, size);
        char *partialRecord = stKVDatabase_getPartialRecord(database, recordKey, start * sizeof(char),
                partialSize * sizeof(char), size * sizeof(char));

        //Check they are equivalent..
        for (int32_t i = 0; i < partialSize; i++) {
            if (record[start + i] != partialRecord[i]) {
                st_uglyf("There was a difference %i %i for record %i %i\n", record[start + i], partialRecord[i], i,
                        partialSize);
            }
            //CuAssertTrue(testCase, record[start + i] == partialRecord[i]);
        }

        //Check we can not get out of bounds.. (start less than zero)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, -1, 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get out of bounds.. (start at or beyond the end of the record)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, size, 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get out of bounds.. (total size is greater than the record length)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, 0, size + 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get a non-existent record
        stTry {
            stKVDatabase_getPartialRecord(database, 1000000, 0, size, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;
    }

    stList_destruct(records);
    stList_destruct(recordSizes);

    teardown();
}
Example #28
0
static void testBulkGetRecords(CuTest* testCase) {
    /*
     * Tests the new bulk get functions
     */
    setup();
    int64_t i = 100, j = 110, k = 120, l = 130;
    int64_t bigRecSize = 184500800;
    int64_t* m = (int64_t*)st_malloc(bigRecSize);

    int64_t ki = 4, kj = 5, kk = 3, kl = 1, km = 2;
    stKVDatabase_insertRecord(database, 1, &i, sizeof(int64_t));

    stList *requests = stList_construct3(0, (void(*)(void *)) stKVDatabaseBulkRequest_destruct);
    stList_append(requests, stKVDatabaseBulkRequest_constructInsertRequest(ki, &i, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructInsertRequest(kj, &j, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructSetRequest(kk, &k, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructUpdateRequest(kl, &l, sizeof(int64_t)));

    stKVDatabase_bulkSetRecords(database, requests);

    stList_destruct(requests);

    stKVDatabase_setRecord(database, km, m, bigRecSize);

    stList* keys = stList_construct2(5);
    stList_set(keys, 0, &ki);
    stList_set(keys, 1, &kj);
    stList_set(keys, 2, &kk);
    stList_set(keys, 3, &kl);
    stList_set(keys, 4, &km);

    stList* results = stKVDatabase_bulkGetRecords(database, keys);
    CuAssertTrue(testCase, stList_length(results) == 5);

    void* record;
    int64_t size;

    stKVDatabaseBulkResult* res0 = (stKVDatabaseBulkResult*)stList_get(results, 0);
    record = stKVDatabaseBulkResult_getRecord(res0, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == i && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res1 = (stKVDatabaseBulkResult*)stList_get(results, 1);
    record = stKVDatabaseBulkResult_getRecord(res1, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == j && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res2 = (stKVDatabaseBulkResult*)stList_get(results, 2);
    record = stKVDatabaseBulkResult_getRecord(res2, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == k && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res3 = (stKVDatabaseBulkResult*)stList_get(results, 3);
    record = stKVDatabaseBulkResult_getRecord(res3, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == l && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res4 = (stKVDatabaseBulkResult*)stList_get(results, 4);
    record = stKVDatabaseBulkResult_getRecord(res4, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, size == bigRecSize);

    stList_destruct(results);

    results = stKVDatabase_bulkGetRecordsRange(database, 1, 6);
    CuAssertTrue(testCase, stList_length(results) == 6);

    res0 = (stKVDatabaseBulkResult*)stList_get(results, 0);
    record = stKVDatabaseBulkResult_getRecord(res0, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == l && size == sizeof(int64_t));

    res1 = (stKVDatabaseBulkResult*)stList_get(results, 1);
    record = stKVDatabaseBulkResult_getRecord(res1, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, size == bigRecSize);

    res2 = (stKVDatabaseBulkResult*)stList_get(results, 2);
    record = stKVDatabaseBulkResult_getRecord(res2, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == k && size == sizeof(int64_t));

    res3 = (stKVDatabaseBulkResult*)stList_get(results, 3);
    record = stKVDatabaseBulkResult_getRecord(res3, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == i && size == sizeof(int64_t));

    res4 = (stKVDatabaseBulkResult*)stList_get(results, 4);
    record = stKVDatabaseBulkResult_getRecord(res4, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == j && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res5 = (stKVDatabaseBulkResult*)stList_get(results, 5);
    record = stKVDatabaseBulkResult_getRecord(res5, &size);
    CuAssertTrue(testCase, record == NULL);

    stList_destruct(results);
    free(m);
    teardown();
}
Example #29
0
int main(int argc, char *argv[]) {
    /*
     * Open the database.
     * Construct a flower.
     * Construct an event tree representing the species tree.
     * For each sequence construct two ends, each containing a cap.
     * Make a file for the sequence.
     * Link the two caps.
     * Finish!
     */

    int64_t key, j;
    Group *group;
    Flower_EndIterator *endIterator;
    End *end;
    bool makeEventHeadersAlphaNumeric = 0;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * speciesTree = NULL;
    char * outgroupEvents = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = {
                { "logLevel", required_argument, 0, 'a' },
                { "cactusDisk", required_argument, 0, 'b' },
                { "speciesTree", required_argument, 0, 'f' },
                { "outgroupEvents", required_argument, 0, 'g' },
                { "help", no_argument, 0, 'h' },
                { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' },
                { 0, 0, 0, 0 } };

        int option_index = 0;

        key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = optarg;
                break;
            case 'b':
                cactusDiskDatabaseString = optarg;
                break;
            case 'f':
                speciesTree = optarg;
                break;
            case 'g':
                outgroupEvents = optarg;
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                makeEventHeadersAlphaNumeric = 1;
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0);
    assert(cactusDiskDatabaseString != NULL);
    assert(speciesTree != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);

    for (j = optind; j < argc; j++) {
        st_logInfo("Sequence file/directory %s\n", argv[j]);
    }

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet || stKVDatabaseConf_getType(kvDatabaseConf)
            == stKVDatabaseTypeKyotoTycoon) {
        assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL);
        cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences");
    } else {
        cactusDisk = cactusDisk_construct(kvDatabaseConf, 1);
    }
    st_logInfo("Set up the flower disk\n");

    //////////////////////////////////////////////
    //Construct the flower
    //////////////////////////////////////////////

    if (cactusDisk_getFlower(cactusDisk, 0) != NULL) {
        cactusDisk_destruct(cactusDisk);
        st_logInfo("The first flower already exists\n");
        return 0;
    }
    flower = flower_construct2(0, cactusDisk);
    assert(flower_getName(flower) == 0);
    st_logInfo("Constructed the flower\n");

    //////////////////////////////////////////////
    //Construct the event tree
    //////////////////////////////////////////////

    st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree);
    stTree *tree = stTree_parseNewickString(speciesTree);
    st_logInfo("Parsed the tree\n");
    if (makeEventHeadersAlphaNumeric) {
        makeEventHeadersAlphaNumericFn(tree);
    }
    stTree_setBranchLength(tree, INT64_MAX);
    checkBranchLengthsAreDefined(tree);
    eventTree = eventTree_construct2(flower); //creates the event tree and the root event
    totalEventNumber = 1;
    st_logInfo("Constructed the basic event tree\n");

    // Construct a set of outgroup names so that ancestral outgroups
    // get recognized.
    stSet *outgroupNameSet = stSet_construct3(stHash_stringKey,
                                              stHash_stringEqualKey,
                                              free);
    if(outgroupEvents != NULL) {
        stList *outgroupNames = stString_split(outgroupEvents);
        for(int64_t i = 0; i < stList_length(outgroupNames); i++) {
            char *outgroupName = stList_get(outgroupNames, i);
            stSet_insert(outgroupNameSet, stString_copy(outgroupName));
        }
        stList_destruct(outgroupNames);
    }

    //now traverse the tree
    j = optind;
    assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree,
                             outgroupNameSet, argv, &j);

    char *eventTreeString = eventTree_makeNewickString(eventTree);
    st_logInfo(
            "Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n",
            totalSequenceNumber, totalEventNumber, eventTreeString);
    assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0);
    free(eventTreeString);
    //assert(0);

    //////////////////////////////////////////////
    //Label any outgroup events.
    //////////////////////////////////////////////

    if (outgroupEvents != NULL) {
        stList *outgroupEventsList = stString_split(outgroupEvents);
        for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) {
            char *outgroupEvent = makeEventHeadersAlphaNumeric ? makeAlphaNumeric(stList_get(outgroupEventsList, i)) : stString_copy(stList_get(outgroupEventsList, i));
            Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent);
            if (event == NULL) {
                st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent);
            }
            assert(!event_isOutgroup(event));
            event_setOutgroupStatus(event, 1);
            assert(event_isOutgroup(event));
            free(outgroupEvent);
        }
        stList_destruct(outgroupEventsList);
    }

    //////////////////////////////////////////////
    //Construct the terminal group.
    //////////////////////////////////////////////

    if (flower_getEndNumber(flower) > 0) {
        group = group_construct2(flower);
        endIterator = flower_getEndIterator(flower);
        while ((end = flower_getNextEnd(endIterator)) != NULL) {
            end_setGroup(end, group);
        }
        flower_destructEndIterator(endIterator);
        assert(group_isLeaf(group));

        // Create a one link chain if there is only one pair of attached ends..
        group_constructChainForLink(group);
        assert(!flower_builtBlocks(flower));
    } else {
        flower_setBuiltBlocks(flower, 1);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////

    //flower_check(flower);
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Cleanup.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);

    return 0; //Exiting without cleanup is quicker; enable the cleanup below when checking for memory leaks.

    stSet_destruct(outgroupNameSet);
    stTree_destruct(tree);
    stKVDatabaseConf_destruct(kvDatabaseConf);

    return 0;
}
static stList *splitMultipleStubCycle(stList *cycle,
        stList *nonZeroWeightAdjacencyEdges, stSortedSet *allAdjacencyEdges,
        stList *stubEdges, stList *chainEdges) {
    /*
     *  Takes a simple cycle containing k stub edges and splits it into k cycles, each containing 1 stub edge.
     */

    /*
     * Get sub-components containing only adjacency and chain edges.
     */

    stSortedSet *stubAndChainEdgesSet = getSetOfMergedLists(stubEdges,
            chainEdges);
    stList *adjacencyEdgeMatching =
            stList_filterToExclude(cycle, stubAndChainEdgesSet); //Filter out the non-adjacency edges
    //Consider only the chain edges present in the original component
    stList *stubFreePaths = getComponents2(adjacencyEdgeMatching, NULL,
            chainEdges);
    stList_destruct(adjacencyEdgeMatching);
    assert(stList_length(stubFreePaths) >= 1);

    stList *splitCycles = stList_construct3(0,
            (void(*)(void *)) stList_destruct); //The list to return.


    if (stList_length(stubFreePaths) > 1) {
        /*
         * Build the list of adjacency edges acceptable in the merge
         */
        stSortedSet *oddNodes = getOddNodes(cycle);
        stList *oddToEvenNonZeroWeightAdjacencyEdges =
                getOddToEvenAdjacencyEdges(oddNodes,
                        nonZeroWeightAdjacencyEdges);
        stSortedSet *oddToEvenAllAdjacencyEdges = getOddToEvenAdjacencyEdges2(oddNodes, allAdjacencyEdges);

        /*
         * Merge together the best two components.
         */
        stList *l = filterListsToExclude(stubFreePaths, stubAndChainEdgesSet);
        doBestMergeOfTwoSimpleCycles(l, oddToEvenNonZeroWeightAdjacencyEdges,
                oddToEvenAllAdjacencyEdges); //This is inplace.
        stList *l2 = stList_join(l);
        stList_destruct(l);
        l = getComponents2(l2, stubEdges, chainEdges);
        assert(stList_length(l) == 2);
        stList_destruct(l2);

        /*
         * Cleanup
         */
        stSortedSet_destruct(oddNodes);
        stList_destruct(oddToEvenNonZeroWeightAdjacencyEdges);
        stSortedSet_destruct(oddToEvenAllAdjacencyEdges);

        /*
         * Call procedure recursively.
         */
        for (int64_t i = 0; i < stList_length(l); i++) {
            /*
             * Split into adjacency edges, stub edges and chain edges.
             */
            stList *subCycle = stList_get(l, i);
            stList *subAdjacencyEdges;
            stList *subStubEdges;
            stList *subChainEdges;
            splitIntoAdjacenciesStubsAndChains(subCycle,
                    nonZeroWeightAdjacencyEdges, stubEdges, chainEdges,
                    &subAdjacencyEdges, &subStubEdges, &subChainEdges);

            /*
             * Call recursively.
             */
            l2 = splitMultipleStubCycle(subCycle, subAdjacencyEdges,
                    allAdjacencyEdges, subStubEdges, subChainEdges);
            stList_appendAll(splitCycles, l2);

            /*
             * Clean up
             */
            stList_setDestructor(l2, NULL);
            stList_destruct(l2);
            stList_destruct(subAdjacencyEdges);
            stList_destruct(subStubEdges);
            stList_destruct(subChainEdges);
        }
        stList_destruct(l);
    } else {
        stList_append(splitCycles, stList_copy(cycle, NULL));
    }

    stSortedSet_destruct(stubAndChainEdgesSet);
    stList_destruct(stubFreePaths);

    return splitCycles;
}