Esempio n. 1
0
/* Parse XML string into a hash.  This parses all attributes of all tags
 * into values.  st_kv_database_conf type is stored as conf_type,
 * database tag is stores as db_tag.  This does minimal error checking
 * and is really lame.
 */
static stHash *hackParseXmlString(const char *xmlString) {
    stHash *hash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free);
    char *toReplace[5] = { "</", "<", "/>", ">", "=" };
    char *cA = stString_replace(xmlString, toReplace[0], " "), *cA2;
    for (int64_t i = 1; i < 5; i++) {
        cA2 = stString_replace(cA, toReplace[i], " ");
        free(cA);
        cA = cA2;
    }
    getExpectedToken(&cA2, "st_kv_database_conf");
    stHash_insert(hash, stString_copy("conf_type"), getKeyValue(&cA2, "type"));
    stHash_insert(hash, stString_copy("db_tag"), getNextToken(&cA2));

    char *key;
    while (((key = getNextToken(&cA2)) != NULL) && !stString_eq(key, "st_kv_database_conf")) {
        char *value = getNextToken(&cA2);
        if (value == NULL) {
            stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "failed to to get value for key \"%s\"", key);
        }
        if (stHash_search(hash, key) != NULL) {
            stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got a duplicate entry in the database conf string \"%s\"", key);
        }
        stHash_insert(hash, key, value);
    }
    if(!stString_eq(key, "st_kv_database_conf")) {
        stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got an unexpected final entry \"%s\"", key);
    }
    free(key);
    free(cA);
    return hash;
}
static void test_addBlockToHash_3(CuTest *testCase) {
    // concatenation with 2 bases of interstitial and a sequence length breakpoint
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1      0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name3.chr1     0 13 +       100 GCAGCTGAAAACA\n",
                                                     observedList
                                                     );
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13  5 + 158545518 ACGTA\n"
                                                   "s name.chr1      12  5 +       100 gtcGG\n"
                                                   "s name2.chr1     10  5 +       100 ATGTg\n"
                                                   "s name3.chr1     50  5 +       100 CCCCC\n"
                                                   , 3);
    stHash *expectedHash = NULL;
    expectedHash = createBlockHashFromString("a score=0\n"
                                             "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
                                             "s name.chr1      0 17 +       100 ATGT---ATGCCGac----------gtcGG\n"
                                             "s name2.chr1     0 15 +       100 ATGT---ATGCCG------------ATGTg\n"
                                             "s name3          0 28 +        28 GCAGCTGAAAACA--NNNNNNNNNNCCCCC\n",
                                             expectedList
                                             );
    row_t *r = stHash_search(expectedHash, "name3");
    r->prevRightPos = 54;
    free(r->prevName);
    r->prevName = stString_copy("name3.chr1");
    r->multipleNames = true;
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgtc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
                               "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr1"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
Esempio n. 3
0
static char *tree_getNewickTreeStringP(stTree *tree) {
    char *cA, *cA2;
    if(stTree_getChildNumber(tree) > 0) {
        int32_t i;
        cA = stString_copy("(");
        for(i=0; i<stTree_getChildNumber(tree); i++) {
            cA2 = tree_getNewickTreeStringP(stTree_getChild(tree, i));
            char *cA3 = stString_print((i+1 < stTree_getChildNumber(tree) ? "%s%s," : "%s%s"), cA, cA2);
            free(cA);
            free(cA2);
            cA = cA3;
        }
        cA2 = stString_print("%s)", cA);
        free(cA);
        cA = cA2;
    }
    else {
        cA = stString_copy("");
    }
    if(stTree_getLabel(tree) != NULL) {
        cA2 = stString_print("%s%s", cA, stTree_getLabel(tree));
        free(cA);
        cA = cA2;
    }
    if(stTree_getBranchLength(tree) != INFINITY) {
        char *cA2 = stString_print("%s:%g", cA, stTree_getBranchLength(tree));
        free(cA);
        cA = cA2;
    }
    return cA;
}
static void test_addMafLineToRow_1(CuTest *testCase) {
    row_t *obs = newRow(20);
    obs->name = stString_copy("seq1.chr0");
    obs->prevName = stString_copy("seq1.amazing");
    obs->multipleNames = true;
    obs->start = 3;
    obs->length = 10;
    obs->prevRightPos = 20;
    obs->strand = '+';
    obs->prevStrand = '+';
    obs->sourceLength = 100;
    row_copyIn(obs, "acgtacgtac");
    mafLine_t *ml = maf_newMafLineFromString("s seq1.chr_bleh 13 5 + 100 ACGTA", 10);
    addMafLineToRow(obs, ml);
    row_t *exp = newRow(20);
    exp->name = stString_copy("seq1.chr0");
    exp->prevName = stString_copy("seq1.chr_bleh");
    exp->multipleNames = true;
    exp->start = 3;
    exp->length = 15;
    exp->prevRightPos = 17;
    exp->strand = '+';
    exp->prevStrand = '+';
    exp->sourceLength = 15;
    row_copyIn(exp, "acgtacgtacACGTA");
    CuAssertTrue(testCase, rowsAreEqual(obs, exp));
    destroyRow(obs);
    destroyRow(exp);
    maf_destroyMafLineList(ml);
}
Esempio n. 5
0
static stKVDatabaseConf *constructSql(stKVDatabaseType type, const char *host, unsigned port, const char *user, const char *password,
                                      const char *databaseName, const char *tableName) {
    stKVDatabaseConf *conf = stSafeCCalloc(sizeof(stKVDatabaseConf));
    conf->type = type;
    conf->host = stString_copy(host);
    conf->port = port;
    conf->user = stString_copy(user);
    conf->password = stString_copy(password);
    conf->databaseName = stString_copy(databaseName);
    conf->tableName = stString_copy(tableName);
    return conf;
}
static void test_addBlockToHash_2(CuTest *testCase) {
    // concatenation with 2 bases of interstitial AND a previously unobserved sequence
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1      0 10 +       100 ATGT---ATGCCG\n"
                                                     "s name2.chr1     0 10 +       100 ATGT---ATGCCG\n",
                                                     observedList);
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13  5 + 158545518 ACGTA\n"
                                                   "s name.chr1      12  5 +       100 gTcGG\n"
                                                   "s name2.chr1     10  5 +       100 ATGTg\n"
                                                   "s name3.chr@      0  5 +        20 aaccg\n"
                                                   , 3);
    stHash *expectedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 18 + 158545518 gcagctgaaaaca--ACGTA\n"
                                                     "s name.chr1      0 17 +       100 ATGT---ATGCCGacgTcGG\n"
                                                     "s name2.chr1     0 15 +       100 ATGT---ATGCCG--ATGTg\n"
                                                     "s name3.chr@     0  5 +        20 ---------------aaccg\n",
                                                     expectedList
                                                     );
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgTc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("aaccgTTTTTTTTTTTTTTT");
    stHash_insert(seqHash, stString_copy("name3.chr@"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
Esempio n. 7
0
stKVDatabaseConf *stKVDatabaseConf_constructKyotoTycoon(const char *host, unsigned port, int timeout,
														int64_t maxRecordSize, int64_t maxBulkSetSize,
														int64_t maxBulkSetNumRecords,
														const char *databaseDir, const char* databaseName) {
    stKVDatabaseConf *conf = stSafeCCalloc(sizeof(stKVDatabaseConf));
    conf->type = stKVDatabaseTypeKyotoTycoon;
    conf->databaseDir = stString_copy(databaseDir);
    conf->host = stString_copy(host);
    conf->port = port;
    conf->timeout = timeout;
    conf->maxKTRecordSize = maxRecordSize;
    conf->maxKTBulkSetSize = maxBulkSetSize;
    conf->maxKTBulkSetNumRecords = maxBulkSetNumRecords;
    conf->databaseName = stString_copy(databaseName);
    return conf;
}
Esempio n. 8
0
// Returns a hash mapping from sequence header to sequence data.
static stHash *readFastaFile(char *filename) {
    FILE *fasta = fopen(filename, "r");
    if (fasta == NULL) {
        st_errnoAbort("Could not open fasta file %s", filename);
    }
    stHash *headerToData = stHash_construct3(stHash_stringKey,
                                             stHash_stringEqualKey,
                                             free,
                                             free);
    struct List *seqs = constructEmptyList(0, NULL);
    struct List *seqLengths = constructEmptyList(0, free);
    struct List *headers = constructEmptyList(0, free);
    fastaRead(fasta, seqs, seqLengths, headers);

    for (int64_t i = 0; i < seqs->length; i++) {
        char *fullHeader = headers->list[i];
        stList *headerTokens = stString_splitByString(fullHeader, " ");
        char *usableHeader = stString_copy(stList_get(headerTokens, 0));
        stHash_insert(headerToData, usableHeader, seqs->list[i]);
        stList_destruct(headerTokens);
    }
    destructList(seqs);
    destructList(seqLengths);
    destructList(headers);

    return headerToData;
}
static stHash *createSeqHashFromString(char *name, char *input) {
    mtfseq_t *mtfs = newMtfseq(strlen(input));
    stHash *hash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, destroyMtfseq);
    seq_copyIn(mtfs, input);
    stHash_insert(hash, stString_copy(name), mtfs);
    return hash;
}
Esempio n. 10
0
char *makeAlphaNumeric(const char *string) {
    char *cA = stString_copy(string);
    int64_t j = 0;
    for (int64_t i = 0; i < strlen(string); i++) {
        if (isalpha(string[i]) || isdigit(string[i])) {
            cA[j++] = string[i];
        }
    }
    cA[j] = '\0';
    return cA;
}
Esempio n. 11
0
static int64_t *getInts(const char *string, int64_t *arrayLength) {
    int64_t *iA = st_malloc(sizeof(int64_t) * strlen(string));
    char *cA = stString_copy(string);
    char *cA2 = cA;
    char *cA3;
    *arrayLength = 0;
    while ((cA3 = stString_getNextWord(&cA)) != NULL) {
        int64_t i = sscanf(cA3, "%" PRIi64 "", &iA[(*arrayLength)++]);
        (void) i;
        assert(i == 1);
        free(cA3);
    }
    free(cA2);
    return iA;
}
Esempio n. 12
0
int main(int argc, char *argv[])
{
    char *cactusDiskString = NULL;
    stKVDatabaseConf *kvDatabaseConf;
    CactusDisk *cactusDisk;
    Flower *flower;
    Flower_SequenceIterator *flowerIt;
    Sequence *sequence;
    struct option longopts[] = { {"cactusDisk", required_argument, NULL, 'c' },
                                 {0, 0, 0, 0} };
    int flag;
    while((flag = getopt_long(argc, argv, "", longopts, NULL)) != -1) {
        switch(flag) {
        case 'c':
            cactusDiskString = stString_copy(optarg);
            break;
        case '?':
        default:
            usage();
            return 1;
        }
    }
    if (cactusDiskString == NULL) {
        st_errAbort("--cactusDisk option must be provided");
    }
    kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskString);
    cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    // Get top-level flower.
    flower = cactusDisk_getFlower(cactusDisk, 0);
    flowerIt = flower_getSequenceIterator(flower);
    while((sequence = flower_getNextSequence(flowerIt)) != NULL) {
        MetaSequence *metaSequence = sequence_getMetaSequence(sequence);
        const char *header;
        char *firstToken, *newHeader;
        stList *tokens;
        // Strip the ID token from the header (should be the first
        // |-separated token) and complain if there isn't one.
        header = metaSequence_getHeader(metaSequence);
        tokens = fastaDecodeHeader(header);
        assert(stList_length(tokens) > 1);
        firstToken = stList_removeFirst(tokens);
        assert(!strncmp(firstToken, "id=", 3));
        free(firstToken);
        newHeader = fastaEncodeHeader(tokens);
        metaSequence_setHeader(metaSequence, newHeader);
    }
    cactusDisk_write(cactusDisk);
}
Esempio n. 13
0
Event *event_construct(Name name, const char *header, float branchLength, Event *parentEvent,
        EventTree *eventTree) {
    assert(eventTree_getEvent(eventTree, name) == NULL); //the event must not already exist in the tree.
    Event *event;
    event = st_malloc(sizeof(Event));
    event->name = name;
    event->parent = parentEvent;
    event->children = constructEmptyList(0, NULL);
    event->header = stString_copy(header == NULL ? "" : header);
    event->branchLength = branchLength < 0.0 ? 0.0 : branchLength;
    event->isOutgroup = 0;
    if (parentEvent != NULL) {
        listAppend(parentEvent->children, event);
    }
    event->eventTree = eventTree;
    eventTree_addEvent(eventTree, event);
    return event;
}
Esempio n. 14
0
MetaSequence *metaSequence_construct2(Name name, int64_t start,
		int64_t length, Name stringName, const char *header,
		Name eventName, bool isTrivialSequence, CactusDisk *cactusDisk) {
	MetaSequence *metaSequence;

	metaSequence = st_malloc(sizeof(MetaSequence));
	metaSequence->name = name;
	assert(length >= 0);
	metaSequence->start = start;
	metaSequence->length = length;
	metaSequence->stringName = stringName;
	metaSequence->eventName = eventName;
	metaSequence->cactusDisk = cactusDisk;
	metaSequence->header = stString_copy(header != NULL ? header : "");
	metaSequence->isTrivialSequence = isTrivialSequence;

	cactusDisk_addMetaSequence(cactusDisk, metaSequence);
	return metaSequence;
}
char *getTerminalAdjacencySubString(Cap *cap) {
    if(getTerminalAdjacencyLength_ignoreAdjacencies) {
        return stString_copy("");
    }
    cap = getTerminalCap(cap);
    cap = cap_getStrand(cap) ? cap : cap_getReverse(cap); //This ensures the asserts are as expected.
    Cap *adjacentCap = cap_getAdjacency(cap);
    int64_t i = cap_getCoordinate(cap) - cap_getCoordinate(adjacentCap);
    assert(i != 0);
    if (i > 0) {
        assert(cap_getSide(cap));
        assert(!cap_getSide(adjacentCap));
        return sequence_getString(cap_getSequence(cap),
                cap_getCoordinate(adjacentCap) + 1, i - 1, 1);
    } else {
        assert(cap_getSide(adjacentCap));
        assert(!cap_getSide(cap));
        return sequence_getString(cap_getSequence(cap), cap_getCoordinate(cap) + 1, -i - 1, 1);
    }
}
Esempio n. 16
0
stKVDatabaseConf *stKVDatabaseConf_constructClone(stKVDatabaseConf *srcConf) {
    stKVDatabaseConf *conf = stSafeCCalloc(sizeof(stKVDatabaseConf));
    conf->type = srcConf->type;
    conf->databaseDir = stString_copy(srcConf->databaseDir);
    conf->host = stString_copy(srcConf->host);
    conf->port = srcConf->port;
    conf->timeout = srcConf->timeout;
    conf->maxKTRecordSize = srcConf->maxKTRecordSize;
    conf->maxKTBulkSetSize = srcConf->maxKTBulkSetSize;
    conf->maxKTBulkSetNumRecords = srcConf->maxKTBulkSetNumRecords;
    conf->user = stString_copy(srcConf->user);
    conf->password = stString_copy(srcConf->password);
    conf->databaseName = stString_copy(srcConf->databaseName);
    conf->tableName = stString_copy(srcConf->tableName);
    return conf;
}
Esempio n. 17
0
char *cactusDisk_getString(CactusDisk *cactusDisk, Name name, int64_t start, int64_t length, int64_t strand,
        int64_t totalSequenceLength) {
    /*
     * Gets a string from the database.
     *
     */
    assert(length >= 0);
    if (length == 0) {
        return stString_copy("");
    }
    //First try getting it from the cache
    char *string = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    if (string == NULL) { //If not in the cache, add it to the cache and then get it from the cache.
        stList *list = stList_construct3(0, (void (*)(void *)) substring_destruct);
        stList_append(list, substring_construct(name, start, length));
        cacheSubstringsFromDB(cactusDisk, list);
        stList_destruct(list);
        string = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    }
    assert(string != NULL);
    return string;
}
Esempio n. 18
0
/* connect to a database server */
static MySqlDb *connect(stKVDatabaseConf *conf) {
    MySqlDb *dbImpl = stSafeCCalloc(sizeof(MySqlDb));
    if ((dbImpl->conn = mysql_init(NULL)) == NULL) {
        disconnect(dbImpl);
        stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "mysql_init failed");
    }
    if (mysql_real_connect(dbImpl->conn, stKVDatabaseConf_getHost(conf), stKVDatabaseConf_getUser(conf), stKVDatabaseConf_getPassword(conf), stKVDatabaseConf_getDatabaseName(conf), stKVDatabaseConf_getPort(conf), NULL, 0) == NULL) {
        stExcept *ex = createMySqlExcept(dbImpl, "failed to connect to MySql database: %s on %s as user %s",
                                         stKVDatabaseConf_getDatabaseName(conf), stKVDatabaseConf_getHost(conf), stKVDatabaseConf_getUser(conf));
        disconnect(dbImpl);
        stThrow(ex);
    }
    dbImpl->table = stString_copy(stKVDatabaseConf_getTableName(conf));

    // disable report of notes, so only warnings and errors come back
    sqlExec(dbImpl, "set sql_notes=0");

    // set max sizes of an sql statment to 1G.  This must also be specified
    // for the server by adding "max_allowed_packet = 1G" to the [mysqld]
    // section of my.cnf
    sqlExec(dbImpl, "set global max_allowed_packet=1073741824");

    // set the idle timeout to a week
    int waitTimeout = 7 * 24 * 60 * 60;  // 1 week
    sqlExec(dbImpl, "set wait_timeout=%d", waitTimeout);

    // set the read timeout to an hour
    int readTimeout = 60 * 60;  // 1 hour
    sqlExec(dbImpl, "set net_read_timeout=%d", readTimeout);

    // NOTE: commit will not return an error, this does row-level locking on
    // the select done before the update
    sqlExec(dbImpl, "set autocommit = 0;");
    sqlExec(dbImpl, "set session transaction isolation level serializable;");
    return dbImpl;
}
Esempio n. 19
0
static CactusDisk *cactusDisk_constructPrivate(stKVDatabaseConf *conf, bool create, const char *sequencesFileName) {
    //sequencesFileName = NULL; //Disable the ability to store the sequences on disk.
    CactusDisk *cactusDisk = st_calloc(1, sizeof(CactusDisk));

    //construct lists of in memory objects
    cactusDisk->metaSequences = stSortedSet_construct3(cactusDisk_constructMetaSequencesP, NULL);
    cactusDisk->flowers = stSortedSet_construct3(cactusDisk_constructFlowersP, NULL);
    cactusDisk->flowerNamesMarkedForDeletion = stSortedSet_construct3((int (*)(const void *, const void *)) strcmp,
            free);
    cactusDisk->updateRequests = stList_construct3(0, (void (*)(void *)) stKVDatabaseBulkRequest_destruct);

    //Now open the database
    cactusDisk->database = stKVDatabase_construct(conf, create);
    cactusDisk->cache = stCache_construct();
    cactusDisk->stringCache = stCache_construct();

    //initialise the unique ids.
    int64_t seed = (clock() << 24) | (time(NULL) << 16) | (getpid() & 65535); //Likely to be unique
    st_logDebug("The cactus disk is seeding the random number generator with the value %" PRIi64 "\n", seed);
    st_randomSeed(seed);
    cactusDisk->uniqueNumber = 0;
    cactusDisk->maxUniqueNumber = 0;

    //Now load any stuff..
    if (containsRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY)) {
        if (create) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID, "Tried to create a cactus disk, but the cactus disk already exists");
        }
        if (sequencesFileName != NULL) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID,
                    "A sequences file name is specified, but the cactus disk is not being created");
        }
        void *record = getRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY, "cactus_disk parameters");
        void *record2 = record;
        cactusDisk_loadFromBinaryRepresentation(&record, cactusDisk, conf);
        free(record2);
    } else {
        assert(create);
        if (sequencesFileName == NULL) {
            cactusDisk->storeSequencesInAFile = 0;
            cactusDisk->sequencesFileName = NULL;
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
            cactusDisk->absSequencesFileName = NULL;
        } else {
            if (stKVDatabaseConf_getDir(conf) == NULL) {
                stThrowNew(CACTUS_DISK_EXCEPTION_ID,
                        "The database conf does not contain a directory in which the sequence file is to be found!\n");
            }
            cactusDisk->storeSequencesInAFile = 1;
            cactusDisk->sequencesFileName = stString_copy(sequencesFileName);
            cactusDisk->absSequencesFileName = stString_print("%s/%s", stKVDatabaseConf_getDir(conf),
                    cactusDisk->sequencesFileName);
            //Make sure the file exists
            cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "w");
            assert(cactusDisk->sequencesReadFileHandle != NULL);
            fclose(cactusDisk->sequencesReadFileHandle); //Flush it first time.
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
        }
    }

    return cactusDisk;
}
Esempio n. 20
0
int main(int argc, char *argv[]) {
    /*
     * Script for adding alignments to cactus tree.
     */
    int64_t startTime;
    stKVDatabaseConf *kvDatabaseConf;
    CactusDisk *cactusDisk;
    int key, k;

    bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL;
    stSet *outgroupThreads = NULL;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * alignmentsFile = NULL;
    char * constraintsFile = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * lastzArguments = "";
    int64_t minimumSequenceLengthForBlast = 1;

    //Parameters for annealing/melting rounds
    int64_t *annealingRounds = NULL;
    int64_t annealingRoundsLength = 0;
    int64_t *meltingRounds = NULL;
    int64_t meltingRoundsLength = 0;

    //Parameters for melting
    float maximumAdjacencyComponentSizeRatio = 10;
    int64_t blockTrim = 0;
    int64_t alignmentTrimLength = 0;
    int64_t *alignmentTrims = NULL;
    int64_t chainLengthForBigFlower = 1000000;
    int64_t longChain = 2;
    int64_t minLengthForChromosome = 1000000;
    float proportionOfUnalignedBasesForNewChromosome = 0.8;
    bool breakChainsAtReverseTandems = 1;
    int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX;
    bool realign = 0;
    char *realignArguments = "";
    bool removeRecoverableChains = false;
    bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL;
    int64_t maxRecoverableChainsIterations = 1;
    int64_t maxRecoverableChainLength = INT64_MAX;

    //Parameters for removing ancient homologies
    bool doPhylogeny = false;
    int64_t phylogenyNumTrees = 1;
    enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON;
    enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD;
    double breakpointScalingFactor = 1.0;
    bool phylogenySkipSingleCopyBlocks = 0;
    int64_t phylogenyMaxBaseDistance = 1000;
    int64_t phylogenyMaxBlockDistance = 100;
    bool phylogenyKeepSingleDegreeBlocks = 0;
    stList *phylogenyTreeBuildingMethods = stList_construct();
    enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING;
    stList_append(phylogenyTreeBuildingMethods, &defaultMethod);
    double phylogenyCostPerDupPerBase = 0.2;
    double phylogenyCostPerLossPerBase = 0.2;
    const char *debugFileName = NULL;
    const char *referenceEventHeader = NULL;
    double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0;
    int64_t numTreeBuildingThreads = 2;
    int64_t minimumBlockDegreeToCheckSupport = 10;
    double minimumBlockHomologySupport = 0.7;
    double nucleotideScalingFactor = 1.0;
    HomologyUnitType phylogenyHomologyUnitType = BLOCK;
    enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
    bool sortAlignments = false;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, {
                "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' },
                { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, {
                        "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim",
                        required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree",
                        required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, {
                        "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, {
                        "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio",
                        required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome",
                        required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' },
                        { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' },
                        { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' },
                        { "phylogenyNumTrees", required_argument, 0, 'D' },
                        { "phylogenyRootingMethod", required_argument, 0, 'E' },
                        { "phylogenyScoringMethod", required_argument, 0, 'F' },
                        { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' },
                        { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' },
                        { "phylogenyMaxBaseDistance", required_argument, 0, 'I' },
                        { "phylogenyMaxBlockDistance", required_argument, 0, 'J' },
                        { "phylogenyDebugFile", required_argument, 0, 'K' },
                        { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' },
                        { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' },
                        { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' },
                        { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' },
                        { "referenceEventHeader", required_argument, 0, 'P' },
                        { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' },
                        { "numTreeBuildingThreads", required_argument, 0, 'R' },
                        { "phylogeny", no_argument, 0, 'S' },
                        { "minimumBlockHomologySupport", required_argument, 0, 'T' },
                        { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' },
                        { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' },
                        { "removeRecoverableChains", required_argument, 0, 'W' },
                        { "minimumNumberOfSpecies", required_argument, 0, 'X' },
                        { "phylogenyHomologyUnitType", required_argument, 0, 'Y' },
                        { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' },
                        { "maxRecoverableChainsIterations", required_argument, 0, '1' },
                        { "maxRecoverableChainLength", required_argument, 0, '2' },
                        { 0, 0, 0, 0 } };

        int option_index = 0;

        key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                st_setLogLevelFromString(logLevelString);
                break;
            case 'b':
                alignmentsFile = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                lastzArguments = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                annealingRounds = getInts(optarg, &annealingRoundsLength);
                break;
            case 'o':
                meltingRounds = getInts(optarg, &meltingRoundsLength);
                break;
            case 'k':
                alignmentTrims = getInts(optarg, &alignmentTrimLength);
                break;
            case 'm':
                k = sscanf(optarg, "%f", &minimumTreeCoverage);
                assert(k == 1);
                break;
            case 'n':
                k = sscanf(optarg, "%" PRIi64 "", &blockTrim);
                assert(k == 1);
                break;
            case 'p':
                k = sscanf(optarg, "%" PRIi64 "", &minimumDegree);
                assert(k == 1);
                break;
            case 'q':
                k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree);
                assert(k == 1);
                break;
            case 'r':
                k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree);
                assert(k == 1);
                break;
            case 't':
                if (strcmp(optarg, "singleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByOutgroup;
                } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByOutgroup;
                } else if (strcmp(optarg, "singleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByRepeatSpecies;
                } else if (strcmp(optarg, "relaxedSingleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByRepeatSpecies;
                } else if (strcmp(optarg, "singleCopyChr") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyChr;
                } else if (strcmp(optarg, "singleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyIngroup;
                } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedSingleCopyIngroup;
                } else if (strcmp(optarg, "none") == 0) {
                    sortAlignments = false;
                    filterFn = NULL;
                } else {
                    st_errAbort("Could not recognize alignmentFilter option %s", optarg);
                }
                break;
            case 'v':
                k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast);
                assert(k == 1);
                break;
            case 'w':
                k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio);
                assert(k == 1);
                break;
            case 'x':
                constraintsFile = stString_copy(optarg);
                break;
            case 'y':
                k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome);
                assert(k == 1);
                break;
            case 'z':
                k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome);
                assert(k == 1);
                break;
            case 'A':
                k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds);
                assert(k == 1);
                break;
            case 'B':
                realign = 1;
                break;
            case 'C':
                realignArguments = stString_copy(optarg);
                break;
            case 'D':
                k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees);
                assert(k == 1);
                break;
            case 'E':
                if (!strcmp(optarg, "outgroupBranch")) {
                    phylogenyRootingMethod = OUTGROUP_BRANCH;
                } else if (!strcmp(optarg, "longestBranch")) {
                    phylogenyRootingMethod = LONGEST_BRANCH;
                } else if (!strcmp(optarg, "bestRecon")) {
                    phylogenyRootingMethod = BEST_RECON;
                } else {
                    st_errAbort("Invalid tree rooting method: %s", optarg);
                }
                break;
            case 'F':
                if (!strcmp(optarg, "reconCost")) {
                    phylogenyScoringMethod = RECON_COST;
                } else if (!strcmp(optarg, "nucLikelihood")) {
                    phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD;
                } else if (!strcmp(optarg, "reconLikelihood")) {
                    phylogenyScoringMethod = RECON_LIKELIHOOD;
                } else if (!strcmp(optarg, "combinedLikelihood")) {
                    phylogenyScoringMethod = COMBINED_LIKELIHOOD;
                } else {
                    st_errAbort("Invalid tree scoring method: %s", optarg);
                }
                break;
            case 'G':
                k = sscanf(optarg, "%lf", &breakpointScalingFactor);
                assert(k == 1);
                break;
            case 'H':
                phylogenySkipSingleCopyBlocks = true;
                break;
            case 'I':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance);
                assert(k == 1);
                break;
            case 'J':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance);
                assert(k == 1);
                break;
            case 'K':
                debugFileName = stString_copy(optarg);
                break;
            case 'L':
                phylogenyKeepSingleDegreeBlocks = true;
                break;
            case 'M':
                // clear the default setting of the list
                stList_destruct(phylogenyTreeBuildingMethods);
                phylogenyTreeBuildingMethods = stList_construct();
                stList *methodStrings = stString_splitByString(optarg, ",");

                for (int64_t i = 0; i < stList_length(methodStrings); i++) {
                    char *methodString = stList_get(methodStrings, i);
                    enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod));
                    if (strcmp(methodString, "neighborJoining") == 0) {
                        *method = NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "guidedNeighborJoining") == 0) {
                        *method = GUIDED_NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "splitDecomposition") == 0) {
                        *method = SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "strictSplitDecomposition") == 0) {
                        *method = STRICT_SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "removeBadChains") == 0) {
                        *method = REMOVE_BAD_CHAINS;
                    } else {
                        st_errAbort("Unknown tree building method: %s", methodString);
                    }
                    stList_append(phylogenyTreeBuildingMethods, method);
                }
                stList_destruct(methodStrings);
                break;
            case 'N':
                k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase);
                assert(k == 1);
                break;
            case 'O':
                k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase);
                assert(k == 1);
                break;
            case 'P':
                referenceEventHeader = stString_copy(optarg);
                break;
            case 'Q':
                k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce);
                assert(k == 1);
                break;
            case 'R':
                k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads);
                assert(k == 1);
                break;
            case 'S':
                doPhylogeny = true;
                break;
            case 'T':
                k = sscanf(optarg, "%lf", &minimumBlockHomologySupport);
                assert(k == 1);
                assert(minimumBlockHomologySupport <= 1.0);
                assert(minimumBlockHomologySupport >= 0.0);
                break;
            case 'U':
                k = sscanf(optarg, "%lf", &nucleotideScalingFactor);
                assert(k == 1);
                break;
            case 'V':
                k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport);
                assert(k == 1);
                break;
            case 'W':
                if (strcmp(optarg, "1") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = NULL;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup;
                } else if (strcmp(optarg, "0") == 0) {
                    removeRecoverableChains = false;
                } else {
                    st_errAbort("Could not parse removeRecoverableChains argument");
                }
                break;
            case 'X':
                k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies);
                if (k != 1) {
                    st_errAbort("Error parsing the minimumNumberOfSpecies argument");
                }
                break;
            case 'Y':
                if (strcmp(optarg, "chain") == 0) {
                    phylogenyHomologyUnitType = CHAIN;
                } else if (strcmp(optarg, "block") == 0) {
                    phylogenyHomologyUnitType = BLOCK;
                } else {
                    st_errAbort("Could not parse the phylogenyHomologyUnitType argument");
                }
                break;
            case 'Z':
                if (strcmp(optarg, "jukesCantor") == 0) {
                    phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
                } else if (strcmp(optarg, "none") == 0 ) {
                    phylogenyDistanceCorrectionMethod = NONE;
                } else {
                    st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument");
                }
                break;
            case '1':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainsIterations argument");
                }
                break;
            case '2':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainLength argument");
                }
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);
    assert(minimumTreeCoverage >= 0.0);
    assert(minimumTreeCoverage <= 1.0);
    assert(blockTrim >= 0);
    assert(annealingRoundsLength >= 0);
    for (int64_t i = 0; i < annealingRoundsLength; i++) {
        assert(annealingRounds[i] >= 0);
    }
    assert(meltingRoundsLength >= 0);
    for (int64_t i = 1; i < meltingRoundsLength; i++) {
        assert(meltingRounds[i - 1] < meltingRounds[i]);
        assert(meltingRounds[i - 1] >= 1);
    }
    assert(alignmentTrimLength >= 0);
    for (int64_t i = 0; i < alignmentTrimLength; i++) {
        assert(alignmentTrims[i] >= 0);
    }
    assert(minimumOutgroupDegree >= 0);
    assert(minimumIngroupDegree >= 0);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    st_logInfo("Set up the flower disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Sort the constraints
    ///////////////////////////////////////////////////////////////////////////

    stPinchIterator *pinchIteratorForConstraints = NULL;
    if (constraintsFile != NULL) {
        pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile);
        st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Do the alignment
    ///////////////////////////////////////////////////////////////////////////

    startTime = time(NULL);

    stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk);
    if (alignmentsFile == NULL) {
        cactusDisk_preCacheStrings(cactusDisk, flowers);
    }
    char *tempFile1 = NULL;
    for (int64_t i = 0; i < stList_length(flowers); i++) {
        flower = stList_get(flowers, i);
        if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks
            st_logDebug("Processing flower: %lli\n", flower_getName(flower));

            stCaf_setFlowerForAlignmentFiltering(flower);

            //Set up the graph and add the initial alignments
            stPinchThreadSet *threadSet = stCaf_setup(flower);

            //Build the set of outgroup threads
            outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet);

            //Setup the alignments
            stPinchIterator *pinchIterator;
            stList *alignmentsList = NULL;
            if (alignmentsFile != NULL) {
                assert(i == 0);
                assert(stList_length(flowers) == 1);
                if (sortAlignments) {
                    tempFile1 = getTempFile();
                    stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1);
                    pinchIterator = stPinchIterator_constructFromFile(tempFile1);
                } else {
                    pinchIterator = stPinchIterator_constructFromFile(alignmentsFile);
                }
            } else {
                if (tempFile1 == NULL) {
                    tempFile1 = getTempFile();
                }
                alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1);
                if (sortAlignments) {
                    stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList);
                }
                st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList));
                pinchIterator = stPinchIterator_constructFromList(alignmentsList);
            }

            for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) {
                int64_t minimumChainLength = annealingRounds[annealingRound];
                int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0;
                st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim);
                stPinchIterator_setTrim(pinchIterator, alignmentTrim);

                //Add back in the constraints
                if (pinchIteratorForConstraints != NULL) {
                    stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn);
                }

                //Do the annealing
                if (annealingRound == 0) {
                    stCaf_anneal(threadSet, pinchIterator, filterFn);
                } else {
                    stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn);
                }

                // Dump the block degree and length distribution to a file
                if (debugFileName != NULL) {
                    dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName));
                }

                printf("Sequence graph statistics after annealing:\n");
                printThreadSetStatistics(threadSet, flower, stdout);

                // Check for poorly-supported blocks--those that have
                // been transitively aligned together but with very
                // few homologies supporting the transitive
                // alignment. These "megablocks" can snarl up the
                // graph so that a lot of extra gets thrown away in
                // the first melting step.
                stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet);
                stPinchBlock *block;
                while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) {
                    if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) {
                        uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block);
                        uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower);
                        double support = ((double) supportingHomologies) / possibleSupportingHomologies;
                        if (support < minimumBlockHomologySupport) {
                            fprintf(stdout, "Destroyed a megablock with degree %" PRIi64
                                    " and %" PRIi64 " supporting homologies out of a maximum "
                                    "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block),
                                    supportingHomologies, possibleSupportingHomologies, support);
                            stPinchBlock_destruct(block);
                        }
                    }
                }

                //Do the melting rounds
                for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) {
                    int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound];
                    st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound);
                    if (minimumChainLengthForMeltingRound >= minimumChainLength) {
                        break;
                    }
                    stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX);
                } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength);
                stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds);
                //This does the filtering of blocks that do not have the required species/tree-coverage/degree.
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            }

            if (removeRecoverableChains) {
                stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
            }
            if (debugFileName != NULL) {
                dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName));
            }

            printf("Sequence graph statistics after melting:\n");
            printThreadSetStatistics(threadSet, flower, stdout);

            // Build a tree for each block, then use each tree to
            // partition the homologies between the ingroups sequences
            // into those that occur before the speciation with the
            // outgroup and those which occur late.

            if (stSet_size(outgroupThreads) > 0 && doPhylogeny) {
                st_logDebug("Starting to build trees and partition ingroup homologies\n");
                stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet);
                st_logDebug("Got sets of thread strings and set of threads that are outgroups\n");
                stCaf_PhylogenyParameters params;
                params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod;
                params.treeBuildingMethods = phylogenyTreeBuildingMethods;
                params.rootingMethod = phylogenyRootingMethod;
                params.scoringMethod = phylogenyScoringMethod;
                params.breakpointScalingFactor = breakpointScalingFactor;
                params.nucleotideScalingFactor = nucleotideScalingFactor;
                params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks;
                params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks;
                params.costPerDupPerBase = phylogenyCostPerDupPerBase;
                params.costPerLossPerBase = phylogenyCostPerLossPerBase;
                params.maxBaseDistance = phylogenyMaxBaseDistance;
                params.maxBlockDistance = phylogenyMaxBlockDistance;
                params.numTrees = phylogenyNumTrees;
                params.ignoreUnalignedBases = 1;
                params.onlyIncludeCompleteFeatureBlocks = 0;
                params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce;
                params.numTreeBuildingThreads = numTreeBuildingThreads;

                assert(params.numTreeBuildingThreads >= 1);

                stCaf_buildTreesToRemoveAncientHomologies(
                    threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, &params,
                    debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader);
                stHash_destruct(threadStrings);
                st_logDebug("Finished building trees\n");

                if (removeRecoverableChains) {
                    // We melt recoverable chains after splitting, as
                    // well as before, to alleviate coverage loss
                    // caused by bad splits.
                    stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
                }

                // Enforce the block constraints on minimum degree,
                // etc. after splitting.
                stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX);
            }

            //Sort out case when we allow blocks of degree 1
            if (minimumDegree < 2) {
                st_logDebug("Creating degree 1 blocks\n");
                stCaf_makeDegreeOneBlocks(threadSet);
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components
                st_logDebug("Breaking up components greedily\n");
                stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio);
            }

            //Finish up
            stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome,
                    proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point.
            st_logInfo("Ran the cactus core script\n");

            //Cleanup
            stPinchThreadSet_destruct(threadSet);
            stPinchIterator_destruct(pinchIterator);
            stSet_destruct(outgroupThreads);

            if (alignmentsList != NULL) {
                stList_destruct(alignmentsList);
            }
            st_logInfo("Cleaned up from main loop\n");
        } else {
            st_logInfo("We've already built blocks / alignments for this flower\n");
        }
    }
    stList_destruct(flowers);
    if (tempFile1 != NULL) {
        st_system("rm %s", tempFile1);
    }

    if (constraintsFile != NULL) {
        stPinchIterator_destruct(pinchIteratorForConstraints);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////
    st_logDebug("Writing the flowers to disk\n");
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime);

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);
}
//============================== MAIN =========================================
int main(int argc, char *argv[]) {
   Flower *flower;

   /*
    * Arguments/options
    */
   char * st_logLevelString = NULL;
   char * cactusDiskDatabaseString = NULL;
   char * flowerName = "0";
   char * outputFile = NULL;
   char * species = NULL;
   char * geneFile = NULL;

   ///////////////////////////////////////////////////////////////////////////
   // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
   ///////////////////////////////////////////////////////////////////////////

   while(1) {
      static struct option long_options[] = {
         { "genePslFile", required_argument, 0, 'g' },
         { "species", required_argument, 0, 's' },
         { "st_logLevel", required_argument, 0, 'a' },
         { "cactusDisk", required_argument, 0, 'c' },
         { "outputFile", required_argument, 0, 'o' },
         { "help", no_argument, 0, 'h' },
         { 0, 0, 0, 0 }
      };

      int option_index = 0;

      int key = getopt_long(argc, argv, "s:g:o:a:c:h", long_options, &option_index);

      if(key == -1) {
         break;
      }

      switch(key) {
         case 'a':
            st_logLevelString = stString_copy(optarg);
            break;
         case 'c':
            cactusDiskDatabaseString = stString_copy(optarg);
            break;
         case 'o':
            outputFile = stString_copy(optarg);
            break;
         case 's':
            species = stString_copy(optarg);
            break;
         case 'g':
            geneFile = stString_copy(optarg);
            break;
         case 'h':
            usage();
            return 0;
         default:
            usage();
            return 1;
      }
   }

   ///////////////////////////////////////////////////////////////////////////
   // (0) Check the inputs.
   ///////////////////////////////////////////////////////////////////////////

   assert(cactusDiskDatabaseString != NULL);
   assert(outputFile != NULL);
   assert(species != NULL);
   assert(geneFile != NULL);

   //////////////////////////////////////////////
   //Set up st_logging
   //////////////////////////////////////////////

   st_setLogLevelFromString(st_logLevelString);

   //////////////////////////////////////////////
   //Log (some of) the inputs
   //////////////////////////////////////////////

   st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);
   st_logInfo("Output file : %s\n", outputFile);
   st_logInfo("Species: %s\n", species);
   st_logInfo("GenePslFile: %s\n", geneFile);

   //////////////////////////////////////////////
   //Load the database
   //////////////////////////////////////////////

   stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
   CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
   st_logInfo("Set up the flower disk\n");

   ///////////////////////////////////////////////////////////////////////////
   // Parse the basic reconstruction problem
   ///////////////////////////////////////////////////////////////////////////
   flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName(flowerName));
   st_logInfo("Parsed the top level flower of the cactus tree to check\n");

   ///////////////////////////////////////////////////////////////////////////
   // Recursive check the flowers.
   ///////////////////////////////////////////////////////////////////////////

   int64_t startTime = time(NULL);
   FILE *fileHandle = fopen(outputFile, "w");
   struct bed *gene = bedLoadAll(geneFile);
   mapGenes(flower, fileHandle, gene, species);
   fclose(fileHandle);
   st_logInfo("Map genes in %" PRIi64 " seconds/\n", time(NULL) - startTime);

   ///////////////////////////////////////////////////////////////////////////
   // Clean up.
   ///////////////////////////////////////////////////////////////////////////

   cactusDisk_destruct(cactusDisk);

   return 0;
}
Esempio n. 22
0
void stTree_setLabel(stTree *tree, const char *label) {
    if(tree->label != NULL) {
        free(tree->label);
    }
    tree->label = label == NULL ? NULL : stString_copy(label);
}
struct bed *bedLoadAll(char *fileName){
    struct bed *list = NULL;
    struct bed *currbed = NULL;
    struct bed *prevbed = NULL;

    FILE *fp;
    char *line = st_malloc(sizeof(char)*LINEMAXSIZE);
    char *lend;
    struct List *lineList;

    fp = fopen(fileName, "r");
    assert(fp != NULL);
    
    while( fgets(line, LINEMAXSIZE, fp) != NULL ){//each line
        //catch if not enough buffer
        if ( (lend = strstr(line, "\n")) == NULL ){
            fprintf(stderr, "Input line is too long for buffer: \n*%s*\n", line);
	    exit(0);
        }else{//trim \n
            *lend = '\0';
        }
 
	//skip header line
	if(strstr(line, "track") == line){ continue; }

        lineList = splitString(line, "\t");

	//empty lines, skip
	if(lineList->length == 0){ continue; }

        if(lineList->length < 12){
	    fprintf(stderr, "Wrong input format. Need at least 12 fields for each bed\n");
	    exit(0);
        }else{
	    currbed = constructbed(); 
            currbed->chrom = stString_copy(lineList->list[0]);
            assert( sscanf(lineList->list[1], "%" PRIi64, &(currbed->chromStart)) == 1);
            assert( sscanf(lineList->list[2], "%" PRIi64, &(currbed->chromEnd)) == 1);
            currbed->name = stString_copy(lineList->list[3]);
            assert( sscanf(lineList->list[4], "%d", &(currbed->score)) == 1);
            currbed->strand = stString_copy(lineList->list[5]);
            assert( sscanf(lineList->list[6], "%" PRIi64, &(currbed->thickStart)) == 1);
            assert( sscanf(lineList->list[7], "%" PRIi64, &(currbed->thickEnd)) == 1);
            assert( sscanf(lineList->list[8], "%" PRIi64, &(currbed->itemRgb)) == 1);
            assert( sscanf(lineList->list[9], "%" PRIi64, &(currbed->blockCount)) == 1);
            currbed->blockSizes = splitIntString(stString_copy(lineList->list[10]), ",");
	    assert(currbed->blockSizes->length == currbed->blockCount);
            currbed->chromStarts = splitIntString(stString_copy(lineList->list[11]), ",");
	    assert(currbed->chromStarts->length == currbed->blockCount);
            //destructList(lineList);
	    if(list == NULL){ 
	        list = currbed; 
		prevbed = currbed;
	    }else{
                prevbed->next = currbed;
		prevbed = currbed;
	    }
	}
        st_logInfo("Gene: %s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\n", 
                    currbed->chrom, currbed->chromStart, currbed->chromEnd, currbed->name, 
                    currbed->score, currbed->strand, currbed->thickStart, currbed->thickEnd, 
                    currbed->itemRgb, currbed->blockCount);
    }

    fclose(fp);
    return list;
}
Esempio n. 24
0
int main(int argc, char *argv[]) {
    /*
     * Script for adding a reference genome to a flower.
     */

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * cactusDiskDatabaseString = NULL;
    char *referenceEventString =
            (char *) cactusMisc_getDefaultReferenceEventHeader();
    char *outputFile = NULL;
    Name flowerName = NULL_NAME;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel",
                required_argument, 0, 'a' }, { "cactusDisk", required_argument,
                0, 'c' },  { "flowerName", required_argument,
                        0, 'd' },
                { "referenceEventString", required_argument, 0, 'g' }, {
                        "help", no_argument, 0, 'h' }, { "outputFile",
                        required_argument, 0, 'k' },
                { 0, 0, 0, 0 } };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:c:d:g:hk:", long_options,
                &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                flowerName = cactusMisc_stringToName(optarg);
                break;
            case 'g':
                referenceEventString = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'k':
                outputFile = stString_copy(optarg);
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(
            cactusDiskDatabaseString);
    CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    stKVDatabaseConf_destruct(kvDatabaseConf);
    st_logInfo("Set up the flower disk\n");


    ///////////////////////////////////////////////////////////////////////////
    // Get the set of flowers to manipulate
    ///////////////////////////////////////////////////////////////////////////

    Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName);

    ///////////////////////////////////////////////////////////////////////////
    // Get the reference event name
    ///////////////////////////////////////////////////////////////////////////

    Event *referenceEvent = eventTree_getEventByHeader(
            flower_getEventTree(flower), referenceEventString);
    assert(referenceEvent != NULL);
    Name referenceEventName = event_getName(referenceEvent);

    ///////////////////////////////////////////////////////////////////////////
    // Now process each flower in turn.
    ///////////////////////////////////////////////////////////////////////////
    
    if(outputFile == NULL) {
        st_errAbort("No output file specified\n");
    }
    FILE *fileHandle = fopen(outputFile, "w");
    printFastaSequences(flower, fileHandle, referenceEventName);
    if(fileHandle != NULL) {
        fclose(fileHandle);
    }

    ///////////////////////////////////////////////////////////////////////////
    //Clean up memory
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);

    //return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection.

    free(cactusDiskDatabaseString);
    free(referenceEventString);
    free(logLevelString);

    st_logInfo("Cleaned stuff up and am finished\n");
    //while(1);
    return 0;
}
static char *terminalAdjacencyWriteFn(Cap *cap) {
    return stString_copy("");
}
int main(int argc, char *argv[]) {
    Flower *flower;
    FILE *fileHandle;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * flowerName = NULL;
    char * outputFile = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel",
                required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0,
                'c' }, { "flowerName", required_argument, 0, 'd' }, {
                "outputFile", required_argument, 0, 'e' }, { "scaleNodeSizes",
                no_argument, 0, 'f' }, { "nameLabels", no_argument, 0, 'g' }, {
                "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:c:d:e:fgh", long_options,
                &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                flowerName = stString_copy(optarg);
                break;
            case 'e':
                outputFile = stString_copy(optarg);
                break;
            case 'f':
                scaleNodeSizes = !scaleNodeSizes;
                break;
            case 'g':
                nameLabels = !nameLabels;
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);
    assert(flowerName != NULL);
    assert(outputFile != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower name : %s\n", flowerName);
    st_logInfo("Output graph file : %s\n", outputFile);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    st_logInfo("Set up the flower disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Parse the basic reconstruction problem
    ///////////////////////////////////////////////////////////////////////////

    flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName(flowerName));
    st_logInfo("Parsed the top level flower of the cactus tree to build\n");

    ///////////////////////////////////////////////////////////////////////////
    // Build the graph.
    ///////////////////////////////////////////////////////////////////////////

    totalProblemSize = flower_getTotalBaseLength(flower);
    fileHandle = fopen(outputFile, "w");
    graphViz_setupGraphFile(fileHandle);
    makeCactusTree_flower(flower, fileHandle, NULL, NULL);
    graphViz_finishGraphFile(fileHandle);
    fclose(fileHandle);
    st_logInfo("Written the tree to file\n");

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);
    stKVDatabaseConf_destruct(kvDatabaseConf);

    return 0;
}
Esempio n. 27
0
int main(int argc, char *argv[]) {
    /*
     * Open the database.
     * Construct a flower.
     * Construct an event tree representing the species tree.
     * For each sequence contruct two ends each containing an cap.
     * Make a file for the sequence.
     * Link the two caps.
     * Finish!
     */

    int64_t key, j;
    Group *group;
    Flower_EndIterator *endIterator;
    End *end;
    bool makeEventHeadersAlphaNumeric = 0;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * speciesTree = NULL;
    char * outgroupEvents = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, {
                "speciesTree", required_argument, 0, 'f' }, { "outgroupEvents", required_argument, 0, 'g' },
                { "help", no_argument, 0, 'h' }, { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' }, { 0, 0, 0, 0 } };

        int option_index = 0;

        key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = optarg;
                break;
            case 'b':
                cactusDiskDatabaseString = optarg;
                break;
            case 'f':
                speciesTree = optarg;
                break;
            case 'g':
                outgroupEvents = optarg;
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                makeEventHeadersAlphaNumeric = 1;
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0);
    assert(cactusDiskDatabaseString != NULL);
    assert(speciesTree != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);

    for (j = optind; j < argc; j++) {
        st_logInfo("Sequence file/directory %s\n", argv[j]);
    }

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet || stKVDatabaseConf_getType(kvDatabaseConf)
            == stKVDatabaseTypeKyotoTycoon) {
        assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL);
        cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences");
    } else {
        cactusDisk = cactusDisk_construct(kvDatabaseConf, 1);
    }
    st_logInfo("Set up the flower disk\n");

    //////////////////////////////////////////////
    //Construct the flower
    //////////////////////////////////////////////

    if (cactusDisk_getFlower(cactusDisk, 0) != NULL) {
        cactusDisk_destruct(cactusDisk);
        st_logInfo("The first flower already exists\n");
        return 0;
    }
    flower = flower_construct2(0, cactusDisk);
    assert(flower_getName(flower) == 0);
    st_logInfo("Constructed the flower\n");

    //////////////////////////////////////////////
    //Construct the event tree
    //////////////////////////////////////////////

    st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree);
    stTree *tree = stTree_parseNewickString(speciesTree);
    st_logInfo("Parsed the tree\n");
    if (makeEventHeadersAlphaNumeric) {
        makeEventHeadersAlphaNumericFn(tree);
    }
    stTree_setBranchLength(tree, INT64_MAX);
    checkBranchLengthsAreDefined(tree);
    eventTree = eventTree_construct2(flower); //creates the event tree and the root even
    totalEventNumber = 1;
    st_logInfo("Constructed the basic event tree\n");

    // Construct a set of outgroup names so that ancestral outgroups
    // get recognized.
    stSet *outgroupNameSet = stSet_construct3(stHash_stringKey,
                                              stHash_stringEqualKey,
                                              free);
    if(outgroupEvents != NULL) {
        stList *outgroupNames = stString_split(outgroupEvents);
        for(int64_t i = 0; i < stList_length(outgroupNames); i++) {
            char *outgroupName = stList_get(outgroupNames, i);
            stSet_insert(outgroupNameSet, stString_copy(outgroupName));
        }
        stList_destruct(outgroupNames);
    }

    //now traverse the tree
    j = optind;
    assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree,
                             outgroupNameSet, argv, &j);

    char *eventTreeString = eventTree_makeNewickString(eventTree);
    st_logInfo(
            "Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n",
            totalSequenceNumber, totalEventNumber, eventTreeString);
    assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0);
    free(eventTreeString);
    //assert(0);

    //////////////////////////////////////////////
    //Label any outgroup events.
    //////////////////////////////////////////////

    if (outgroupEvents != NULL) {
        stList *outgroupEventsList = stString_split(outgroupEvents);
        for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) {
            char *outgroupEvent = makeEventHeadersAlphaNumeric ? makeAlphaNumeric(stList_get(outgroupEventsList, i)) : stString_copy(stList_get(outgroupEventsList, i));
            Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent);
            if (event == NULL) {
                st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent);
            }
            assert(!event_isOutgroup(event));
            event_setOutgroupStatus(event, 1);
            assert(event_isOutgroup(event));
            free(outgroupEvent);
        }
        stList_destruct(outgroupEventsList);
    }

    //////////////////////////////////////////////
    //Construct the terminal group.
    //////////////////////////////////////////////

    if (flower_getEndNumber(flower) > 0) {
        group = group_construct2(flower);
        endIterator = flower_getEndIterator(flower);
        while ((end = flower_getNextEnd(endIterator)) != NULL) {
            end_setGroup(end, group);
        }
        flower_destructEndIterator(endIterator);
        assert(group_isLeaf(group));

        // Create a one link chain if there is only one pair of attached ends..
        group_constructChainForLink(group);
        assert(!flower_builtBlocks(flower));
    } else {
        flower_setBuiltBlocks(flower, 1);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////

    //flower_check(flower);
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Cleanup.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);

    return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection.

    stSet_destruct(outgroupNameSet);
    stTree_destruct(tree);
    stKVDatabaseConf_destruct(kvDatabaseConf);

    return 0;
}
int main(int argc, char *argv[]) {
    char * logLevelString = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * flowerName = NULL;
    char * outputFile = NULL;
    char *referenceEventString = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { 
                { "logLevel", required_argument, 0, 'a' }, 
                { "referenceEventString", required_argument, 0, 'b' },
                { "cactusDisk", required_argument, 0, 'c' }, 
		{ "flowerName", required_argument, 0, 'e' },
		{ "outputFile", required_argument, 0, 'f' },
                { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:b:c:d:e:f:h", long_options,
                &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'b':
                referenceEventString = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'e':
                flowerName = stString_copy(optarg);
                break;
            case 'f':
                outputFile = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(flowerName != NULL);
    assert(referenceEventString != NULL);
    assert(cactusDiskDatabaseString != NULL);
    assert(outputFile != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower name : %s\n", flowerName);
    st_logInfo("Sequence name : %s\n", referenceEventString);
    st_logInfo("Output file : %s\n", outputFile);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(
            cactusDiskDatabaseString);
    CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true);
    st_logInfo("Set up the flower disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Parse the basic reconstruction problem
    ///////////////////////////////////////////////////////////////////////////

    Flower *flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName(
            flowerName));
    st_logInfo("Parsed the top level flower of the cactus tree to check\n");

    ///////////////////////////////////////////////////////////////////////////
    // Recursive check the flowers.
    ///////////////////////////////////////////////////////////////////////////

    //int64_t startTime = time(NULL);
    //flower = flower_addReferenceSequence(flower, cactusDisk, name);
    //st_logInfo("Added the reference sequence in %" PRIi64 " seconds/\n", time(NULL) - startTime);

    int64_t numSequences = flower_getSequenceNumber(flower);
    //Make sure that referenceSequence has already been added:
    if(getSequenceMatchesEvent(flower, referenceEventString) == NULL &&
       numSequences > 0){
        fprintf(stderr, "No reference sequence found in cactusDisk\n");
        exit(EXIT_FAILURE); 
    }
      
    FILE *fileHandle = fopen(outputFile, "w");
    if (numSequences > 0) {
      getReferenceSequences(fileHandle, flower, referenceEventString);
    }
    else {
      st_logCritical("cactus_getReferenceSeq found no reference sequence in empty cactus disk %s",
                     cactusDiskDatabaseString);
    }
    fclose(fileHandle);

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);

    return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection.

    stKVDatabaseConf_destruct(kvDatabaseConf);

    return 0;
}
Esempio n. 29
0
int main(int argc, char *argv[]) {
    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * mAFFile1 = NULL;
    char * mAFFile2 = NULL;
    char * outputFile = NULL;
    int32_t sampleNumber = 1000000; // by default do a million samples per pair.
    char * trioFile = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while(1) {
        static struct option long_options[] = {
            { "logLevel", required_argument, 0, 'a' },
            { "mAFFile1", required_argument, 0, 'b' },
            { "mAFFile2", required_argument, 0, 'c' },
            { "outputFile", required_argument, 0, 'd' },
            { "sampleNumber", required_argument, 0, 'e' },
            { "trioFile", required_argument, 0, 'f' },
            { "help", no_argument, 0, 'h' },
            { 0, 0, 0, 0 }
        };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:b:c:d:e:f:h", long_options, &option_index);

        if(key == -1) {
            break;
        }

        switch(key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'b':
                mAFFile1 = stString_copy(optarg);
                break;
            case 'c':
                mAFFile2 = stString_copy(optarg);
                break;
            case 'd':
                outputFile = stString_copy(optarg);
                break;
            case 'e':
                assert(sscanf(optarg, "%i", &sampleNumber) == 1);
                break;
            case 'f':
                trioFile = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    if (argc == 1) {
        usage();
        return 1;
    }

    assert(mAFFile1 != NULL);
    assert(mAFFile2 != NULL);
    assert(outputFile != NULL);
    assert(trioFile != NULL);

    FILE *fileHandle = fopen(mAFFile1, "r");
    if (fileHandle == NULL) {
        fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", mAFFile1);
        exit(1);
    }
    fclose(fileHandle);
    fileHandle = fopen(mAFFile2, "r");
    if (fileHandle == NULL) {
        fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", mAFFile2);
        exit(1);
    }
    fclose(fileHandle);
    fileHandle = fopen(trioFile, "r");
    if (fileHandle == NULL) {
        fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", trioFile);
        exit(1);
    }
    fclose(fileHandle);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("MAF file 1 name : %s\n", mAFFile1);
    st_logInfo("MAF file 2 name : %s\n", mAFFile2);
    st_logInfo("Trio species name : %s\n", trioFile);
    st_logInfo("Output stats file : %s\n", outputFile);

    /* Parse the trioFile triples into a list */
    struct List *speciesList = parseTrioFile(trioFile);

    //////////////////////////////////////////////
    // Create hashtable for the first MAF file.
    //////////////////////////////////////////////

    struct hashtable *seqNameHash;
    seqNameHash = create_hashtable(256, hashtable_stringHashKey, hashtable_stringEqualKey, free, free);
    populateNameHash(mAFFile1, seqNameHash);

    // TODO: Check if query species are in maf file

    //////////////////////////////////////////////
    //Do comparisons.
    //////////////////////////////////////////////

    struct avl_table *results_12 = compareMAFs_AB_Trio(mAFFile1, mAFFile2, sampleNumber, seqNameHash, speciesList);
    struct avl_table *results_21 = compareMAFs_AB_Trio(mAFFile2, mAFFile1, sampleNumber, seqNameHash, speciesList);

    fileHandle = fopen(outputFile, "w");
    if (fileHandle == NULL) {
        fprintf(stderr, "ERROR, unable to open `%s' for writing.\n", outputFile);
        exit(1);
    }

    fprintf(fileHandle, "<trio_comparisons sampleNumber=\"%i\">\n", sampleNumber);
    reportResultsTrio(results_12, mAFFile1, mAFFile2, fileHandle);
    reportResultsTrio(results_21, mAFFile2, mAFFile1, fileHandle);
    fprintf(fileHandle, "</trio_comparisons>\n");
    fclose(fileHandle);

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    free(mAFFile1);
    free(mAFFile2);
    free(outputFile);
    free(trioFile);
    free(logLevelString);

    hashtable_destroy(seqNameHash, 1, 1);

    avl_destroy(results_12, (void (*)(void *, void *))aTrio_destruct);
    avl_destroy(results_21, (void (*)(void *, void *))aTrio_destruct);

    destructList(speciesList);

    return 0;
}
Esempio n. 30
0
stKVDatabaseConf *stKVDatabaseConf_constructTokyoCabinet(const char *databaseDir) {
    stKVDatabaseConf *conf = stSafeCCalloc(sizeof(stKVDatabaseConf));
    conf->type = stKVDatabaseTypeTokyoCabinet;
    conf->databaseDir = stString_copy(databaseDir);
    return conf;
}