/* Parse XML string into a hash. This parses all attributes of all tags * into values. st_kv_database_conf type is stored as conf_type, * database tag is stores as db_tag. This does minimal error checking * and is really lame. */ static stHash *hackParseXmlString(const char *xmlString) { stHash *hash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); char *toReplace[5] = { "</", "<", "/>", ">", "=" }; char *cA = stString_replace(xmlString, toReplace[0], " "), *cA2; for (int64_t i = 1; i < 5; i++) { cA2 = stString_replace(cA, toReplace[i], " "); free(cA); cA = cA2; } getExpectedToken(&cA2, "st_kv_database_conf"); stHash_insert(hash, stString_copy("conf_type"), getKeyValue(&cA2, "type")); stHash_insert(hash, stString_copy("db_tag"), getNextToken(&cA2)); char *key; while (((key = getNextToken(&cA2)) != NULL) && !stString_eq(key, "st_kv_database_conf")) { char *value = getNextToken(&cA2); if (value == NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "failed to to get value for key \"%s\"", key); } if (stHash_search(hash, key) != NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got a duplicate entry in the database conf string \"%s\"", key); } stHash_insert(hash, key, value); } if(!stString_eq(key, "st_kv_database_conf")) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got an unexpected final entry \"%s\"", key); } free(key); free(cA); return hash; }
/*
 * Unit test for addMafBlockToRowHash(): a second block is appended to an
 * existing row hash with interstitialSequence = 5 and breakpointPenalty = 10.
 * The name3 row jumps from name3.chr1 position 13 to position 50, so the
 * expected row is renamed to "name3" and padded with N characters.
 */
static void test_addBlockToHash_3(CuTest *testCase) {
    options_t *opts = options_construct();
    opts->breakpointPenalty = 10;
    opts->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // Rows already accumulated before the new block arrives.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    // The block that is to be appended.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr1 50 5 + 100 CCCCC\n",
            3);

    // What the row hash should look like afterwards.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3 0 28 + 28 GCAGCTGAAAACA--NNNNNNNNNNCCCCC\n",
            expectedList);

    // Patch the bookkeeping fields of the expected name3 row by hand.
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 54;
    free(name3Row->prevName);
    name3Row->prevName = stString_copy("name3.chr1");
    name3Row->multipleNames = true;

    // Source sequences, keyed by sequence name.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString(
            "GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
            "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, block, opts);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(opts);
}
/*
 * Recursively builds the Newick text for the subtree rooted at tree:
 * "(child,child,...)" when there are children, followed by the node label if
 * one is set, followed by ":<length>" (printed with %g) unless the branch
 * length is INFINITY.  No trailing ';' is added here.
 *
 * Returns a newly allocated string.
 */
static char *tree_getNewickTreeStringP(stTree *tree) {
    char *newick;

    if (stTree_getChildNumber(tree) <= 0) {
        // Leaf: nothing precedes the label.
        newick = stString_copy("");
    } else {
        newick = stString_copy("(");
        int32_t childIndex = 0;
        while (childIndex < stTree_getChildNumber(tree)) {
            char *childNewick = tree_getNewickTreeStringP(stTree_getChild(tree, childIndex));
            // Every child except the last one is followed by a comma.
            const char *separator = (childIndex + 1 < stTree_getChildNumber(tree)) ? "," : "";
            char *extended = stString_print("%s%s%s", newick, childNewick, separator);
            free(newick);
            free(childNewick);
            newick = extended;
            childIndex++;
        }
        char *closed = stString_print("%s)", newick);
        free(newick);
        newick = closed;
    }

    const char *label = stTree_getLabel(tree);
    if (label != NULL) {
        char *labelled = stString_print("%s%s", newick, label);
        free(newick);
        newick = labelled;
    }

    if (stTree_getBranchLength(tree) != INFINITY) {
        char *withLength = stString_print("%s:%g", newick, stTree_getBranchLength(tree));
        free(newick);
        newick = withLength;
    }

    return newick;
}
/*
 * Unit test for addMafLineToRow(): a maf line from a differently named
 * sequence ("seq1.chr_bleh") is appended to a row that already carries
 * multiple names, and the result is compared field by field (via
 * rowsAreEqual) against a hand-built expected row.
 */
static void test_addMafLineToRow_1(CuTest *testCase) {
    // Row as it stands before the new line is added.
    row_t *observed = newRow(20);
    observed->name = stString_copy("seq1.chr0");
    observed->prevName = stString_copy("seq1.amazing");
    observed->multipleNames = true;
    observed->strand = '+';
    observed->prevStrand = '+';
    observed->start = 3;
    observed->length = 10;
    observed->sourceLength = 100;
    observed->prevRightPos = 20;
    row_copyIn(observed, "acgtacgtac");

    mafLine_t *line = maf_newMafLineFromString("s seq1.chr_bleh 13 5 + 100 ACGTA", 10);
    addMafLineToRow(observed, line);

    // Row as it should look afterwards.
    row_t *expected = newRow(20);
    expected->name = stString_copy("seq1.chr0");
    expected->prevName = stString_copy("seq1.chr_bleh");
    expected->multipleNames = true;
    expected->strand = '+';
    expected->prevStrand = '+';
    expected->start = 3;
    expected->length = 15;
    expected->sourceLength = 15;
    expected->prevRightPos = 17;
    row_copyIn(expected, "acgtacgtacACGTA");

    CuAssertTrue(testCase, rowsAreEqual(observed, expected));

    destroyRow(observed);
    destroyRow(expected);
    maf_destroyMafLineList(line);
}
/*
 * Builds a database configuration for an SQL-style backend of the given
 * type.  Every string argument is duplicated with stString_copy(), so the
 * returned configuration does not alias the caller's buffers.
 */
static stKVDatabaseConf *constructSql(stKVDatabaseType type, const char *host, unsigned port, const char *user,
        const char *password, const char *databaseName, const char *tableName) {
    stKVDatabaseConf *sqlConf = stSafeCCalloc(sizeof(stKVDatabaseConf));

    // Plain values.
    sqlConf->type = type;
    sqlConf->port = port;

    // Connection and naming strings, each stored as a private copy.
    sqlConf->host = stString_copy(host);
    sqlConf->user = stString_copy(user);
    sqlConf->password = stString_copy(password);
    sqlConf->databaseName = stString_copy(databaseName);
    sqlConf->tableName = stString_copy(tableName);

    return sqlConf;
}
/*
 * Unit test for addMafBlockToRowHash(): a second block is appended to an
 * existing row hash with interstitialSequence = 5 and breakpointPenalty = 10.
 * name.chr1 has two interstitial bases between the blocks, and the new block
 * also introduces a sequence (name3.chr@) that the row hash has not seen
 * before.
 */
static void test_addBlockToHash_2(CuTest *testCase) {
    options_t *opts = options_construct();
    opts->breakpointPenalty = 10;
    opts->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // Rows already accumulated before the new block arrives.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
            observedList);

    // The block that is to be appended.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gTcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr@ 0 5 + 20 aaccg\n",
            3);

    // What the row hash should look like afterwards.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca--ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGacgTcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG--ATGTg\n"
            "s name3.chr@ 0 5 + 20 ---------------aaccg\n",
            expectedList);

    // Source sequences, keyed by sequence name.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgTc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString("aaccgTTTTTTTTTTTTTTT");
    stHash_insert(seqHash, stString_copy("name3.chr@"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, block, opts);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(opts);
}
stKVDatabaseConf *stKVDatabaseConf_constructKyotoTycoon(const char *host, unsigned port, int timeout, int64_t maxRecordSize, int64_t maxBulkSetSize, int64_t maxBulkSetNumRecords, const char *databaseDir, const char* databaseName) { stKVDatabaseConf *conf = stSafeCCalloc(sizeof(stKVDatabaseConf)); conf->type = stKVDatabaseTypeKyotoTycoon; conf->databaseDir = stString_copy(databaseDir); conf->host = stString_copy(host); conf->port = port; conf->timeout = timeout; conf->maxKTRecordSize = maxRecordSize; conf->maxKTBulkSetSize = maxBulkSetSize; conf->maxKTBulkSetNumRecords = maxBulkSetNumRecords; conf->databaseName = stString_copy(databaseName); return conf; }
// Returns a hash mapping from sequence header to sequence data. static stHash *readFastaFile(char *filename) { FILE *fasta = fopen(filename, "r"); if (fasta == NULL) { st_errnoAbort("Could not open fasta file %s", filename); } stHash *headerToData = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); struct List *seqs = constructEmptyList(0, NULL); struct List *seqLengths = constructEmptyList(0, free); struct List *headers = constructEmptyList(0, free); fastaRead(fasta, seqs, seqLengths, headers); for (int64_t i = 0; i < seqs->length; i++) { char *fullHeader = headers->list[i]; stList *headerTokens = stString_splitByString(fullHeader, " "); char *usableHeader = stString_copy(stList_get(headerTokens, 0)); stHash_insert(headerToData, usableHeader, seqs->list[i]); stList_destruct(headerTokens); } destructList(seqs); destructList(seqLengths); destructList(headers); return headerToData; }
/*
 * Test helper: creates a string-keyed hash holding a single mtfseq_t built
 * from input and stored under a copy of name.  The hash is created with
 * free() for keys and destroyMtfseq for values.
 */
static stHash *createSeqHashFromString(char *name, char *input) {
    stHash *seqHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, destroyMtfseq);

    mtfseq_t *sequence = newMtfseq(strlen(input));
    seq_copyIn(sequence, input);

    stHash_insert(seqHash, stString_copy(name), sequence);
    return seqHash;
}
/*
 * Returns a newly allocated copy of string containing only its alphanumeric
 * characters (as classified by isalnum() in the current locale), in their
 * original order.  The caller frees the result.
 */
char *makeAlphaNumeric(const char *string) {
    // The copy is at least as long as the filtered result, so it doubles as
    // the output buffer.
    char *cA = stString_copy(string);
    int64_t j = 0;
    // Walk to the terminator instead of re-evaluating strlen() on every pass.
    for (const char *c = string; *c != '\0'; c++) {
        // <ctype.h> functions require a value representable as unsigned char;
        // passing a negative plain char is undefined behaviour.
        if (isalnum((unsigned char) *c)) {
            cA[j++] = *c;
        }
    }
    cA[j] = '\0';
    return cA;
}
/*
 * Parses the words of string (as split by stString_getNextWord) into a newly
 * allocated array of int64_t.
 *
 * string      - the text to parse; it is not modified.
 * arrayLength - set to the number of integers parsed.
 *
 * Returns the array, which the caller frees.  Aborts via st_errAbort() if a
 * word cannot be parsed as an integer.
 */
static int64_t *getInts(const char *string, int64_t *arrayLength) {
    // Each word has at least one character, so strlen() bounds the number of
    // integers.  The +1 avoids requesting a zero-byte block for an empty string.
    int64_t *iA = st_malloc(sizeof(int64_t) * (strlen(string) + 1));
    char *cA = stString_copy(string);
    char *cA2 = cA; // stString_getNextWord advances cA; keep the start for free().
    char *cA3;
    *arrayLength = 0;
    while ((cA3 = stString_getNextWord(&cA)) != NULL) {
        // Check the parse at run time: an assert() alone would, with NDEBUG,
        // count an uninitialised slot as a parsed value.
        if (sscanf(cA3, "%" PRIi64 "", &iA[*arrayLength]) != 1) {
            st_errAbort("Could not parse an integer from \"%s\" in \"%s\"", cA3, string);
        }
        (*arrayLength)++;
        free(cA3);
    }
    free(cA2);
    return iA;
}
int main(int argc, char *argv[]) { char *cactusDiskString = NULL; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; Flower *flower; Flower_SequenceIterator *flowerIt; Sequence *sequence; struct option longopts[] = { {"cactusDisk", required_argument, NULL, 'c' }, {0, 0, 0, 0} }; int flag; while((flag = getopt_long(argc, argv, "", longopts, NULL)) != -1) { switch(flag) { case 'c': cactusDiskString = stString_copy(optarg); break; case '?': default: usage(); return 1; } } if (cactusDiskString == NULL) { st_errAbort("--cactusDisk option must be provided"); } kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); // Get top-level flower. flower = cactusDisk_getFlower(cactusDisk, 0); flowerIt = flower_getSequenceIterator(flower); while((sequence = flower_getNextSequence(flowerIt)) != NULL) { MetaSequence *metaSequence = sequence_getMetaSequence(sequence); const char *header; char *firstToken, *newHeader; stList *tokens; // Strip the ID token from the header (should be the first // |-separated token) and complain if there isn't one. header = metaSequence_getHeader(metaSequence); tokens = fastaDecodeHeader(header); assert(stList_length(tokens) > 1); firstToken = stList_removeFirst(tokens); assert(!strncmp(firstToken, "id=", 3)); free(firstToken); newHeader = fastaEncodeHeader(tokens); metaSequence_setHeader(metaSequence, newHeader); } cactusDisk_write(cactusDisk); }
/*
 * Creates an event, links it to its parent (if any) and registers it with
 * the event tree.
 *
 * A NULL header is stored as the empty string, and a negative branch length
 * is stored as 0.  The name must not already be present in eventTree
 * (checked with assert).
 */
Event *event_construct(Name name, const char *header, float branchLength, Event *parentEvent,
        EventTree *eventTree) {
    // The event must not already exist in the tree.
    assert(eventTree_getEvent(eventTree, name) == NULL);

    const char *headerText = "";
    if (header != NULL) {
        headerText = header;
    }
    if (branchLength < 0.0) {
        branchLength = 0.0;
    }

    Event *created = st_malloc(sizeof(Event));
    created->name = name;
    created->parent = parentEvent;
    created->children = constructEmptyList(0, NULL);
    created->header = stString_copy(headerText);
    created->branchLength = branchLength;
    created->isOutgroup = 0;
    created->eventTree = eventTree;

    // Hook the new event into its parent's child list, then into the tree.
    if (parentEvent != NULL) {
        listAppend(parentEvent->children, created);
    }
    eventTree_addEvent(eventTree, created);

    return created;
}
/*
 * Creates a meta sequence with an explicitly supplied name and registers it
 * with cactusDisk.  A NULL header is stored as the empty string; otherwise a
 * copy of header is stored.  length must be non-negative (checked with
 * assert).
 */
MetaSequence *metaSequence_construct2(Name name, int64_t start, int64_t length, Name stringName,
        const char *header, Name eventName, bool isTrivialSequence, CactusDisk *cactusDisk) {
    MetaSequence *created = st_malloc(sizeof(MetaSequence));
    assert(length >= 0);

    // Identity.
    created->name = name;
    created->stringName = stringName;
    created->eventName = eventName;

    // Coordinates.
    created->start = start;
    created->length = length;

    // Descriptive data.
    if (header == NULL) {
        created->header = stString_copy("");
    } else {
        created->header = stString_copy(header);
    }
    created->isTrivialSequence = isTrivialSequence;

    // Register with the owning disk.
    created->cactusDisk = cactusDisk;
    cactusDisk_addMetaSequence(cactusDisk, created);

    return created;
}
/*
 * Returns the sequence string lying strictly between the terminal cap of cap
 * and the cap it is adjacent to.  When the
 * getTerminalAdjacencyLength_ignoreAdjacencies flag is set, a copy of the
 * empty string is returned instead.
 */
char *getTerminalAdjacencySubString(Cap *cap) {
    if (getTerminalAdjacencyLength_ignoreAdjacencies) {
        return stString_copy("");
    }

    // Work on the positive-strand orientation so the side asserts below hold.
    Cap *terminalCap = getTerminalCap(cap);
    if (!cap_getStrand(terminalCap)) {
        terminalCap = cap_getReverse(terminalCap);
    }

    Cap *adjacentCap = cap_getAdjacency(terminalCap);
    int64_t gap = cap_getCoordinate(terminalCap) - cap_getCoordinate(adjacentCap);
    assert(gap != 0);

    if (gap <= 0) {
        // The adjacent cap has the larger coordinate: read from just after
        // the terminal cap.
        assert(cap_getSide(adjacentCap));
        assert(!cap_getSide(terminalCap));
        return sequence_getString(cap_getSequence(terminalCap), cap_getCoordinate(terminalCap) + 1, -gap - 1, 1);
    }

    // The adjacent cap has the smaller coordinate: read from just after it.
    assert(cap_getSide(terminalCap));
    assert(!cap_getSide(adjacentCap));
    return sequence_getString(cap_getSequence(terminalCap), cap_getCoordinate(adjacentCap) + 1, gap - 1, 1);
}
/*
 * Returns a new configuration holding the same settings as srcConf.  Every
 * string field is duplicated with stString_copy(); the remaining fields are
 * copied by value.
 */
stKVDatabaseConf *stKVDatabaseConf_constructClone(stKVDatabaseConf *srcConf) {
    stKVDatabaseConf *clone = stSafeCCalloc(sizeof(stKVDatabaseConf));

    // Fields copied by value.
    clone->type = srcConf->type;
    clone->port = srcConf->port;
    clone->timeout = srcConf->timeout;
    clone->maxKTRecordSize = srcConf->maxKTRecordSize;
    clone->maxKTBulkSetSize = srcConf->maxKTBulkSetSize;
    clone->maxKTBulkSetNumRecords = srcConf->maxKTBulkSetNumRecords;

    // String fields, each duplicated.
    clone->databaseDir = stString_copy(srcConf->databaseDir);
    clone->host = stString_copy(srcConf->host);
    clone->user = stString_copy(srcConf->user);
    clone->password = stString_copy(srcConf->password);
    clone->databaseName = stString_copy(srcConf->databaseName);
    clone->tableName = stString_copy(srcConf->tableName);

    return clone;
}
/*
 * Gets a string from the database, going through the string cache.
 *
 * A zero length yields a copy of the empty string.  Otherwise the cache is
 * consulted first; on a miss the requested substring is loaded into the
 * cache from the database and the cache is consulted again.
 */
char *cactusDisk_getString(CactusDisk *cactusDisk, Name name, int64_t start, int64_t length, int64_t strand,
        int64_t totalSequenceLength) {
    assert(length >= 0);
    if (length == 0) {
        return stString_copy("");
    }

    // Fast path: the string is already cached.
    char *cached = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    if (cached != NULL) {
        return cached;
    }

    // Slow path: request this one substring from the database, then retry.
    stList *requests = stList_construct3(0, (void (*)(void *)) substring_destruct);
    stList_append(requests, substring_construct(name, start, length));
    cacheSubstringsFromDB(cactusDisk, requests);
    stList_destruct(requests);

    cached = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    assert(cached != NULL);
    return cached;
}
/*
 * Opens a connection to the MySQL server described by conf and applies the
 * session settings this module relies on.
 *
 * Throws ST_KV_DATABASE_EXCEPTION_ID if mysql_init() fails, and the
 * exception built by createMySqlExcept() if the connection attempt fails;
 * in both cases disconnect() is called on the partially built object first.
 */
static MySqlDb *connect(stKVDatabaseConf *conf) {
    MySqlDb *db = stSafeCCalloc(sizeof(MySqlDb));

    db->conn = mysql_init(NULL);
    if (db->conn == NULL) {
        disconnect(db);
        stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "mysql_init failed");
    }

    MYSQL *connected = mysql_real_connect(db->conn,
            stKVDatabaseConf_getHost(conf),
            stKVDatabaseConf_getUser(conf),
            stKVDatabaseConf_getPassword(conf),
            stKVDatabaseConf_getDatabaseName(conf),
            stKVDatabaseConf_getPort(conf),
            NULL, 0);
    if (connected == NULL) {
        // Build the exception before disconnecting, while db is still usable.
        stExcept *failure = createMySqlExcept(db, "failed to connect to MySql database: %s on %s as user %s",
                stKVDatabaseConf_getDatabaseName(conf),
                stKVDatabaseConf_getHost(conf),
                stKVDatabaseConf_getUser(conf));
        disconnect(db);
        stThrow(failure);
    }
    db->table = stString_copy(stKVDatabaseConf_getTableName(conf));

    // Suppress notes so that only warnings and errors are reported back.
    sqlExec(db, "set sql_notes=0");

    // Allow sql statements of up to 1G.  The server must be configured to
    // match by adding "max_allowed_packet = 1G" to the [mysqld] section of
    // my.cnf.
    sqlExec(db, "set global max_allowed_packet=1073741824");

    // Idle timeout: one week, in seconds.
    const int oneWeekInSeconds = 7 * 24 * 60 * 60;
    sqlExec(db, "set wait_timeout=%d", oneWeekInSeconds);

    // Read timeout: one hour, in seconds.
    const int oneHourInSeconds = 60 * 60;
    sqlExec(db, "set net_read_timeout=%d", oneHourInSeconds);

    // NOTE: commit will not return an error, this does row-level locking on
    // the select done before the update
    sqlExec(db, "set autocommit = 0;");
    sqlExec(db, "set session transaction isolation level serializable;");

    return db;
}
/*
 * Constructs a CactusDisk on top of the key/value database described by
 * conf.
 *
 * conf              - database configuration passed to stKVDatabase_construct.
 * create            - true to create a new cactus disk, false to open an
 *                     existing one.
 * sequencesFileName - when creating: name of a file (inside the conf's
 *                     directory) in which sequences are stored, or NULL to
 *                     not use a sequences file.  Must be NULL when opening an
 *                     existing disk.
 *
 * Throws CACTUS_DISK_EXCEPTION_ID if create is set but the disk already
 * exists, if a sequences file name is given for an existing disk, if a
 * sequences file name is given but conf has no directory, or if the
 * sequences file cannot be created.
 *
 * Side effect: reseeds the random number generator through st_randomSeed().
 */
static CactusDisk *cactusDisk_constructPrivate(stKVDatabaseConf *conf, bool create, const char *sequencesFileName) {
    CactusDisk *cactusDisk = st_calloc(1, sizeof(CactusDisk));

    //construct lists of in memory objects
    cactusDisk->metaSequences = stSortedSet_construct3(cactusDisk_constructMetaSequencesP, NULL);
    cactusDisk->flowers = stSortedSet_construct3(cactusDisk_constructFlowersP, NULL);
    cactusDisk->flowerNamesMarkedForDeletion = stSortedSet_construct3((int (*)(const void *, const void *)) strcmp, free);
    cactusDisk->updateRequests = stList_construct3(0, (void (*)(void *)) stKVDatabaseBulkRequest_destruct);

    //Now open the database
    cactusDisk->database = stKVDatabase_construct(conf, create);
    cactusDisk->cache = stCache_construct();
    cactusDisk->stringCache = stCache_construct();

    //initialise the unique ids.
    int64_t seed = (clock() << 24) | (time(NULL) << 16) | (getpid() & 65535); //Likely to be unique
    st_logDebug("The cactus disk is seeding the random number generator with the value %" PRIi64 "\n", seed);
    st_randomSeed(seed);
    cactusDisk->uniqueNumber = 0;
    cactusDisk->maxUniqueNumber = 0;

    //Now load any stuff..
    if (containsRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY)) {
        // The parameter record exists, so this is an existing cactus disk.
        if (create) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID, "Tried to create a cactus disk, but the cactus disk already exists");
        }
        if (sequencesFileName != NULL) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID, "A sequences file name is specified, but the cactus disk is not being created");
        }
        void *record = getRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY, "cactus_disk parameters");
        // The loader receives &record and may move it, so keep the original
        // pointer for free().
        void *record2 = record;
        cactusDisk_loadFromBinaryRepresentation(&record, cactusDisk, conf);
        free(record2);
    } else {
        assert(create);
        if (sequencesFileName == NULL) {
            cactusDisk->storeSequencesInAFile = 0;
            cactusDisk->sequencesFileName = NULL;
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
            cactusDisk->absSequencesFileName = NULL;
        } else {
            if (stKVDatabaseConf_getDir(conf) == NULL) {
                stThrowNew(CACTUS_DISK_EXCEPTION_ID, "The database conf does not contain a directory in which the sequence file is to be found!\n");
            }
            cactusDisk->storeSequencesInAFile = 1;
            cactusDisk->sequencesFileName = stString_copy(sequencesFileName);
            cactusDisk->absSequencesFileName = stString_print("%s/%s", stKVDatabaseConf_getDir(conf), cactusDisk->sequencesFileName);
            //Make sure the file exists (opening with "w" creates or truncates it).
            cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "w");
            // Check at run time rather than with assert(): with NDEBUG an
            // unchecked failure would pass NULL to fclose().
            if (cactusDisk->sequencesReadFileHandle == NULL) {
                stThrowNew(CACTUS_DISK_EXCEPTION_ID, "Could not create the sequences file %s",
                        cactusDisk->absSequencesFileName);
            }
            fclose(cactusDisk->sequencesReadFileHandle); //Flush it first time.
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
        }
    }

    return cactusDisk;
}
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double 
phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, 
{ "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); 
return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); 
break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum 
stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if 
(strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = 
flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? 
alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. 
stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = 
numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
//============================== MAIN ========================================= int main(int argc, char *argv[]) { Flower *flower; /* * Arguments/options */ char * st_logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * flowerName = "0"; char * outputFile = NULL; char * species = NULL; char * geneFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while(1) { static struct option long_options[] = { { "genePslFile", required_argument, 0, 'g' }, { "species", required_argument, 0, 's' }, { "st_logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'c' }, { "outputFile", required_argument, 0, 'o' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "s:g:o:a:c:h", long_options, &option_index); if(key == -1) { break; } switch(key) { case 'a': st_logLevelString = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'o': outputFile = stString_copy(optarg); break; case 's': species = stString_copy(optarg); break; case 'g': geneFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(outputFile != NULL); assert(species != NULL); assert(geneFile != NULL); ////////////////////////////////////////////// //Set up st_logging ////////////////////////////////////////////// st_setLogLevelFromString(st_logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); st_logInfo("Output file : %s\n", outputFile); st_logInfo("Species: %s\n", species); st_logInfo("GenePslFile: %s\n", geneFile); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Parse the basic reconstruction problem /////////////////////////////////////////////////////////////////////////// flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName(flowerName)); st_logInfo("Parsed the top level flower of the cactus tree to check\n"); /////////////////////////////////////////////////////////////////////////// // Recursive check the flowers. /////////////////////////////////////////////////////////////////////////// int64_t startTime = time(NULL); FILE *fileHandle = fopen(outputFile, "w"); struct bed *gene = bedLoadAll(geneFile); mapGenes(flower, fileHandle, gene, species); fclose(fileHandle); st_logInfo("Map genes in %" PRIi64 " seconds/\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); return 0; }
/*
 * Sets the label of a tree node.
 *
 * tree:  node whose label field is replaced.
 * label: new label, or NULL to clear it. The node stores its own copy made
 *        with stString_copy, so the caller keeps ownership of `label`.
 *
 * The previous label held by the node is freed.
 */
void stTree_setLabel(stTree *tree, const char *label) {
    // Copy first: if the caller passes the node's current label (or a pointer
    // into it), freeing before copying would read freed memory.
    char *newLabel = label == NULL ? NULL : stString_copy(label);
    free(tree->label); // free(NULL) is a no-op, so no guard is needed
    tree->label = newLabel;
}
struct bed *bedLoadAll(char *fileName){ struct bed *list = NULL; struct bed *currbed = NULL; struct bed *prevbed = NULL; FILE *fp; char *line = st_malloc(sizeof(char)*LINEMAXSIZE); char *lend; struct List *lineList; fp = fopen(fileName, "r"); assert(fp != NULL); while( fgets(line, LINEMAXSIZE, fp) != NULL ){//each line //catch if not enough buffer if ( (lend = strstr(line, "\n")) == NULL ){ fprintf(stderr, "Input line is too long for buffer: \n*%s*\n", line); exit(0); }else{//trim \n *lend = '\0'; } //skip header line if(strstr(line, "track") == line){ continue; } lineList = splitString(line, "\t"); //empty lines, skip if(lineList->length == 0){ continue; } if(lineList->length < 12){ fprintf(stderr, "Wrong input format. Need at least 12 fields for each bed\n"); exit(0); }else{ currbed = constructbed(); currbed->chrom = stString_copy(lineList->list[0]); assert( sscanf(lineList->list[1], "%" PRIi64, &(currbed->chromStart)) == 1); assert( sscanf(lineList->list[2], "%" PRIi64, &(currbed->chromEnd)) == 1); currbed->name = stString_copy(lineList->list[3]); assert( sscanf(lineList->list[4], "%d", &(currbed->score)) == 1); currbed->strand = stString_copy(lineList->list[5]); assert( sscanf(lineList->list[6], "%" PRIi64, &(currbed->thickStart)) == 1); assert( sscanf(lineList->list[7], "%" PRIi64, &(currbed->thickEnd)) == 1); assert( sscanf(lineList->list[8], "%" PRIi64, &(currbed->itemRgb)) == 1); assert( sscanf(lineList->list[9], "%" PRIi64, &(currbed->blockCount)) == 1); currbed->blockSizes = splitIntString(stString_copy(lineList->list[10]), ","); assert(currbed->blockSizes->length == currbed->blockCount); currbed->chromStarts = splitIntString(stString_copy(lineList->list[11]), ","); assert(currbed->chromStarts->length == currbed->blockCount); //destructList(lineList); if(list == NULL){ list = currbed; prevbed = currbed; }else{ prevbed->next = currbed; prevbed = currbed; } } st_logInfo("Gene: %s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\n", currbed->chrom, currbed->chromStart, 
currbed->chromEnd, currbed->name, currbed->score, currbed->strand, currbed->thickStart, currbed->thickEnd, currbed->itemRgb, currbed->blockCount); } fclose(fp); return list; }
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); char *outputFile = NULL; Name flowerName = NULL_NAME; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'c' }, { "flowerName", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "outputFile", required_argument, 0, 'k' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:c:d:g:hk:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': flowerName = cactusMisc_stringToName(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'k': outputFile = stString_copy(optarg); break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString( cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Get the set of flowers to manipulate /////////////////////////////////////////////////////////////////////////// Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); /////////////////////////////////////////////////////////////////////////// // Get the reference event name /////////////////////////////////////////////////////////////////////////// Event *referenceEvent = eventTree_getEventByHeader( flower_getEventTree(flower), referenceEventString); assert(referenceEvent != NULL); Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now process each flower in turn. /////////////////////////////////////////////////////////////////////////// if(outputFile == NULL) { st_errAbort("No output file specified\n"); } FILE *fileHandle = fopen(outputFile, "w"); printFastaSequences(flower, fileHandle, referenceEventName); if(fileHandle != NULL) { fclose(fileHandle); } /////////////////////////////////////////////////////////////////////////// //Clean up memory /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); //return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. 
free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); //while(1); return 0; }
/*
 * Returns a fresh copy of the empty string made with stString_copy; the cap
 * argument is not inspected.
 * NOTE(review): presumably the caller frees the returned string - confirm.
 */
static char *terminalAdjacencyWriteFn(Cap *cap) {
    char *emptyText = stString_copy("");
    return emptyText;
}
int main(int argc, char *argv[]) { Flower *flower; FILE *fileHandle; /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * flowerName = NULL; char * outputFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'c' }, { "flowerName", required_argument, 0, 'd' }, { "outputFile", required_argument, 0, 'e' }, { "scaleNodeSizes", no_argument, 0, 'f' }, { "nameLabels", no_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:c:d:e:fgh", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': flowerName = stString_copy(optarg); break; case 'e': outputFile = stString_copy(optarg); break; case 'f': scaleNodeSizes = !scaleNodeSizes; break; case 'g': nameLabels = !nameLabels; break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(flowerName != NULL); assert(outputFile != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower name : %s\n", flowerName); st_logInfo("Output graph file : %s\n", outputFile); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Parse the basic reconstruction problem /////////////////////////////////////////////////////////////////////////// flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName(flowerName)); st_logInfo("Parsed the top level flower of the cactus tree to build\n"); /////////////////////////////////////////////////////////////////////////// // Build the graph. /////////////////////////////////////////////////////////////////////////// totalProblemSize = flower_getTotalBaseLength(flower); fileHandle = fopen(outputFile, "w"); graphViz_setupGraphFile(fileHandle); makeCactusTree_flower(flower, fileHandle, NULL, NULL); graphViz_finishGraphFile(fileHandle); fclose(fileHandle); st_logInfo("Written the tree to file\n"); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); stKVDatabaseConf_destruct(kvDatabaseConf); return 0; }
int main(int argc, char *argv[]) { /* * Open the database. * Construct a flower. * Construct an event tree representing the species tree. * For each sequence contruct two ends each containing an cap. * Make a file for the sequence. * Link the two caps. * Finish! */ int64_t key, j; Group *group; Flower_EndIterator *endIterator; End *end; bool makeEventHeadersAlphaNumeric = 0; /* * Arguments/options */ char * logLevelString = NULL; char * speciesTree = NULL; char * outgroupEvents = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "speciesTree", required_argument, 0, 'f' }, { "outgroupEvents", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = optarg; break; case 'b': cactusDiskDatabaseString = optarg; break; case 'f': speciesTree = optarg; break; case 'g': outgroupEvents = optarg; break; case 'h': usage(); return 0; case 'i': makeEventHeadersAlphaNumeric = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0); assert(cactusDiskDatabaseString != NULL); assert(speciesTree != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); for (j = optind; j < argc; j++) { st_logInfo("Sequence file/directory %s\n", argv[j]); } ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet || stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeKyotoTycoon) { assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL); cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences"); } else { cactusDisk = cactusDisk_construct(kvDatabaseConf, 1); } st_logInfo("Set up the flower disk\n"); ////////////////////////////////////////////// //Construct the flower ////////////////////////////////////////////// if (cactusDisk_getFlower(cactusDisk, 0) != NULL) { cactusDisk_destruct(cactusDisk); st_logInfo("The first flower already exists\n"); return 0; } flower = flower_construct2(0, cactusDisk); assert(flower_getName(flower) == 0); st_logInfo("Constructed the flower\n"); ////////////////////////////////////////////// //Construct the event tree ////////////////////////////////////////////// st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree); stTree *tree = stTree_parseNewickString(speciesTree); 
st_logInfo("Parsed the tree\n"); if (makeEventHeadersAlphaNumeric) { makeEventHeadersAlphaNumericFn(tree); } stTree_setBranchLength(tree, INT64_MAX); checkBranchLengthsAreDefined(tree); eventTree = eventTree_construct2(flower); //creates the event tree and the root even totalEventNumber = 1; st_logInfo("Constructed the basic event tree\n"); // Construct a set of outgroup names so that ancestral outgroups // get recognized. stSet *outgroupNameSet = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, free); if(outgroupEvents != NULL) { stList *outgroupNames = stString_split(outgroupEvents); for(int64_t i = 0; i < stList_length(outgroupNames); i++) { char *outgroupName = stList_get(outgroupNames, i); stSet_insert(outgroupNameSet, stString_copy(outgroupName)); } stList_destruct(outgroupNames); } //now traverse the tree j = optind; assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree, outgroupNameSet, argv, &j); char *eventTreeString = eventTree_makeNewickString(eventTree); st_logInfo( "Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n", totalSequenceNumber, totalEventNumber, eventTreeString); assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0); free(eventTreeString); //assert(0); ////////////////////////////////////////////// //Label any outgroup events. ////////////////////////////////////////////// if (outgroupEvents != NULL) { stList *outgroupEventsList = stString_split(outgroupEvents); for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) { char *outgroupEvent = makeEventHeadersAlphaNumeric ? 
makeAlphaNumeric(stList_get(outgroupEventsList, i)) : stString_copy(stList_get(outgroupEventsList, i)); Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent); if (event == NULL) { st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent); } assert(!event_isOutgroup(event)); event_setOutgroupStatus(event, 1); assert(event_isOutgroup(event)); free(outgroupEvent); } stList_destruct(outgroupEventsList); } ////////////////////////////////////////////// //Construct the terminal group. ////////////////////////////////////////////// if (flower_getEndNumber(flower) > 0) { group = group_construct2(flower); endIterator = flower_getEndIterator(flower); while ((end = flower_getNextEnd(endIterator)) != NULL) { end_setGroup(end, group); } flower_destructEndIterator(endIterator); assert(group_isLeaf(group)); // Create a one link chain if there is only one pair of attached ends.. group_constructChainForLink(group); assert(!flower_builtBlocks(flower)); } else { flower_setBuiltBlocks(flower, 1); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// //flower_check(flower); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// // Cleanup. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. stSet_destruct(outgroupNameSet); stTree_destruct(tree); stKVDatabaseConf_destruct(kvDatabaseConf); return 0; }
int main(int argc, char *argv[]) { char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * flowerName = NULL; char * outputFile = NULL; char *referenceEventString = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "referenceEventString", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "flowerName", required_argument, 0, 'e' }, { "outputFile", required_argument, 0, 'f' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:f:h", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': referenceEventString = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'e': flowerName = stString_copy(optarg); break; case 'f': outputFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(flowerName != NULL); assert(referenceEventString != NULL); assert(cactusDiskDatabaseString != NULL); assert(outputFile != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower name : %s\n", flowerName); st_logInfo("Sequence name : %s\n", referenceEventString); st_logInfo("Output file : %s\n", outputFile); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString( cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Parse the basic reconstruction problem /////////////////////////////////////////////////////////////////////////// Flower *flower = cactusDisk_getFlower(cactusDisk, cactusMisc_stringToName( flowerName)); st_logInfo("Parsed the top level flower of the cactus tree to check\n"); /////////////////////////////////////////////////////////////////////////// // Recursive check the flowers. 
/////////////////////////////////////////////////////////////////////////// //int64_t startTime = time(NULL); //flower = flower_addReferenceSequence(flower, cactusDisk, name); //st_logInfo("Added the reference sequence in %" PRIi64 " seconds/\n", time(NULL) - startTime); int64_t numSequences = flower_getSequenceNumber(flower); //Make sure that referenceSequence has already been added: if(getSequenceMatchesEvent(flower, referenceEventString) == NULL && numSequences > 0){ fprintf(stderr, "No reference sequence found in cactusDisk\n"); exit(EXIT_FAILURE); } FILE *fileHandle = fopen(outputFile, "w"); if (numSequences > 0) { getReferenceSequences(fileHandle, flower, referenceEventString); } else { st_logCritical("cactus_getReferenceSeq found no reference sequence in empty cactus disk %s", cactusDiskDatabaseString); } fclose(fileHandle); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. stKVDatabaseConf_destruct(kvDatabaseConf); return 0; }
int main(int argc, char *argv[]) { /* * Arguments/options */ char * logLevelString = NULL; char * mAFFile1 = NULL; char * mAFFile2 = NULL; char * outputFile = NULL; int32_t sampleNumber = 1000000; // by default do a million samples per pair. char * trioFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while(1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "mAFFile1", required_argument, 0, 'b' }, { "mAFFile2", required_argument, 0, 'c' }, { "outputFile", required_argument, 0, 'd' }, { "sampleNumber", required_argument, 0, 'e' }, { "trioFile", required_argument, 0, 'f' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:f:h", long_options, &option_index); if(key == -1) { break; } switch(key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': mAFFile1 = stString_copy(optarg); break; case 'c': mAFFile2 = stString_copy(optarg); break; case 'd': outputFile = stString_copy(optarg); break; case 'e': assert(sscanf(optarg, "%i", &sampleNumber) == 1); break; case 'f': trioFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// if (argc == 1) { usage(); return 1; } assert(mAFFile1 != NULL); assert(mAFFile2 != NULL); assert(outputFile != NULL); assert(trioFile != NULL); FILE *fileHandle = fopen(mAFFile1, "r"); if (fileHandle == NULL) { fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", mAFFile1); exit(1); } fclose(fileHandle); fileHandle = fopen(mAFFile2, "r"); if (fileHandle == NULL) { fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", mAFFile2); exit(1); } fclose(fileHandle); fileHandle = fopen(trioFile, "r"); if (fileHandle == NULL) { fprintf(stderr, "ERROR, unable to open `%s', is path correct?\n", trioFile); exit(1); } fclose(fileHandle); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("MAF file 1 name : %s\n", mAFFile1); st_logInfo("MAF file 2 name : %s\n", mAFFile2); st_logInfo("Trio species name : %s\n", trioFile); st_logInfo("Output stats file : %s\n", outputFile); /* Parse the trioFile triples into a list */ struct List *speciesList = parseTrioFile(trioFile); ////////////////////////////////////////////// // Create hashtable for the first MAF file. ////////////////////////////////////////////// struct hashtable *seqNameHash; seqNameHash = create_hashtable(256, hashtable_stringHashKey, hashtable_stringEqualKey, free, free); populateNameHash(mAFFile1, seqNameHash); // TODO: Check if query species are in maf file ////////////////////////////////////////////// //Do comparisons. 
////////////////////////////////////////////// struct avl_table *results_12 = compareMAFs_AB_Trio(mAFFile1, mAFFile2, sampleNumber, seqNameHash, speciesList); struct avl_table *results_21 = compareMAFs_AB_Trio(mAFFile2, mAFFile1, sampleNumber, seqNameHash, speciesList); fileHandle = fopen(outputFile, "w"); if (fileHandle == NULL) { fprintf(stderr, "ERROR, unable to open `%s' for writing.\n", outputFile); exit(1); } fprintf(fileHandle, "<trio_comparisons sampleNumber=\"%i\">\n", sampleNumber); reportResultsTrio(results_12, mAFFile1, mAFFile2, fileHandle); reportResultsTrio(results_21, mAFFile2, mAFFile1, fileHandle); fprintf(fileHandle, "</trio_comparisons>\n"); fclose(fileHandle); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// free(mAFFile1); free(mAFFile2); free(outputFile); free(trioFile); free(logLevelString); hashtable_destroy(seqNameHash, 1, 1); avl_destroy(results_12, (void (*)(void *, void *))aTrio_destruct); avl_destroy(results_21, (void (*)(void *, void *))aTrio_destruct); destructList(speciesList); return 0; }
/*
 * Build a database configuration object describing a Tokyo Cabinet database.
 *
 * databaseDir: directory string for the database; it is duplicated with
 *              stString_copy, so the argument itself is not retained.
 *
 * Returns a newly allocated stKVDatabaseConf whose type is
 * stKVDatabaseTypeTokyoCabinet. Fields other than type and databaseDir are
 * left as stSafeCCalloc provides them (presumably zeroed -- confirm against
 * stSafeCCalloc).
 */
stKVDatabaseConf *stKVDatabaseConf_constructTokyoCabinet(const char *databaseDir) {
    stKVDatabaseConf *tokyoCabinetConf = stSafeCCalloc(sizeof *tokyoCabinetConf);

    tokyoCabinetConf->type = stKVDatabaseTypeTokyoCabinet;
    tokyoCabinetConf->databaseDir = stString_copy(databaseDir);

    return tokyoCabinetConf;
}