// Walk the maf stream block by block: every block after the first has its
// species names corrected and is then handed to checkBlock().
//   mfa: open maf file handle to read from.
void processBody(mafFileApi_t *mfa) {
    // The first block that comes back is not used; release it immediately.
    maf_destroyMafBlockList(maf_readBlock(mfa));

    for (mafBlock_t *block = maf_readBlock(mfa); block != NULL; block = maf_readBlock(mfa)) {
        correctSpeciesNames(block);
        checkBlock(block);
        // each block is owned by this loop and released once checked
        maf_destroyMafBlockList(block);
    }
}
// Walk the maf stream block by block. Each block after the first is passed
// to checkBlock() together with `seq` and `strand`, then printed.
//   mfa:    open maf file handle to read from.
//   seq:    sequence name forwarded to checkBlock().
//   strand: strand character forwarded to checkBlock().
void processBody(mafFileApi_t *mfa, char *seq, char strand) {
    // header block, unused: read it and throw it away
    mafBlock_t *headerBlock = maf_readBlock(mfa);
    maf_destroyMafBlockList(headerBlock);

    // the header is emitted before any block is processed
    printHeader();

    mafBlock_t *block = maf_readBlock(mfa);
    while (block != NULL) {
        checkBlock(block, seq, strand);
        maf_mafBlock_print(block);
        maf_destroyMafBlockList(block);
        block = maf_readBlock(mfa);
    }
}
// Read every block from the maf stream and hand it to checkBlock() along
// with the sequence name and position being searched for.
//   mfa:      open maf file handle to read from.
//   fullname: sequence name forwarded to checkBlock().
//   pos:      position forwarded to checkBlock().
void searchInput(mafFileApi_t *mfa, char *fullname, unsigned long pos) {
    for (mafBlock_t *block = maf_readBlock(mfa); block != NULL; block = maf_readBlock(mfa)) {
        checkBlock(block, fullname, pos);
        // blocks are released as soon as they have been inspected
        maf_destroyMafBlockList(block);
    }
}
// Unit test for addMafBlockToRowHash(): concatenation with 2 bases of
// interstitial and a sequence length breakpoint.
//
// The row hash is seeded from a four-row block, a second block is added, and
// the resulting hash and row-order list are compared with hand-built
// expectations. In the second block name3.chr1 starts at position 50 while
// its first row covered 13 bases; the expected name3 row carries a run of
// ten 'N' characters and is keyed as "name3".
static void test_addBlockToHash_3(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
                                                     "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
                                                     "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
                                                     observedList
                                                     );
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13 5 + 158545518 ACGTA\n"
                                                   "s name.chr1 12 5 + 100 gtcGG\n"
                                                   "s name2.chr1 10 5 + 100 ATGTg\n"
                                                   "s name3.chr1 50 5 + 100 CCCCC\n"
                                                   , 3);
    stHash *expectedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
                                                     "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
                                                     "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
                                                     "s name3 0 28 + 28 GCAGCTGAAAACA--NNNNNNNNNNCCCCC\n",
                                                     expectedList
                                                     );
    // Patch up the bookkeeping fields of the expected name3 row by hand.
    row_t *r = stHash_search(expectedHash, "name3");
    // Fail the test cleanly instead of dereferencing NULL if the row is missing.
    CuAssertTrue(testCase, r != NULL);
    r->prevRightPos = 54;
    free(r->prevName);
    r->prevName = stString_copy("name3.chr1");
    r->multipleNames = true;
    // Backing sequences for every row that appears in the blocks above.
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgtc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
                               "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr1"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
// Read every block from the maf stream and pass it to checkBlock() with the
// target sequence name and the [start, stop] range.
//   mfa:   open maf file handle to read from.
//   seq:   sequence name forwarded to checkBlock().
//   start: range start forwarded to checkBlock().
//   stop:  range stop forwarded to checkBlock().
void processBody(mafFileApi_t *mfa, char *seq, uint32_t start, uint32_t stop) {
    // Shared with checkBlock() by pointer so state carries over between blocks.
    bool headerPrinted = false;

    for (;;) {
        mafBlock_t *block = maf_readBlock(mfa);
        if (block == NULL) {
            break; // end of input
        }
        checkBlock(block, seq, start, stop, &headerPrinted);
        maf_destroyMafBlockList(block);
    }
}
// Test helper: parse `mafString` into a block, run getTargetColumns() on it
// for the sequence "theTarget.chr0" over [start, stop], and assert that the
// reported length and boolean column flags match the expectations.
//   expectedLen: length getTargetColumns() should report.
//   expected:    flags the returned array is compared against (first `len` entries).
static void targetColumnTest(CuTest *testCase, const char *mafString, uint64_t start,
                             uint64_t stop, uint64_t expectedLen, bool expected[]) {
    mafBlock_t *block = maf_newMafBlockFromString(mafString, 3);
    uint64_t observedLen = 0;
    bool *observedColumns = NULL;

    getTargetColumns(&observedColumns, &observedLen, block, "theTarget.chr0", start, stop);

    CuAssertTrue(testCase, observedLen == expectedLen);
    CuAssertTrue(testCase, boolArraysAreEqual(observedColumns, expected, observedLen));

    // clean up
    maf_destroyMafBlockList(block);
    free(observedColumns);
}
// Read every block from the maf stream and pass it, with its zero-based
// ordinal, to checkBlock(). If the header flag was never set while
// processing, the header is printed at the end.
//   mfa:    open maf file handle to read from.
//   seq:    sequence name forwarded to checkBlock().
//   start:  range start forwarded to checkBlock().
//   stop:   range stop forwarded to checkBlock().
//   isSoft: flag forwarded unchanged to checkBlock().
void processBody(mafFileApi_t *mfa, char *seq, uint64_t start, uint64_t stop, bool isSoft) {
    bool headerDone = false;
    mafBlock_t *block = NULL;

    // blockIndex counts the blocks read so far, starting from zero
    for (uint64_t blockIndex = 0; (block = maf_readBlock(mfa)) != NULL; ++blockIndex) {
        checkBlock(block, blockIndex, seq, start, stop, &headerDone, isSoft);
        maf_destroyMafBlockList(block);
    }

    if (headerDone) {
        return;
    }
    // this makes the output valid even when no data was output
    printHeader();
}
// Return the next mafBlock in the maf file, or NULL once a block with no
// head line is read (end of file).
// While mfa->lineNumber is still 0 the block is read with
// maf_readBlockHeader(); afterwards maf_readBlockBody() is used.
// The caller owns the returned block.
mafBlock_t* maf_readBlock(mafFileApi_t *mfa) {
    const bool atHeader = (mfa->lineNumber == 0);
    mafBlock_t *block = atHeader ? maf_readBlockHeader(mfa) : maf_readBlockBody(mfa);

    if (block->headLine == NULL) {
        // nothing was read: discard the empty shell and signal the end
        maf_destroyMafBlockList(block);
        return NULL;
    }
    return block;
}
// Test helper: splice columns [l, r] out of the block parsed from `input`
// and assert the result equals the block parsed from `expected`.
//   offs: offset table to hand to spliceBlock(); when NULL a fresh table is
//         created for the call and destroyed afterwards.
// When [l, r] covers the entire sequence field, the helper also asserts that
// spliceBlock() handed back the input block itself.
static void spliceTest(CuTest *testCase, const char *input, const char *expected,
                       uint64_t l, uint64_t r, int64_t **offs) {
    mafBlock_t *inBlock = maf_newMafBlockFromString(input, 3);
    mafBlock_t *wantBlock = maf_newMafBlockFromString(expected, 3);

    const bool ownsOffsets = (offs == NULL);
    if (ownsOffsets) {
        offs = createOffsets(maf_mafBlock_getNumberOfSequences(inBlock));
    }

    mafBlock_t *gotBlock = spliceBlock(inBlock, l, r, offs);
    CuAssertTrue(testCase, mafBlocksAreEqual(wantBlock, gotBlock));

    const bool wholeBlock = (l == 0) && (r == maf_mafBlock_getSequenceFieldLength(inBlock) - 1);
    if (wholeBlock) {
        CuAssertTrue(testCase, inBlock == gotBlock);
    }

    // clean up
    if (ownsOffsets) {
        destroyOffsets(offs, maf_mafBlock_getNumberOfSequences(inBlock));
    }
    if (gotBlock != inBlock) {
        // only free the result separately when it is a distinct block
        maf_destroyMafBlockList(gotBlock);
    }
    maf_destroyMafBlockList(inBlock);
    maf_destroyMafBlockList(wantBlock);
}
// Unit test for addMafBlockToRowHash(): concatenation with 2 bases of
// interstitial AND a previously unobserved sequence.
//
// The row hash starts with three rows; the block that is added carries a
// fourth row, name3.chr@, that the hash has not seen before. The expected
// name3.chr@ row is padded on the left with gap characters.
static void test_addBlockToHash_2(CuTest *testCase) {
    options_t *opts = options_construct();
    opts->breakpointPenalty = 10;
    opts->interstitialSequence = 5;

    // row-order lists that accompany the observed / expected hashes
    stList *gotOrder = stList_construct3(0, free);
    stList *wantOrder = stList_construct3(0, free);

    stHash *gotRows = createBlockHashFromString("a score=0\n"
                                                "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
                                                "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
                                                gotOrder);
    mafBlock_t *incoming = maf_newMafBlockListFromString("a score=0 test\n"
                                                         "s reference.chr0 13 5 + 158545518 ACGTA\n"
                                                         "s name.chr1 12 5 + 100 gTcGG\n"
                                                         "s name2.chr1 10 5 + 100 ATGTg\n"
                                                         "s name3.chr@ 0 5 + 20 aaccg\n"
                                                         , 3);
    stHash *wantRows = createBlockHashFromString("a score=0\n"
                                                 "s reference.chr0 0 18 + 158545518 gcagctgaaaaca--ACGTA\n"
                                                 "s name.chr1 0 17 + 100 ATGT---ATGCCGacgTcGG\n"
                                                 "s name2.chr1 0 15 + 100 ATGT---ATGCCG--ATGTg\n"
                                                 "s name3.chr@ 0 5 + 20 ---------------aaccg\n",
                                                 wantOrder
                                                 );

    // Backing sequences for every row that appears in the blocks above.
    stHash *sequences = createSeqHashFromString("name.chr1", "ATGTATGCCGacgTc"
                                                "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                                "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString("gcagctgaaaacaACGTA"
                                                 "tttttttttttttttttttttttttttttttt"
                                                 "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(sequences, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString("ATGTATGCCGATGTg"
                                             "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                                             "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(sequences, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString("aaccgTTTTTTTTTTTTTTT");
    stHash_insert(sequences, stString_copy("name3.chr@"), name3Seq);

    addMafBlockToRowHash(gotRows, sequences, gotOrder, incoming, opts);

    CuAssertTrue(testCase, hashesAreEqual(gotRows, wantRows));
    CuAssertTrue(testCase, listsAreEqual(gotOrder, wantOrder));

    // clean up
    stHash_destruct(gotRows);
    stHash_destruct(wantRows);
    stHash_destruct(sequences);
    stList_destruct(gotOrder);
    stList_destruct(wantOrder);
    maf_destroyMafBlockList(incoming);
    destroyOptions(opts);
}
// Build a new maf block holding only columns [l, r] (inclusive) of block b.
//
//   b           the input maf block
//   l           the left index in the sequence field, the start of inclusion
//   r           the right index in the sequence field, the stop of inclusion
//   offsetArray one two-element entry per sequence line of b, updated in place:
//               [si][0] is the column position reached so far in that row and
//               [si][1] is the count of non-gap characters passed so far, with
//               -1 meaning "not set yet".
//
// Returns b itself (no copy) when [l, r] spans the whole sequence field.
// Otherwise returns a freshly allocated block whose head line is
// "a score=0 mafExtractor_splicedBlock=true". Sequence rows that contain only
// gaps inside [l, r] are left out of the new block. Non-sequence lines are
// copied verbatim.
mafBlock_t *spliceBlock(mafBlock_t *b, uint64_t l, uint64_t r, int64_t **offsetArray) {
    if ((l == 0) && (r == maf_mafBlock_getSequenceFieldLength(b) - 1)) {
        // the requested range is the entire block: hand back the input untouched
        return b;
    }
    // printf("spliceBlock(l=%"PRIu64", r=%"PRIu64")\n", l, r);
    mafBlock_t *mb = maf_newMafBlock();
    // ml1 walks the lines of the input block, ml2 the lines of the output block
    mafLine_t *ml1 = NULL, *ml2 = NULL;
    ml1 = maf_mafBlock_getHeadLine(b);
    uint64_t lineNumber = maf_mafLine_getLineNumber(ml1);
    uint64_t numberOfSequences = maf_mafBlock_getNumberOfSequences(b);
    // precondition: no row's column position may already be to the right of l
    for (uint64_t i = 0; i < numberOfSequences; ++i) {
        assert(offsetArray[i][0] <= (int64_t)l);
    }
    assert(r < maf_mafBlock_getSequenceFieldLength(b));
    // step past the input head line; the output gets a synthetic one instead
    ml1 = maf_mafLine_getNext(ml1);
    maf_mafBlock_setHeadLine(mb, maf_newMafLineFromString("a score=0 mafExtractor_splicedBlock=true", lineNumber));
    ml2 = maf_mafBlock_getHeadLine(mb);
    maf_mafBlock_setLineNumber(mb, lineNumber);
    maf_mafBlock_incrementNumberOfLines(mb);
    uint64_t len;
    char *seq = NULL;
    bool prevLineUsed = true; // used when a mafline is dropped because its length becomes 0
    bool emptyBlock = true; // stays true until at least one sequence row is emitted
    uint64_t si = 0; // sequence index, for addressing into offsetArray
    while (ml1 != NULL) {
        // loop through all maf lines in a block
        if (prevLineUsed) {
            // advance ml2
            maf_mafBlock_setTailLine(mb, ml2); // assign this first, in case last line is not used.
            maf_mafLine_setNext(ml2, maf_newMafLine());
            ml2 = maf_mafLine_getNext(ml2);
        }
        if (maf_mafLine_getLine(ml1) == NULL) {
            // input line has no text: skip it and reuse the current ml2 slot
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            continue;
        }
        maf_mafLine_setType(ml2, maf_mafLine_getType(ml1));
        maf_mafLine_setLineNumber(ml2, ++lineNumber);
        maf_mafBlock_incrementNumberOfLines(mb);
        maf_mafBlock_incrementLineNumber(mb);
        if (maf_mafLine_getType(ml2) != 's') {
            // copy the line over, move on to next ml
            maf_mafLine_setLine(ml2, de_strdup(maf_mafLine_getLine(ml1)));
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = true;
            continue;
        }
        maf_mafBlock_incrementNumberOfSequences(mb);
        seq = maf_mafLine_getSequence(ml1);
        len = maf_mafBlock_getSequenceFieldLength(b);
        if (maf_mafBlock_getSequenceFieldLength(mb) == 0) {
            maf_mafBlock_setSequenceFieldLength(mb, len);
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] pre: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // find first non-gap position OR the left edge, update offset
        while (seq[offsetArray[si][0]] == '-' && offsetArray[si][0] <= (int64_t)l) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] >= 0) {
                // if we've advanced to a non-gap char and the non-gap offset
                // has already been set (it is >= 0, i.e. these aren't simply
                // leading gaps), then advance the offset
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-initial gap / left edge discovery: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // we normally ignore the initial value because it's already been set.
        // however if -1 it must be set
        if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] == -1) {
            offsetArray[si][1] = 0;
        }
        // offsets
        for (int64_t i = offsetArray[si][0] + 1; i <= (int64_t)l; ++i) {
            // figure out the non-gap offset for the splice-in point, `l'
            offsetArray[si][0] = i; // local pos
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-gaps: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        bool allGaps = true;
        for (uint64_t i = l; i <= r; ++i) {
            if (seq[i] != '-') {
                allGaps = false;
                break;
            }
        }
        if (allGaps) {
            // this sequence is all gaps in this region, exclude it.
            // undo the per-line counters that were bumped above for this row
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            maf_mafBlock_decrementLineNumber(mb);
            maf_mafBlock_decrementNumberOfLines(mb);
            maf_mafBlock_decrementNumberOfSequences(mb);
            --lineNumber;
            ++si;
            continue;
        }
        // Walk up beyond the `l' point if the left edge falls on a gap character
        while (seq[offsetArray[si][0]] == '-' && offsetArray[si][0] < (int64_t) r) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-') { // && offsetArray[si][1] >= 0) {
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] walk past left: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // ensure local offset is set properly
        // seqCoords is the amount added to the input row's start to get the new row's start
        int64_t seqCoords = offsetArray[si][1];
        if (offsetArray[si][1] == -1) {
            seqCoords = 0;
            if (seq[offsetArray[si][0]] != '-') {
                offsetArray[si][1] = 0;
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] seqCoords:%"PRIi64" set properly: %s\n", offsetArray[si][0], offsetArray[si][1], seqCoords, maf_mafLine_getSpecies(ml1));
        maf_mafLine_setStart(ml2, maf_mafLine_getStart(ml1) + seqCoords);
        // update offsetArray: carry the position and non-gap count through to r
        for (uint64_t i = offsetArray[si][0] + 1; i <= r; ++i) {
            offsetArray[si][0] = i;
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post update: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // fill in the output row: the [l, r] slice of the sequence plus derived fields
        maf_mafLine_setSequence(ml2, de_strndup(seq + l, 1 + r - l));
        maf_mafLine_setLength(ml2, countNonGaps(maf_mafLine_getSequence(ml2)));
        maf_mafBlock_setSequenceFieldLength(mb, maf_mafLine_getSequenceFieldLength(ml2));
        maf_mafLine_setStrand(ml2, maf_mafLine_getStrand(ml1));
        maf_mafLine_setSourceLength(ml2, maf_mafLine_getSourceLength(ml1));
        maf_mafLine_setSpecies(ml2, de_strdup(maf_mafLine_getSpecies(ml1)));
        maf_mafLine_setLine(ml2, maf_mafLine_imputeLine(ml2));
        prevLineUsed = true;
        ml1 = maf_mafLine_getNext(ml1);
        ++si;
        emptyBlock = false;
    }
    maf_mafBlock_incrementLineNumber(mb); // extra \n at the end of a block
    if (prevLineUsed) {
        maf_mafBlock_setTailLine(mb, ml2);
    }
    if (!emptyBlock) {
        return mb;
    } else {
        // this condition should never be tripped, we have a condition set up
        // in the "process" wrapper above this function.
        // NOTE(review): with assertions compiled out (NDEBUG) the block is
        // destroyed and NULL is returned instead of aborting.
        printf("block was empty, this should never happen in spliceBlock()\n");
        assert(2 + 2 == 5);
        maf_destroyMafBlockList(mb);
        return NULL;
    }
}
// Walk mafBlock_t b and splice out every maximal run of consecutive columns
// flagged by getTargetColumns() for sequence `seq` over [start, stop].
//
//   b           block to splice; never destroyed here.
//   blockNumber ordinal of b, used to build the " splice_id=<block>_<splice>" tag.
//   seq         sequence name handed to getTargetColumns().
//   start, stop range handed to getTargetColumns().
//   store       true  (used in unit tests): the sub-blocks are chained with the
//                     block linked-list feature and the head is returned.
//               false (used in production): each sub-block is printed with
//                     maf_mafBlock_print() as it is produced and immediately
//                     destroyed; NULL is returned.
//
// Returns the head of the stored list, or NULL when nothing was stored.
// spliceBlock() may return b itself; such a result is neither tagged nor
// destroyed here.
// NOTE(review): in store mode a sub-block is tagged only when its successor
// is linked, so the last sub-block of the list is left without a splice_id.
// That is the pre-existing behaviour and is preserved.
mafBlock_t *processBlockForSplice(mafBlock_t *b, uint64_t blockNumber, const char *seq,
                                  uint64_t start, uint64_t stop, bool store) {
    bool *targetColumns = NULL;
    uint64_t len = 0;
    mafBlock_t *head = NULL, *mb = NULL;
    // Only the flags and their count are needed; the return value is not used.
    (void) getTargetColumns(&targetColumns, &len, b, seq, start, stop);
    int64_t **offsets = createOffsets(maf_mafBlock_getNumberOfSequences(b));
    uint64_t l = 0, r = 0, ri = 0;
    uint64_t spliceNumber = 0;
    char *id = (char*) de_malloc(kMaxStringLength);
    while (l < len) {
        if (!targetColumns[l]) {
            // find the left most element
            ++l;
            r = l;
            continue;
        }
        // find the end of the right. The bounds check comes first so that
        // targetColumns[len] is never read when the run reaches the last column.
        while (r < len && targetColumns[r]) {
            ++r;
        }
        // set ri equal to the index of the last element
        ri = r - 1;
        if (store) {
            // used in unit tests
            if (head == NULL) {
                head = spliceBlock(b, l, ri, offsets);
                mb = head;
            } else {
                if (mb != b) {
                    // manipulated blocks should have this extra tag attached
                    snprintf(id, kMaxStringLength, " splice_id=%" PRIu64 "_%" PRIu64, blockNumber, spliceNumber);
                    maf_mafBlock_appendToAlignmentBlock(mb, id);
                }
                maf_mafBlock_setNext(mb, spliceBlock(b, l, ri, offsets));
                mb = maf_mafBlock_getNext(mb);
                ++spliceNumber;
            }
        } else {
            // used in production
            mb = spliceBlock(b, l, ri, offsets);
            if (mb != b) {
                snprintf(id, kMaxStringLength, " splice_id=%" PRIu64 "_%" PRIu64, blockNumber, spliceNumber);
                maf_mafBlock_appendToAlignmentBlock(mb, id);
            }
            maf_mafBlock_print(mb);
            if (mb != b) {
                // b belongs to the caller; only free blocks created by the splice
                maf_destroyMafBlockList(mb);
            }
            ++spliceNumber;
        }
        // continue the scan from one past the run just handled
        l = r;
        if (l == len - 1) {
            break;
        }
    }
    // clean up
    free(id);
    free(targetColumns);
    destroyOffsets(offsets, maf_mafBlock_getNumberOfSequences(b));
    return head;
}
// Unit test for addMafBlockToRowHash(): concatenation with a sequence
// breakpoint due to *strand* alone.
// note that name3 is well within the interstitial boundary, the two blocks
// essentially looking like >>>>>>>>>>>>> <<<<< (strand diffs)
//
// The seeded name3.chr1 row is on the '+' strand and the added block carries
// name3.chr1 on the '-' strand; the expected name3 row contains a run of ten
// 'N' characters between the two pieces.
static void test_addBlockToHash_4(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
                                                     "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
                                                     "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
                                                     "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
                                                     observedList
                                                     );
    mafBlock_t *mb = maf_newMafBlockListFromString("a score=0 test\n"
                                                   "s reference.chr0 13 5 + 158545518 ACGTA\n"
                                                   "s name.chr1 12 5 + 100 gtcGG\n"
                                                   "s name2.chr1 10 5 + 100 ATGTg\n"
                                                   "s name3.chr1 82 5 - 100 GGGGG\n"
                                                   , 3);
    stHash *expectedHash = createBlockHashFromString("a score=0\n"
                                                     "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
                                                     "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
                                                     "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
                                                     "s name3.chr1 0 28 + 100 GCAGCTGAAAACA--NNNNNNNNNNGGGGG\n",
                                                     expectedList
                                                     );
    // Patch up the bookkeeping fields of the expected name3 row by hand.
    row_t *r = stHash_search(expectedHash, "name3");
    // Fail the test cleanly instead of dereferencing NULL if the row is missing.
    CuAssertTrue(testCase, r != NULL);
    r->prevRightPos = 86;
    r->strand = '*';
    r->prevStrand = '-';
    // Backing sequences for every row that appears in the blocks above.
    stHash *seqHash = createSeqHashFromString("name.chr1", "ATGTATGCCGacgtc"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
                                              "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *mtfs = newMtfseqFromString("gcagctgaaaacaACGTA"
                                         "tttttttttttttttttttttttttttttttt"
                                         "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), mtfs);
    mtfs = newMtfseqFromString("ATGTATGCCGATGTg"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
                               "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), mtfs);
    mtfs = newMtfseqFromString("GCAGCTGAAAACACCCCCgggggggggggggggggggggggggggggggg"
                               "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
                               );
    stHash_insert(seqHash, stString_copy("name3.chr1"), mtfs);
    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));
    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
// Test helper: parse `input` as maf text, convert the parsed block into a
// row hash with mafBlockToBlockHash() (which is also given `orderList`), and
// return that hash. The intermediate parsed block is destroyed before
// returning; the caller owns the hash.
static stHash *createBlockHashFromString(char *input, stList *orderList) {
    mafBlock_t *parsed = maf_newMafBlockListFromString(input, 3);
    stHash *rowHash = mafBlockToBlockHash(parsed, orderList);

    // the hash is built; the parsed block is released here
    maf_destroyMafBlockList(parsed);
    return rowHash;
}