// Join traces before and after the seed together struct trace gappedExtension_joinTraces(struct trace beforeTrace, struct trace afterTrace) { struct trace joinedTrace; unsigned char *traceCodes; int4 count; // Joined trace will have length equal to sum of lengths of the two traces, // and will start where the beforeTrace starts joinedTrace.length = beforeTrace.length + afterTrace.length; joinedTrace.queryStart = beforeTrace.queryStart; joinedTrace.subjectStart = beforeTrace.subjectStart; // Add memory to space already allocated by before traceCodes traceCodes = (unsigned char *)global_realloc( beforeTrace.traceCodes, sizeof(unsigned char) * joinedTrace.length); // Add after trace codes to end in reverse order count = 0; while (count < afterTrace.length) { traceCodes[beforeTrace.length + count] = afterTrace.traceCodes[afterTrace.length - count - 1]; count++; } joinedTrace.traceCodes = traceCodes; return joinedTrace; }
// Record every word-sized window of one subject sequence in the lookup table
// of database block blockNum.
//
// sequenceOffset: value stored as seqId with every recorded position
// sequence:       codes of the subject; subjectLength entries are read
// subjectLength:  number of codes in sequence
// wordLength:     number of codes per word
// blockNum:       selects the table starting at
//                 proteinLookup_db + blockNum * proteinLookup_numWords
//
// For each window the codeword is built in base wordLookupDFA_numCodes; a
// code that is >= wordLookupDFA_numCodes contributes zero to the codeword.
// The window's start position is appended to that codeword's position list,
// which grows to 10 entries on first use and doubles afterwards.
void proteinLookup_db_sub(uint4 sequenceOffset, unsigned char *sequence,
                          int4 subjectLength, int4 wordLength, int blockNum)
{
    uint4 codeword, codeCount, queryPosition, numWindows, wordSize;
    struct initialWord_protein_db *initialLookup, *initialWord;
    subPos_t *slot;

    // A sequence shorter than one word contains no complete window. Without
    // this check the signed window count would go negative and, compared
    // against the unsigned loop counter, turn into a huge bound that walks
    // far past the end of the sequence.
    if (wordLength < 0 || subjectLength < wordLength)
        return;

    wordSize = (uint4)wordLength;
    numWindows = (uint4)(subjectLength - wordLength + 1);
    initialLookup = proteinLookup_db + blockNum * proteinLookup_numWords;

    // Slide a word-sized window across the subject
    for (queryPosition = 0; queryPosition < numWindows; queryPosition++)
    {
        // Build the codeword for the window starting at queryPosition
        codeword = 0;
        for (codeCount = 0; codeCount < wordSize; codeCount++)
        {
            codeword *= wordLookupDFA_numCodes;
            if (sequence[queryPosition + codeCount] < wordLookupDFA_numCodes)
                codeword += sequence[queryPosition + codeCount];
        }

        initialWord = initialLookup + codeword;
        initialWord->numSubPositions++;

        // Grow the position list when it is full
        if (initialWord->numSubPositions > initialWord->allocSubPositions)
        {
            initialWord->allocSubPositions =
                initialWord->allocSubPositions == 0
                    ? 10
                    : 2 * initialWord->allocSubPositions;
            initialWord->subSequencePositions = (subPos_t *)global_realloc(
                initialWord->subSequencePositions,
                sizeof(subPos_t) * initialWord->allocSubPositions);
        }

        // Append this occurrence to the end of the list
        slot = initialWord->subSequencePositions +
               (initialWord->numSubPositions - 1);
        slot->subOff = queryPosition;
        slot->seqId = sequenceOffset;
    }
}
// Get a run of consecutive entries from the block void *memBlocks_newEntries(struct memBlocks *memBlocks, uint4 numNewEntries) { void *newEntry; // Check if we need to create a new block of memory if (memBlocks->numEntries[memBlocks->numBlocks - 1] + numNewEntries > memBlocks->blockSizes) { // Declare memory for the new block memBlocks->lastBlock = (void *)global_malloc(memBlocks->entrySize * memBlocks->blockSizes); // Check if we need more memory for block pointers if (memBlocks->numBlocks >= memBlocks->maxNumBlocks) { // Allocate more memBlocks->maxNumBlocks *= 2; memBlocks->blocks = (void **)global_realloc( memBlocks->blocks, sizeof(void *) * memBlocks->maxNumBlocks); memBlocks->numEntries = (int4 *)global_realloc( memBlocks->numEntries, sizeof(int4) * memBlocks->maxNumBlocks); } // Store the address of this new block memBlocks->blocks[memBlocks->numBlocks] = memBlocks->lastBlock; // Reset number of entries in this block memBlocks->numEntries[memBlocks->numBlocks] = 0; memBlocks->numBlocks++; } // Use the next available slot in the latest block newEntry = ((char *)(memBlocks->lastBlock)) + memBlocks->numEntries[memBlocks->numBlocks - 1] * memBlocks->entrySize; memBlocks->numEntries[memBlocks->numBlocks - 1] += numNewEntries; memBlocks->numTotalEntries += numNewEntries; return newEntry; }
// Add a given word position to the index void index_addWord(uint4 codeword, uint4 subjectNumber, uint4 offset) { struct wordList* wordList; unsigned char* vbyteEncoded; uint4 sequenceGap, offsetGap, vbyteSize; wordList = index_words + codeword; // Encoded the difference between this sequence number and the last sequenceGap = subjectNumber - wordList->lastSequenceNumber; vbyteEncoded = index_vbyteEncoded; vbyte_putVbyte(vbyteEncoded, sequenceGap); // If last occurrence was in the same sequence, store difference in offet if (sequenceGap == 0) offsetGap = offset - wordList->lastOffset; else // Else store absolute offset offsetGap = offset; vbyte_putVbyte(vbyteEncoded, offsetGap); vbyteSize = vbyteEncoded - index_vbyteEncoded; // If we need more space in the list of offsets if (wordList->length + vbyteSize >= wordList->allocated) { wordList->allocated += vbyteSize + 10; wordList->offsets = (unsigned char*)global_realloc(wordList->offsets, sizeof(char) * wordList->allocated); } // Record encoded offset memcpy(wordList->offsets + wordList->length, index_vbyteEncoded, vbyteSize); wordList->lastOffset = offset; wordList->lastSequenceNumber = subjectNumber; wordList->length += vbyteSize; }
// Extend the end of a region if necessary void unpack_extendRegionEnd(int4 position, struct unpackRegion *unpackRegion) { unsigned char *newUnpackedSubject; int4 newRegionStart, newRegionEnd; // printf("pos=%d region=%d,%d subjectLength=%d\n", position, // unpackRegion->startOffset, // unpackRegion->endOffset, unpackRegion->subjectLength); // fflush(stdout); if (position > unpackRegion->endOffset) { // Extend the region end newRegionStart = unpackRegion->startOffset; newRegionEnd = unpackRegion->endOffset + constants_unpackRegionExtend; if (newRegionEnd > unpackRegion->subjectLength) newRegionEnd = unpackRegion->subjectLength; // Realloc memory for the new region unpackRegion->unpackedSubject += unpackRegion->startOffset; newUnpackedSubject = (unsigned char *)global_realloc( unpackRegion->unpackedSubject, sizeof(char) * (newRegionEnd - newRegionStart)); newUnpackedSubject -= newRegionStart; // Round old end unpackRegion->endOffset = (unpackRegion->endOffset / 4) * 4; // Unpack the new part of the region encoding_byteUnpackRegion(newUnpackedSubject + unpackRegion->endOffset, unpackRegion->subject + (unpackRegion->endOffset / 4), newRegionEnd - unpackRegion->endOffset); unpackRegion->unpackedSubject = newUnpackedSubject; unpackRegion->endOffset = newRegionEnd; } }
int4 main(int argc, char* argv[]) { char *sequence, *filename; uint4 sequenceLength; int4 totalWilds = 0, alphabetType; struct memSingleBlock* wildcardEdits; struct wildcardEdit* wildcardEdit; char *wildcardData = NULL, *startWildcardData = NULL; // User must provide FASTA format file at command line if (argc < 2) { fprintf(stderr, "Useage: formatdb <FASTA file>\n"); exit(-1); } filename = argv[1]; // Initialize array to store wildcard edits wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10); // Determine if database is protein or nucleotide alphabetType = determineDbAlphabetType(filename); if (alphabetType == encoding_protein) { printf("PROTEIN database detected.\n"); } else if (alphabetType == encoding_nucleotide) { printf("NUCLEOTIDE database detected.\n"); } // Initialize codes array encoding_initialize(alphabetType); // Initialize writing to formatted database writedb_initialize(filename, alphabetType); // Open FASTA file for reading readFasta_open(filename); printf("Formatting database..."); fflush(stdout); // Move through the FASTA file reading descriptions and sequences while (readFasta_readSequence()) { // Get sequence just read sequence = readFasta_sequenceBuffer; sequenceLength = readFasta_sequenceLength; // Encode the sequence encoding_encodeSequence(sequence, sequenceLength, alphabetType); // Convert nucleotide sequences to byte-packed format if (alphabetType == encoding_nucleotide) { // Replace any wilds with a random character totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength); // Declare memory to hold wildcard data startWildcardData = global_realloc(startWildcardData, sizeof(char) * wildcardEdits->numEntries * 5); wildcardData = startWildcardData; // For each wildcard edit, encode details using chars and vbytes memSingleBlock_resetCurrent(wildcardEdits); while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL) { // Record wild character *wildcardData = wildcardEdit->code; 
wildcardData++; // Convert the position to a vbyte vbyte_putVbyte(wildcardData, wildcardEdit->position); } } else { startWildcardData = wildcardData = NULL; } // printf("[%s](%d)", readFasta_descriptionBuffer, readFasta_descriptionLength); fflush(stdout); // Add sequence to the formatted collection writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer, readFasta_descriptionLength, startWildcardData, wildcardData - startWildcardData, NULL, 0); // Print status dots if (writedb_sequenceCount % 10000 == 0) { printf("."); fflush(stdout); } } // Close fasta reader readFasta_close(); // Finalize writing to the formatted collection writedb_close(); printf("done.\n"); printf("%d sequences processed.\n", writedb_sequenceCount); printf("%llu letters processed.\n", writedb_numberOfLetters); printf("%d wildcards encoded.\n", totalWilds); printf("%d volume(s) created.\n", writedb_volume + 1); printf("Longest/shortest sequence was %d/%d letters\n", writedb_maximumSequenceLength, writedb_minimumSequenceLength); fflush(stdout); return 0; }