// Perform an ungapped extension between points queryStart,subjectStart and queryEnd,subjectEnd // and extend in each direction until score drops below best score yet minus a dropoff parameter // Shucai struct ungappedExtension* ungappedExtension_extend(int4 queryoffset, unsigned char* subjectHit, uint4 lastHitFP, struct PSSMatrix PSSMatrix, struct PSSMatrixFP PSSMatrixFP, unsigned char* subject, unsigned char *startAddressFP) { //Shucai int2 *queryPosition; unsigned char *subjectPosition, *subjectStart, *subjectEnd; int4 changeSinceBest = 0; int4 dropoff, originalDropoff; originalDropoff = dropoff = -statistics_ungappedNominalDropoff; ungappedExtension_bestScore = 0; // Start at queryEnd,subjectEnd (right/last hit position) queryPosition = PSSMatrixFP.matrix + queryoffset * encoding_numCodes; subjectPosition = subjectStart = subjectHit; // Extend the start of the hit backwards until dropoff while (changeSinceBest > dropoff) { //changeSinceBest += (*queryPosition)[*subjectPosition]; changeSinceBest += queryPosition[*subjectPosition]; // If we have got a positive score if (changeSinceBest > 0) { // Keep updating best score and resetting change-since-best // whilst we are reading positive scores do { ungappedExtension_bestScore += changeSinceBest; //Shucai queryPosition -= encoding_numCodes; subjectPosition--; //Shucai changeSinceBest = queryPosition[*subjectPosition]; } while (changeSinceBest > 0); subjectStart = subjectPosition; } //Shucai queryPosition -= encoding_numCodes; subjectPosition--; } // Correct for extra decrement subjectStart++; // If best start point is right of previous hit which helped trigger this extension // then stop now // Shucai //if (subjectStart - startAddressFP > lastHitFP) if (subjectStart - subject > lastHitFP) { //Shucai //ungappedExtension_subjectEndReachedFP = subjectHit - startAddressFP; ungappedExtension_subjectEndReachedFP = subjectHit - subject; return NULL; } // Starting at right/last hit position again //Shucai queryPosition = PSSMatrixFP.matrix + (queryoffset + 1) * encoding_numCodes; subjectEnd = subjectHit; subjectPosition = subjectHit + 1; changeSinceBest = 0; // May need to alter dropoff so we also dropoff if below zero if (-ungappedExtension_bestScore > originalDropoff) { dropoff = -ungappedExtension_bestScore; } // Extend end of alignment until dropoff while (changeSinceBest > dropoff) { //Shucai changeSinceBest += queryPosition[*subjectPosition]; // If we have got a positive score if (changeSinceBest > 0) { // Keep updating best score and resetting change-since-best // whilst we are reading positive scores do { ungappedExtension_bestScore += changeSinceBest; //Shucai queryPosition += encoding_numCodes; subjectPosition++; //Shucai changeSinceBest = queryPosition[*subjectPosition]; } while (changeSinceBest > 0); subjectEnd = subjectPosition; // Check need for change in dropoff if ((dropoff = -ungappedExtension_bestScore) < originalDropoff) { dropoff = originalDropoff; } } //Shucai queryPosition += encoding_numCodes; subjectPosition++; } // Correct for extra increment subjectEnd--; //Shucai //ungappedExtension_subjectEndReachedFP = subjectEnd - startAddressFP; ungappedExtension_subjectEndReachedFP = subjectEnd - subject; // If extension scored above trigger for gapping, create object and return it if (ungappedExtension_bestScore >= blast_ungappedNominalTrigger) { int4 diagonal; struct ungappedExtension* newUngappedExtension; newUngappedExtension = memBlocks_newEntry(ungappedExtension_extensions); // Calculate diagonal // Shucai diagonal = (subjectHit - subject) - queryoffset; // Determine offsets from pointers newUngappedExtension->start.subjectOffset = subjectStart - subject; newUngappedExtension->end.subjectOffset = subjectEnd - subject; newUngappedExtension->start.queryOffset = newUngappedExtension->start.subjectOffset - diagonal; newUngappedExtension->end.queryOffset = newUngappedExtension->end.subjectOffset - diagonal; // Find the seed point newUngappedExtension->seed = ungappedExtension_findProteinSeed(newUngappedExtension, PSSMatrix, PSSMatrixFP, subject); // Initialize next to null newUngappedExtension->next = NULL; newUngappedExtension->nominalScore = ungappedExtension_bestScore; newUngappedExtension->status = ungappedExtension_UNGAPPED; return newUngappedExtension; } else { return NULL; } }
// Perform one-hit seeded ungapped extension for nucleotide, 1 packed-byte at a time struct ungappedExtension* ungappedExtension_nucleotideExtend(int4 queryHitOffset, int4 subjectHitOffset, struct PSSMatrix PSSMatrix, unsigned char* subject, uint4 subjectLength) { unsigned char* queryPosition, *minQueryPosition, *maxQueryPosition; unsigned char* subjectPosition, *subjectStart, *subjectEnd; int4 dropoff, originalDropoff; int4 changeSinceBest = 0; int4 matchLettersScore; originalDropoff = dropoff = -statistics_ungappedNominalDropoff; // Start with score for lookup-table nucleotide match that is not aligned ungappedExtension_bestScore = ungappedExtension_tableMatchesReward; // Determine minimum query position; either start of the query or start of the second strand if (queryHitOffset <= PSSMatrix.strandLength) { if (queryHitOffset < subjectHitOffset * 4) minQueryPosition = PSSMatrix.bytePackedCodes; else minQueryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - subjectHitOffset * 4; } else { if (queryHitOffset - PSSMatrix.strandLength < subjectHitOffset * 4) minQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.strandLength; else minQueryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - subjectHitOffset * 4; } // Start left of hit location queryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - parameters_wordTableLetters - 4; subjectPosition = subjectStart = subject + subjectHitOffset - parameters_wordTableBytes - 1; // Consider partial match of first byte before hit matchLettersScore = PSSMatrix_packedLeftMatchScores[*queryPosition ^ *subjectPosition]; ungappedExtension_bestScore += matchLettersScore; changeSinceBest = -matchLettersScore; // Move back through alignment until start of query or subject, or until dropoff while (queryPosition > minQueryPosition) { // Add score of matching entire bytes changeSinceBest += PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) { printf("<%d< ", PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]); printf("["); encoding_printLetters(*queryPosition, 4); printf(","); encoding_printLetters(*subjectPosition, 4); printf("]\n"); } #endif // If we possibly have a new best score if (changeSinceBest > ungappedExtension_minus3reward) { // Get score for matching individual letters in next byte queryPosition-=4; subjectPosition--; matchLettersScore = PSSMatrix_packedLeftMatchScores[*queryPosition ^ *subjectPosition]; // If best score if (changeSinceBest + matchLettersScore > 0) { // Mark new best position subjectStart = subjectPosition; // Update best score and change since best ungappedExtension_bestScore += changeSinceBest + matchLettersScore; changeSinceBest = -matchLettersScore; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) printf("(Best=%d)\n", ungappedExtension_bestScore); #endif } } else { // Decrease in score, check dropoff if (changeSinceBest < dropoff) break; queryPosition-=4; subjectPosition--; } } // Determine maximum query position; either end of the query or end of the first strand if (queryHitOffset <= PSSMatrix.strandLength) { if (PSSMatrix.strandLength - queryHitOffset < subjectLength - subjectHitOffset * 4) maxQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.strandLength - 4; else maxQueryPosition = PSSMatrix.bytePackedCodes + (subjectLength - subjectHitOffset * 4) + queryHitOffset - 4; } else { if (PSSMatrix.length - queryHitOffset < subjectLength - subjectHitOffset * 4) maxQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.length - 4; else maxQueryPosition = PSSMatrix.bytePackedCodes + (subjectLength - subjectHitOffset * 4) + queryHitOffset - 4; } // Starting right of hit position queryPosition = PSSMatrix.bytePackedCodes + queryHitOffset; subjectPosition = subjectEnd = subject + subjectHitOffset; changeSinceBest = 0; // May need to alter dropoff so we also dropoff if below zero if (-ungappedExtension_bestScore > originalDropoff) { dropoff = -ungappedExtension_bestScore; } // Consider partial match of first byte after hit matchLettersScore = PSSMatrix_packedRightMatchScores[*queryPosition ^ *subjectPosition]; ungappedExtension_bestScore += matchLettersScore; changeSinceBest = -matchLettersScore; // Move forward through alignment until end of query or subject, or until dropoff while (queryPosition < maxQueryPosition) { // Score of matching entire bytes changeSinceBest += PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) { printf(">%d> ", PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]); printf("["); encoding_printLetters(*queryPosition, 4); printf(","); encoding_printLetters(*subjectPosition, 4); printf("]\n"); printf("changeSinceBest=%d\n", changeSinceBest); } #endif // If we possibly have a new best score if (changeSinceBest > ungappedExtension_minus3reward) { // Get score for matching individual letters in next byte queryPosition+=4; subjectPosition++; matchLettersScore = PSSMatrix_packedRightMatchScores[*queryPosition ^ *subjectPosition]; // If best score if (changeSinceBest + matchLettersScore > 0) { // Mark new best position subjectEnd = subjectPosition; // Update best score and change since best ungappedExtension_bestScore += changeSinceBest + matchLettersScore; changeSinceBest = -matchLettersScore; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) printf("(Best=%d)\n", ungappedExtension_bestScore); #endif } } else { // Decrease in score, check dropoff if (changeSinceBest < dropoff) break; queryPosition+=4; subjectPosition++; } } // Record the point we got to extending forwards ungappedExtension_subjectEndReached = subjectPosition; // If extension scored above trigger for gapping, create object and return it if (ungappedExtension_bestScore >= blast_ungappedNominalTrigger) { int4 diagonal; struct ungappedExtension* newUngappedExtension; newUngappedExtension = memBlocks_newEntry(ungappedExtension_extensions); // Correct for extra decrement subjectStart++; // Correct for extra increment subjectEnd--; // Calculate diagonal diagonal = subjectHitOffset * 4 - queryHitOffset; // Determine offsets from pointers newUngappedExtension->start.subjectOffset = (subjectStart - subject) * 4; newUngappedExtension->end.subjectOffset = (subjectEnd - subject) * 4; newUngappedExtension->start.queryOffset = newUngappedExtension->start.subjectOffset - diagonal; newUngappedExtension->end.queryOffset = newUngappedExtension->end.subjectOffset - diagonal; newUngappedExtension->seed.queryOffset = -1; newUngappedExtension->seed.subjectOffset = -1; // Initialize next to null newUngappedExtension->next = NULL; newUngappedExtension->nominalScore = ungappedExtension_bestScore; newUngappedExtension->status = ungappedExtension_UNGAPPED; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) { printf("Hit=%d,%d\n", queryHitOffset, subjectHitOffset); printf("%d,%d - %d,%d\n", newUngappedExtension->start.queryOffset, newUngappedExtension->start.subjectOffset, newUngappedExtension->end.queryOffset, newUngappedExtension->end.subjectOffset); fflush(stdout); printf("seed=%d,%d\n", newUngappedExtension->seed.queryOffset, newUngappedExtension->seed.subjectOffset); } #endif return newUngappedExtension; } else { return NULL; } }
// Given a query sequence uses an inverted index of the collection to identify the // sequence number and offset of all hits between the query and the collection void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix, uint4 numSequences) { uint4 queryPosition, codeword = 0, queryPosition4; unsigned char* offsets, *endOffsets; uint4 offsetGap, offset, sequenceGap, sequenceNumber; struct indexCoordinate* coordinate; struct memBlocks* unsortedCoordinates; uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets; uint4 time, wordPosition, containsWildcard; struct queryWord* queryWords; // Read word and interval size from start of index vbyte_getVbyte(startIndex, &index_wordSize); vbyte_getVbyte(startIndex, &index_intervalSize); index_numWords = pow(4, index_wordSize); index_sequencePositions = (uint4*)startIndex; index_descriptionLocations = index_sequencePositions + numSequences; index_loadedWords = index_descriptionLocations + numSequences; index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1); time = clock(); unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences); // Declare and initialize array for count number of hits for each sequence numSubjectHits = (uint*)global_malloc(sizeof(uint4) * numSequences); sequenceNumber = 0; while (sequenceNumber < numSequences) { numSubjectHits[sequenceNumber] = 0; sequenceNumber++; } // Memory to hold offsets string for each query word numQueryPositions = PSSMatrix.length - index_wordSize + 1; queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions); // For each word in the query queryPosition = 0; while (queryPosition < numQueryPositions) { // Check if the word contains a wildcard containsWildcard = 0; wordPosition = 0; while (wordPosition < index_wordSize) { if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters) containsWildcard = 1; wordPosition++; } // Don't include words that cross the strand boundry or contain wildcards if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength && queryPosition >= PSSMatrix.strandLength - index_wordSize + 1)) { // printf("--Query position=%d\n", queryPosition); // Get the codeword codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize); // Get wordlist for that codeword offsets = index_offsets + index_loadedWords[codeword]; endOffsets = index_offsets + index_loadedWords[codeword + 1]; queryWords[queryPosition].offsets = offsets; queryWords[queryPosition].endOffsets = endOffsets; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; // printf("codeword=%d start=%d end=%d numHits=%d\n", codeword, index_loadedWords[codeword], // index_loadedWords[codeword + 1], endOffsets - offsets); } else { queryWords[queryPosition].offsets = NULL; queryWords[queryPosition].endOffsets = NULL; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; } // printf("\n"); queryPosition++; } // Sort the query words by codeword qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword); // For each query word queryWordCount = 0; while (queryWordCount < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryWordCount].offsets != NULL) { // Make in-memory copy of list of offsets numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets; offsets = (char*)global_malloc(sizeof(char) * numOffsets); memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets); queryWords[queryWordCount].offsets = offsets; queryWords[queryWordCount].endOffsets = offsets + numOffsets; } queryWordCount++; } // Sort the query words by query position qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition); queryPosition = 0; while (queryPosition < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryPosition].offsets != NULL) { offsets = queryWords[queryPosition].offsets; endOffsets = queryWords[queryPosition].endOffsets; offset = 0; sequenceNumber = 0; queryPosition4 = queryPosition + (index_wordSize - 4); // Traverse the offsets while (offsets < endOffsets) { vbyte_getVbyte(offsets, (&sequenceGap)); vbyte_getVbyte(offsets, (&offsetGap)); // printf("[%d,%d]\n", sequenceGap, offsetGap); if (sequenceGap > 0) { offset = offsetGap; sequenceNumber += sequenceGap; } else { offset += offsetGap; } // printf(" %u", offset); // Add query/database coordinate of match to relevant bucket // printf("Sequence number=%d\n", sequenceNumber); coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates); coordinate->queryOffset = queryPosition4; coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4); coordinate->subjectNumber = sequenceNumber; numSubjectHits[sequenceNumber]++; // printf("[%d,%d]\n", queryPosition, offset); blast_numHits++; } free(queryWords[queryPosition].offsets); } queryPosition++; } printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); time = clock(); // Make memory for sorted list index_numCoordinates = unsortedCoordinates->numTotalEntries; index_coordinates = (struct indexCoordinate*)global_malloc( sizeof(struct indexCoordinate) * index_numCoordinates); index_sequenceCoordinates = (struct indexCoordinate**)global_malloc( sizeof(struct indexCoordinate*) * numSequences); // For each sequence coordinate = index_coordinates; sequenceNumber = 0; while (sequenceNumber < numSequences) { // If it has hits if (numSubjectHits[sequenceNumber] != 0) { // Point to location in sorted list of coordinates index_sequenceCoordinates[sequenceNumber] = coordinate; coordinate += numSubjectHits[sequenceNumber]; numSubjectHits[sequenceNumber] = 0; } sequenceNumber++; } // Move through list of unsorted coordinates memBlocks_resetCurrent(unsortedCoordinates); while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL) { sequenceNumber = coordinate->subjectNumber; // printf("%d,%d=[%d]\n", index_sequenceCoordinates[sequenceNumber], numSubjectHits[sequenceNumber], sequenceNumber); // Place into sorted list index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate; numSubjectHits[sequenceNumber]++; } memBlocks_free(unsortedCoordinates); /* // Print sorted coordinates coordinate = index_coordinates; while (coordinate < index_coordinates + index_numCoordinates) { printf("[%d]", coordinate); printf("Subject %d Offset %d,%d\n", coordinate->subjectNumber, coordinate->queryOffset, coordinate->subjectOffset); coordinate++; }*/ printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); }
// Unpack entire or sections of a subject sequence before gapped alignment void unpack_unpackSubject(struct PSSMatrix PSSMatrix, struct alignment *alignment) { unsigned char *subject, *unpackedSubject, wildcard, *edits, *endEdits; uint4 wildcardPosition; struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion, *unpackRegion; int4 regionStart, regionEnd, numRegions; // No need to unpack a protein subject, or already unpacked nucleotide subject if (parameters_ssearch || encoding_alphabetType == encoding_protein) { // Just create a single region covering the entire sequence firstRegion = memBlocks_newEntry(unpack_unpackRegions); firstRegion->startOffset = 0; firstRegion->endOffset = alignment->subjectLength; firstRegion->subject = alignment->subject; firstRegion->unpackedSubject = alignment->subject; firstRegion->subjectLength = alignment->subjectLength; alignment->unpackRegions = firstRegion; alignment->numUnpackRegions = 1; return; } // Get the subject regions for this alignment numRegions = unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions); lastRegion = memBlocks_getLastEntry(unpack_unpackRegions); lastRegion++; firstRegion = lastRegion - numRegions; // Sort the regions in order of start position qsort(firstRegion, lastRegion - firstRegion, sizeof(struct unpackRegion), unpack_compareUnpackRegions); // Unpack each region currentRegion = firstRegion; while (currentRegion < lastRegion) { regionEnd = currentRegion->endOffset; regionStart = currentRegion->startOffset; #ifdef VERBOSE if (parameters_verboseDloc == alignment->descriptionLocation) { printf("Unpack subject region %d to %d (length=%d)\n", regionStart, regionEnd, alignment->subjectLength); fflush(stdout); } #endif // Get the subject region to be unpacked if (alignment->unpackRegions == NULL) { subject = alignment->subject; } else { unpackRegion = unpack_selectRegion( alignment->unpackRegions, alignment->numUnpackRegions, regionStart); subject = unpackRegion->subject; } // Declare memory for the region unpackedSubject = (unsigned char *)global_malloc(sizeof(char) * (regionEnd - regionStart)); // Unpack the region of interest encoding_byteUnpackRegion(unpackedSubject, subject + (regionStart / 4), regionEnd - regionStart); unpackedSubject -= regionStart; currentRegion->unpackedSubject = unpackedSubject; currentRegion->subject = subject; currentRegion->subjectLength = alignment->subjectLength; blast_totalUnpacked += (regionEnd - regionStart); currentRegion++; } currentRegion = firstRegion; // Get wildcard edits for the sequence edits = alignment->edits; endEdits = alignment->edits + alignment->encodedLength - ((alignment->subjectLength + 3) / 4); // If there are edits if (edits < endEdits) { // Read first wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); // For each region in order of position in the subject while (currentRegion < lastRegion) { // Skip past edits that are before current region while (edits < endEdits && wildcardPosition < currentRegion->startOffset) { // Read wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); } // Process edits that are in the current region while (edits < endEdits && wildcardPosition < currentRegion->endOffset) { // Insert wildcard into sequence currentRegion->unpackedSubject[wildcardPosition] = wildcard; // Read next wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); } // Advance to the next region currentRegion++; } } alignment->unpackRegions = firstRegion; alignment->numUnpackRegions = lastRegion - firstRegion; }
// Add sequence to the formatted collection void writedb_addSequence_oid(unsigned char *sequence, uint4 sequenceLength, unsigned char *description, uint4 descriptionLength, unsigned char *wildcards, uint4 wildcardsLength, struct child *children, uint4 numChildren, uint4 oid) { uint4 encodedLength, childNum, sizeEdits = 0, editNum; unsigned char *editData, *startEditData; struct child child; struct sequenceData *sequenceData; sequenceData = memBlocks_newEntry(writedb_sequenceData); // Write the description to file if (description != NULL) if (fwrite(description, sizeof(unsigned char), descriptionLength, writedb_descriptionsFile) < descriptionLength) { fprintf(stderr, "Error writing header to sequence file %s\n", writedb_sequenceFilename); exit(-1); } // Calculate length of encoded sequence if (writedb_alphabetType == encoding_nucleotide) { encodedLength = encoding_bytePackSequence(sequence, sequenceLength); } else { encodedLength = sequenceLength + 2; } // Calculate maximum space required to record sequence's edits childNum = 0; while (childNum < numChildren) { child = children[childNum]; sizeEdits += 16 + 5 * child.numEdits; childNum++; } // Initialize array to record edits editData = startEditData = global_malloc(sizeEdits); // Record children edits as vbytes childNum = 0; while (childNum < numChildren) { child = children[childNum]; // Write children descriptions to disk if (fwrite(child.description, sizeof(unsigned char), child.descriptionLength, writedb_descriptionsFile) < child.descriptionLength) { fprintf(stderr, "Error writing description to sequence file %s\n", writedb_descriptionsFilename); exit(-1); } descriptionLength += child.descriptionLength; // Convert child details to vbytes vbyte_safePutVbyte(editData, child.descriptionLength); vbyte_safePutVbyte(editData, child.regionStart); vbyte_safePutVbyte(editData, child.length); vbyte_safePutVbyte(editData, child.numEdits); // Append edits editNum = 0; while (editNum < child.numEdits) { // Record edit character *editData = child.edits[editNum].code; editData++; editNum++; } // Add sequence size to total tally of letters writedb_numberOfLetters += child.length; writedb_sequenceCount++; childNum++; } // Update volume size, encoded length encodedLength += (editData - startEditData); writedb_volumeSize += encodedLength + wildcardsLength; sequenceData->descriptionLength = descriptionLength; sequenceData->sequenceLength = sequenceLength; sequenceData->encodedLength = encodedLength + wildcardsLength; sequenceData->oid = oid; // If the entry will exceed volume max size if (writedb_volumeSize > constants_volumeMaxSize) { // Close current volume fclose(writedb_sequenceFile); // Open next volume for writing writedb_volume++; sprintf(writedb_sequenceFilename, "%s.sequences%d", writedb_filename, writedb_volume); if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_sequenceFilename); exit(-1); } // Reset volume size counter writedb_volumeSize = encodedLength + wildcardsLength; } // Nulceotide if (writedb_alphabetType == encoding_nucleotide) { // Write packed nucleotide sequences to disk if (fwrite(sequence, sizeof(unsigned char), encodedLength, writedb_sequenceFile) < encodedLength) { fprintf(stderr, "Error writing to sequence file %s\n", writedb_sequenceFilename); exit(-1); } } // Protein else { // Write sentinal byte after protein sequences fputc(encoding_sentinalCode, writedb_sequenceFile); // Write sequence codes to disk if (fwrite(sequence, sizeof(unsigned char), sequenceLength, writedb_sequenceFile) < sequenceLength) { fprintf(stderr, "Error writing to sequence file %s\n", writedb_sequenceFilename); exit(-1); } // Write sentinal byte after protein sequences fputc(encoding_sentinalCode, writedb_sequenceFile); } // Write wildcard data to disk if (fwrite(wildcards, sizeof(unsigned char), wildcardsLength, writedb_sequenceFile) < wildcardsLength) { fprintf(stderr, "Error writing to sequence file %s\n", writedb_sequenceFilename); exit(-1); } // Write edit information to disk if (fwrite(startEditData, sizeof(unsigned char), (editData - startEditData), writedb_sequenceFile) < (editData - startEditData)) { fprintf(stderr, "Error writing to sequence file %s\n", writedb_sequenceFilename); exit(-1); } free(startEditData); if (numChildren == 0) { // Add sequence size to total tally of letters writedb_numberOfLetters += sequenceLength; writedb_sequenceCount++; } writedb_numberOfClusters++; // Check for new longest/shortest sequence if (sequenceLength > writedb_maximumSequenceLength) writedb_maximumSequenceLength = sequenceLength; if (writedb_minimumSequenceLength == 0 || sequenceLength < writedb_minimumSequenceLength) writedb_minimumSequenceLength = sequenceLength; }