// Print the contents of the index void index_print() { uint4 codeword = 0; struct wordList* wordList; unsigned char* offsets, *endOffsets; uint4 offsetGap, offset, numOffsets; uint4 totalSize = 0; while (codeword < index_numWords) { numOffsets = 0; offset = 0; wordList = index_words + codeword; offsets = wordList->offsets; endOffsets = offsets + wordList->length; totalSize += wordList->length; if (offsets < endOffsets) printf("\nCodeword=%u:", codeword); while (offsets < endOffsets) { vbyte_getVbyte(offsets, (&offsetGap)); offset += offsetGap; printf(" %u", offset); numOffsets++; } // printf("[%d/%d = %f]\n", wordList->length, numOffsets, (float)(wordList->length) / (float)numOffsets); codeword++; } printf("\nTotal table size=%d bytes\n", totalSize); }
// Unpack entire or sections of a subject sequence before gapped alignment void unpack_unpackSubject(struct PSSMatrix PSSMatrix, struct alignment *alignment) { unsigned char *subject, *unpackedSubject, wildcard, *edits, *endEdits; uint4 wildcardPosition; struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion, *unpackRegion; int4 regionStart, regionEnd, numRegions; // No need to unpack a protein subject, or already unpacked nucleotide subject if (parameters_ssearch || encoding_alphabetType == encoding_protein) { // Just create a single region covering the entire sequence firstRegion = memBlocks_newEntry(unpack_unpackRegions); firstRegion->startOffset = 0; firstRegion->endOffset = alignment->subjectLength; firstRegion->subject = alignment->subject; firstRegion->unpackedSubject = alignment->subject; firstRegion->subjectLength = alignment->subjectLength; alignment->unpackRegions = firstRegion; alignment->numUnpackRegions = 1; return; } // Get the subject regions for this alignment numRegions = unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions); lastRegion = memBlocks_getLastEntry(unpack_unpackRegions); lastRegion++; firstRegion = lastRegion - numRegions; // Sort the regions in order of start position qsort(firstRegion, lastRegion - firstRegion, sizeof(struct unpackRegion), unpack_compareUnpackRegions); // Unpack each region currentRegion = firstRegion; while (currentRegion < lastRegion) { regionEnd = currentRegion->endOffset; regionStart = currentRegion->startOffset; #ifdef VERBOSE if (parameters_verboseDloc == alignment->descriptionLocation) { printf("Unpack subject region %d to %d (length=%d)\n", regionStart, regionEnd, alignment->subjectLength); fflush(stdout); } #endif // Get the subject region to be unpacked if (alignment->unpackRegions == NULL) { subject = alignment->subject; } else { unpackRegion = unpack_selectRegion( alignment->unpackRegions, alignment->numUnpackRegions, regionStart); subject = unpackRegion->subject; } // Declare memory for the region unpackedSubject = (unsigned char *)global_malloc(sizeof(char) * (regionEnd - regionStart)); // Unpack the region of interest encoding_byteUnpackRegion(unpackedSubject, subject + (regionStart / 4), regionEnd - regionStart); unpackedSubject -= regionStart; currentRegion->unpackedSubject = unpackedSubject; currentRegion->subject = subject; currentRegion->subjectLength = alignment->subjectLength; blast_totalUnpacked += (regionEnd - regionStart); currentRegion++; } currentRegion = firstRegion; // Get wildcard edits for the sequence edits = alignment->edits; endEdits = alignment->edits + alignment->encodedLength - ((alignment->subjectLength + 3) / 4); // If there are edits if (edits < endEdits) { // Read first wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); // For each region in order of position in the subject while (currentRegion < lastRegion) { // Skip past edits that are before current region while (edits < endEdits && wildcardPosition < currentRegion->startOffset) { // Read wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); } // Process edits that are in the current region while (edits < endEdits && wildcardPosition < currentRegion->endOffset) { // Insert wildcard into sequence currentRegion->unpackedSubject[wildcardPosition] = wildcard; // Read next wildcard wildcard = *edits; edits++; // Read its position vbyte_getVbyte(edits, &wildcardPosition); } // Advance to the next region currentRegion++; } } alignment->unpackRegions = firstRegion; alignment->numUnpackRegions = lastRegion - firstRegion; }
// Given a query sequence uses an inverted index of the collection to identify the // sequence number and offset of all hits between the query and the collection void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix, uint4 numSequences) { uint4 queryPosition, codeword = 0, queryPosition4; unsigned char* offsets, *endOffsets; uint4 offsetGap, offset, sequenceGap, sequenceNumber; struct indexCoordinate* coordinate; struct memBlocks* unsortedCoordinates; uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets; uint4 time, wordPosition, containsWildcard; struct queryWord* queryWords; // Read word and interval size from start of index vbyte_getVbyte(startIndex, &index_wordSize); vbyte_getVbyte(startIndex, &index_intervalSize); index_numWords = pow(4, index_wordSize); index_sequencePositions = (uint4*)startIndex; index_descriptionLocations = index_sequencePositions + numSequences; index_loadedWords = index_descriptionLocations + numSequences; index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1); time = clock(); unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences); // Declare and initialize array for count number of hits for each sequence numSubjectHits = (uint*)global_malloc(sizeof(uint4) * numSequences); sequenceNumber = 0; while (sequenceNumber < numSequences) { numSubjectHits[sequenceNumber] = 0; sequenceNumber++; } // Memory to hold offsets string for each query word numQueryPositions = PSSMatrix.length - index_wordSize + 1; queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions); // For each word in the query queryPosition = 0; while (queryPosition < numQueryPositions) { // Check if the word contains a wildcard containsWildcard = 0; wordPosition = 0; while (wordPosition < index_wordSize) { if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters) containsWildcard = 1; wordPosition++; } // Don't include words that cross the strand boundry or contain wildcards if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength && queryPosition >= PSSMatrix.strandLength - index_wordSize + 1)) { // printf("--Query position=%d\n", queryPosition); // Get the codeword codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize); // Get wordlist for that codeword offsets = index_offsets + index_loadedWords[codeword]; endOffsets = index_offsets + index_loadedWords[codeword + 1]; queryWords[queryPosition].offsets = offsets; queryWords[queryPosition].endOffsets = endOffsets; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; // printf("codeword=%d start=%d end=%d numHits=%d\n", codeword, index_loadedWords[codeword], // index_loadedWords[codeword + 1], endOffsets - offsets); } else { queryWords[queryPosition].offsets = NULL; queryWords[queryPosition].endOffsets = NULL; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; } // printf("\n"); queryPosition++; } // Sort the query words by codeword qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword); // For each query word queryWordCount = 0; while (queryWordCount < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryWordCount].offsets != NULL) { // Make in-memory copy of list of offsets numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets; offsets = (char*)global_malloc(sizeof(char) * numOffsets); memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets); queryWords[queryWordCount].offsets = offsets; queryWords[queryWordCount].endOffsets = offsets + numOffsets; } queryWordCount++; } // Sort the query words by query position qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition); queryPosition = 0; while (queryPosition < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryPosition].offsets != NULL) { offsets = queryWords[queryPosition].offsets; endOffsets = queryWords[queryPosition].endOffsets; offset = 0; sequenceNumber = 0; queryPosition4 = queryPosition + (index_wordSize - 4); // Traverse the offsets while (offsets < endOffsets) { vbyte_getVbyte(offsets, (&sequenceGap)); vbyte_getVbyte(offsets, (&offsetGap)); // printf("[%d,%d]\n", sequenceGap, offsetGap); if (sequenceGap > 0) { offset = offsetGap; sequenceNumber += sequenceGap; } else { offset += offsetGap; } // printf(" %u", offset); // Add query/database coordinate of match to relevant bucket // printf("Sequence number=%d\n", sequenceNumber); coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates); coordinate->queryOffset = queryPosition4; coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4); coordinate->subjectNumber = sequenceNumber; numSubjectHits[sequenceNumber]++; // printf("[%d,%d]\n", queryPosition, offset); blast_numHits++; } free(queryWords[queryPosition].offsets); } queryPosition++; } printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); time = clock(); // Make memory for sorted list index_numCoordinates = unsortedCoordinates->numTotalEntries; index_coordinates = (struct indexCoordinate*)global_malloc( sizeof(struct indexCoordinate) * index_numCoordinates); index_sequenceCoordinates = (struct indexCoordinate**)global_malloc( sizeof(struct indexCoordinate*) * numSequences); // For each sequence coordinate = index_coordinates; sequenceNumber = 0; while (sequenceNumber < numSequences) { // If it has hits if (numSubjectHits[sequenceNumber] != 0) { // Point to location in sorted list of coordinates index_sequenceCoordinates[sequenceNumber] = coordinate; coordinate += numSubjectHits[sequenceNumber]; numSubjectHits[sequenceNumber] = 0; } sequenceNumber++; } // Move through list of unsorted coordinates memBlocks_resetCurrent(unsortedCoordinates); while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL) { sequenceNumber = coordinate->subjectNumber; // printf("%d,%d=[%d]\n", index_sequenceCoordinates[sequenceNumber], numSubjectHits[sequenceNumber], sequenceNumber); // Place into sorted list index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate; numSubjectHits[sequenceNumber]++; } memBlocks_free(unsortedCoordinates); /* // Print sorted coordinates coordinate = index_coordinates; while (coordinate < index_coordinates + index_numCoordinates) { printf("[%d]", coordinate); printf("Subject %d Offset %d,%d\n", coordinate->subjectNumber, coordinate->queryOffset, coordinate->subjectOffset); coordinate++; }*/ printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); }