// Free memory used to store unpacked regions void unpack_free() { struct unpackRegion *region; // For each unpack region if (!parameters_ssearch && encoding_alphabetType == encoding_nucleotide) { memBlocks_resetCurrent(unpack_unpackRegions); while ((region = memBlocks_getCurrent(unpack_unpackRegions)) != NULL) { // Free the unpacked sequence free(region->unpackedSubject + region->startOffset); } } // For each copied subject region memBlocks_resetCurrent(unpack_subjectRegions); while ((region = memBlocks_getCurrent(unpack_subjectRegions)) != NULL) { // Free the subject free(region->subject + region->startOffset / 4); } memBlocks_free(unpack_unpackRegions); memBlocks_free(unpack_subjectRegions); }
// Given a query sequence uses an inverted index of the collection to identify the // sequence number and offset of all hits between the query and the collection void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix, uint4 numSequences) { uint4 queryPosition, codeword = 0, queryPosition4; unsigned char* offsets, *endOffsets; uint4 offsetGap, offset, sequenceGap, sequenceNumber; struct indexCoordinate* coordinate; struct memBlocks* unsortedCoordinates; uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets; uint4 time, wordPosition, containsWildcard; struct queryWord* queryWords; // Read word and interval size from start of index vbyte_getVbyte(startIndex, &index_wordSize); vbyte_getVbyte(startIndex, &index_intervalSize); index_numWords = pow(4, index_wordSize); index_sequencePositions = (uint4*)startIndex; index_descriptionLocations = index_sequencePositions + numSequences; index_loadedWords = index_descriptionLocations + numSequences; index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1); time = clock(); unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences); // Declare and initialize array for count number of hits for each sequence numSubjectHits = (uint*)global_malloc(sizeof(uint4) * numSequences); sequenceNumber = 0; while (sequenceNumber < numSequences) { numSubjectHits[sequenceNumber] = 0; sequenceNumber++; } // Memory to hold offsets string for each query word numQueryPositions = PSSMatrix.length - index_wordSize + 1; queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions); // For each word in the query queryPosition = 0; while (queryPosition < numQueryPositions) { // Check if the word contains a wildcard containsWildcard = 0; wordPosition = 0; while (wordPosition < index_wordSize) { if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters) containsWildcard = 1; wordPosition++; } // Don't include words that cross the strand boundry or contain wildcards if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength && queryPosition >= PSSMatrix.strandLength - index_wordSize + 1)) { // printf("--Query position=%d\n", queryPosition); // Get the codeword codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize); // Get wordlist for that codeword offsets = index_offsets + index_loadedWords[codeword]; endOffsets = index_offsets + index_loadedWords[codeword + 1]; queryWords[queryPosition].offsets = offsets; queryWords[queryPosition].endOffsets = endOffsets; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; // printf("codeword=%d start=%d end=%d numHits=%d\n", codeword, index_loadedWords[codeword], // index_loadedWords[codeword + 1], endOffsets - offsets); } else { queryWords[queryPosition].offsets = NULL; queryWords[queryPosition].endOffsets = NULL; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; } // printf("\n"); queryPosition++; } // Sort the query words by codeword qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword); // For each query word queryWordCount = 0; while (queryWordCount < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryWordCount].offsets != NULL) { // Make in-memory copy of list of offsets numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets; offsets = (char*)global_malloc(sizeof(char) * numOffsets); memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets); queryWords[queryWordCount].offsets = offsets; queryWords[queryWordCount].endOffsets = offsets + numOffsets; } queryWordCount++; } // Sort the query words by query position qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition); queryPosition = 0; while (queryPosition < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryPosition].offsets != NULL) { offsets = queryWords[queryPosition].offsets; endOffsets = queryWords[queryPosition].endOffsets; offset = 0; sequenceNumber = 0; queryPosition4 = queryPosition + (index_wordSize - 4); // Traverse the offsets while (offsets < endOffsets) { vbyte_getVbyte(offsets, (&sequenceGap)); vbyte_getVbyte(offsets, (&offsetGap)); // printf("[%d,%d]\n", sequenceGap, offsetGap); if (sequenceGap > 0) { offset = offsetGap; sequenceNumber += sequenceGap; } else { offset += offsetGap; } // printf(" %u", offset); // Add query/database coordinate of match to relevant bucket // printf("Sequence number=%d\n", sequenceNumber); coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates); coordinate->queryOffset = queryPosition4; coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4); coordinate->subjectNumber = sequenceNumber; numSubjectHits[sequenceNumber]++; // printf("[%d,%d]\n", queryPosition, offset); blast_numHits++; } free(queryWords[queryPosition].offsets); } queryPosition++; } printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); time = clock(); // Make memory for sorted list index_numCoordinates = unsortedCoordinates->numTotalEntries; index_coordinates = (struct indexCoordinate*)global_malloc( sizeof(struct indexCoordinate) * index_numCoordinates); index_sequenceCoordinates = (struct indexCoordinate**)global_malloc( sizeof(struct indexCoordinate*) * numSequences); // For each sequence coordinate = index_coordinates; sequenceNumber = 0; while (sequenceNumber < numSequences) { // If it has hits if (numSubjectHits[sequenceNumber] != 0) { // Point to location in sorted list of coordinates index_sequenceCoordinates[sequenceNumber] = coordinate; coordinate += numSubjectHits[sequenceNumber]; numSubjectHits[sequenceNumber] = 0; } sequenceNumber++; } // Move through list of unsorted coordinates memBlocks_resetCurrent(unsortedCoordinates); while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL) { sequenceNumber = coordinate->subjectNumber; // printf("%d,%d=[%d]\n", index_sequenceCoordinates[sequenceNumber], numSubjectHits[sequenceNumber], sequenceNumber); // Place into sorted list index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate; numSubjectHits[sequenceNumber]++; } memBlocks_free(unsortedCoordinates); /* // Print sorted coordinates coordinate = index_coordinates; while (coordinate < index_coordinates + index_numCoordinates) { printf("[%d]", coordinate); printf("Subject %d Offset %d,%d\n", coordinate->subjectNumber, coordinate->queryOffset, coordinate->subjectOffset); coordinate++; }*/ printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); }
// Finalize writing to the formatted collection void writedb_close() { unsigned char headerData[100], *headerDataPointer; uint4 headerLength; struct sequenceData *sequenceData; // Write sentinal/padding byte at end if (writedb_alphabetType == encoding_protein) fputc(encoding_sentinalCode, writedb_sequenceFile); else fputc(0, writedb_sequenceFile); // Close writing to sequence and description files fclose(writedb_sequenceFile); fclose(writedb_descriptionsFile); // Open data file for writing if ((writedb_dataFile = fopen(writedb_dataFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_dataFilename); exit(-1); } // Convert 6 header values to vbytes headerDataPointer = headerData; vbyte_safePutVbyte(headerDataPointer, constants_databaseVersion); vbyte_safePutVbyte(headerDataPointer, writedb_sequenceCount); vbyte_safePutVbyte(headerDataPointer, writedb_numberOfLetters); vbyte_safePutVbyte(headerDataPointer, writedb_maximumSequenceLength); vbyte_safePutVbyte(headerDataPointer, writedb_alphabetType); vbyte_safePutVbyte(headerDataPointer, writedb_numberOfClusters); vbyte_safePutVbyte(headerDataPointer, writedb_volume + 1); // Write the header data at the start of the file headerLength = headerDataPointer - headerData; if (fwrite(&headerData, sizeof(unsigned char), headerLength, writedb_dataFile) < headerLength) { fprintf(stderr, "Error writing header to sequence file %s\n", writedb_dataFilename); exit(-1); } // For each sequence memBlocks_resetCurrent(writedb_sequenceData); while ((sequenceData = memBlocks_getCurrent(writedb_sequenceData)) != NULL) { // Prepare to write sequence description length, subject length, and encoded // length using vbytes headerDataPointer = headerData; vbyte_safePutVbyte(headerDataPointer, sequenceData->descriptionLength); vbyte_safePutVbyte(headerDataPointer, sequenceData->sequenceLength); vbyte_safePutVbyte(headerDataPointer, sequenceData->encodedLength); vbyte_safePutVbyte(headerDataPointer, sequenceData->oid); // Write sequence header information headerLength = headerDataPointer - headerData; if (fwrite(headerData, sizeof(unsigned char), headerLength, writedb_dataFile) < headerLength) { fprintf(stderr, "Error writing to sequence file %s\n", writedb_dataFilename); exit(-1); } } // Close writing to sequence file fclose(writedb_dataFile); memBlocks_free(writedb_sequenceData); }