// Initialize region copying/unpacking void unpack_initialize() { unpack_unpackRegions = memBlocks_initialize( sizeof(struct unpackRegion), constants_initialAllocUnpackRegions); unpack_subjectRegions = memBlocks_initialize( sizeof(struct unpackRegion), constants_initialAllocUnpackRegions); }
// Initialize the creation of ungapped extensions void ungappedExtension_initialize() { ungappedExtension_extensions = memBlocks_initialize(sizeof(struct ungappedExtension), constants_initialAllocUngappedExtensions); ungappedExtension_minus3reward = parameters_matchScore * -3; ungappedExtension_tableMatchesReward = parameters_matchScore * parameters_wordTableLetters; }
// Initialize writing to formatted database void writedb_initialize(char *filename, uint4 alphabetType) { char *wildcardsFilename; writedb_filename = filename; writedb_alphabetType = alphabetType; writedb_maximumSequenceLength = 0; writedb_minimumSequenceLength = 0; writedb_numberOfLetters = 0; writedb_volume = 0; writedb_sequenceCount = 0; writedb_numberOfClusters = 0; // Construct sequence and description filenames writedb_sequenceFilename = (char *)global_malloc(strlen(filename) + 13); sprintf(writedb_sequenceFilename, "%s.sequences", filename); writedb_descriptionsFilename = (char *)global_malloc(strlen(filename) + 15); sprintf(writedb_descriptionsFilename, "%s.descriptions", filename); writedb_dataFilename = (char *)global_malloc(strlen(filename) + 8); sprintf(writedb_dataFilename, "%s.data", filename); wildcardsFilename = (char *)global_malloc(strlen(filename) + 12); sprintf(wildcardsFilename, "%s.wildcards", filename); // Delete the wildcards file if one exists rename(wildcardsFilename, writedb_sequenceFilename); // Open sequence file for writing if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_sequenceFilename); exit(-1); } // Write sentinal/padding byte at start if (alphabetType == encoding_protein) fputc(encoding_sentinalCode, writedb_sequenceFile); else fputc(0, writedb_sequenceFile); // Open descriptions file for writing if ((writedb_descriptionsFile = fopen(writedb_descriptionsFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_descriptionsFilename); exit(-1); } writedb_volumeSize = 1; writedb_sequenceData = memBlocks_initialize(sizeof(struct sequenceData), constants_initialSequenceData); }
// Given a query sequence uses an inverted index of the collection to identify the // sequence number and offset of all hits between the query and the collection void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix, uint4 numSequences) { uint4 queryPosition, codeword = 0, queryPosition4; unsigned char* offsets, *endOffsets; uint4 offsetGap, offset, sequenceGap, sequenceNumber; struct indexCoordinate* coordinate; struct memBlocks* unsortedCoordinates; uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets; uint4 time, wordPosition, containsWildcard; struct queryWord* queryWords; // Read word and interval size from start of index vbyte_getVbyte(startIndex, &index_wordSize); vbyte_getVbyte(startIndex, &index_intervalSize); index_numWords = pow(4, index_wordSize); index_sequencePositions = (uint4*)startIndex; index_descriptionLocations = index_sequencePositions + numSequences; index_loadedWords = index_descriptionLocations + numSequences; index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1); time = clock(); unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences); // Declare and initialize array for count number of hits for each sequence numSubjectHits = (uint*)global_malloc(sizeof(uint4) * numSequences); sequenceNumber = 0; while (sequenceNumber < numSequences) { numSubjectHits[sequenceNumber] = 0; sequenceNumber++; } // Memory to hold offsets string for each query word numQueryPositions = PSSMatrix.length - index_wordSize + 1; queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions); // For each word in the query queryPosition = 0; while (queryPosition < numQueryPositions) { // Check if the word contains a wildcard containsWildcard = 0; wordPosition = 0; while (wordPosition < index_wordSize) { if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters) containsWildcard = 1; wordPosition++; } // Don't include words that cross the strand boundry or contain wildcards if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength && queryPosition >= PSSMatrix.strandLength - index_wordSize + 1)) { // printf("--Query position=%d\n", queryPosition); // Get the codeword codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize); // Get wordlist for that codeword offsets = index_offsets + index_loadedWords[codeword]; endOffsets = index_offsets + index_loadedWords[codeword + 1]; queryWords[queryPosition].offsets = offsets; queryWords[queryPosition].endOffsets = endOffsets; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; // printf("codeword=%d start=%d end=%d numHits=%d\n", codeword, index_loadedWords[codeword], // index_loadedWords[codeword + 1], endOffsets - offsets); } else { queryWords[queryPosition].offsets = NULL; queryWords[queryPosition].endOffsets = NULL; queryWords[queryPosition].queryPosition = queryPosition; queryWords[queryPosition].codeword = codeword; } // printf("\n"); queryPosition++; } // Sort the query words by codeword qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword); // For each query word queryWordCount = 0; while (queryWordCount < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryWordCount].offsets != NULL) { // Make in-memory copy of list of offsets numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets; offsets = (char*)global_malloc(sizeof(char) * numOffsets); memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets); queryWords[queryWordCount].offsets = offsets; queryWords[queryWordCount].endOffsets = offsets + numOffsets; } queryWordCount++; } // Sort the query words by query position qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition); queryPosition = 0; while (queryPosition < numQueryPositions) { // Ignoring those that cross the strand boundry if (queryWords[queryPosition].offsets != NULL) { offsets = queryWords[queryPosition].offsets; endOffsets = queryWords[queryPosition].endOffsets; offset = 0; sequenceNumber = 0; queryPosition4 = queryPosition + (index_wordSize - 4); // Traverse the offsets while (offsets < endOffsets) { vbyte_getVbyte(offsets, (&sequenceGap)); vbyte_getVbyte(offsets, (&offsetGap)); // printf("[%d,%d]\n", sequenceGap, offsetGap); if (sequenceGap > 0) { offset = offsetGap; sequenceNumber += sequenceGap; } else { offset += offsetGap; } // printf(" %u", offset); // Add query/database coordinate of match to relevant bucket // printf("Sequence number=%d\n", sequenceNumber); coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates); coordinate->queryOffset = queryPosition4; coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4); coordinate->subjectNumber = sequenceNumber; numSubjectHits[sequenceNumber]++; // printf("[%d,%d]\n", queryPosition, offset); blast_numHits++; } free(queryWords[queryPosition].offsets); } queryPosition++; } printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); time = clock(); // Make memory for sorted list index_numCoordinates = unsortedCoordinates->numTotalEntries; index_coordinates = (struct indexCoordinate*)global_malloc( sizeof(struct indexCoordinate) * index_numCoordinates); index_sequenceCoordinates = (struct indexCoordinate**)global_malloc( sizeof(struct indexCoordinate*) * numSequences); // For each sequence coordinate = index_coordinates; sequenceNumber = 0; while (sequenceNumber < numSequences) { // If it has hits if (numSubjectHits[sequenceNumber] != 0) { // Point to location in sorted list of coordinates index_sequenceCoordinates[sequenceNumber] = coordinate; coordinate += numSubjectHits[sequenceNumber]; numSubjectHits[sequenceNumber] = 0; } sequenceNumber++; } // Move through list of unsorted coordinates memBlocks_resetCurrent(unsortedCoordinates); while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL) { sequenceNumber = coordinate->subjectNumber; // printf("%d,%d=[%d]\n", index_sequenceCoordinates[sequenceNumber], numSubjectHits[sequenceNumber], sequenceNumber); // Place into sorted list index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate; numSubjectHits[sequenceNumber]++; } memBlocks_free(unsortedCoordinates); /* // Print sorted coordinates coordinate = index_coordinates; while (coordinate < index_coordinates + index_numCoordinates) { printf("[%d]", coordinate); printf("Subject %d Offset %d,%d\n", coordinate->subjectNumber, coordinate->queryOffset, coordinate->subjectOffset); coordinate++; }*/ printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC); }