// Create a new memory block struct memBlocks *memBlocks_initialize(size_t entrySize, int4 blockSizes) { struct memBlocks *memBlocks; // Declare memory for first block memBlocks = (struct memBlocks *)global_malloc(sizeof(struct memBlocks)); memBlocks->blockSizes = blockSizes; memBlocks->entrySize = entrySize; memBlocks->numBlocks = 0; memBlocks->numTotalEntries = 0; memBlocks->maxNumBlocks = 10; // Declare memory for the first block memBlocks->lastBlock = (void *)global_malloc(memBlocks->entrySize * memBlocks->blockSizes); // Declare memory for pointers to blocks and add the first block memBlocks->blocks = (void **)global_malloc(sizeof(void *) * memBlocks->maxNumBlocks); memBlocks->numEntries = (int4 *)global_malloc(sizeof(int4) * memBlocks->maxNumBlocks); memBlocks->blocks[memBlocks->numBlocks] = memBlocks->lastBlock; memBlocks->numEntries[memBlocks->numBlocks] = 0; memBlocks->numBlocks++; return memBlocks; }
// Load a BLOSUM or PAM scoring matrix void parameters_findScoringMatrix() { FILE* matrixFile, *ncbircFile; char* homeDirectory, *ncbircFilename; // Get home user's directory homeDirectory = getenv("HOME"); // Construct name of .ncbirc file ncbircFilename = (char*)global_malloc(sizeof(char) * (strlen(homeDirectory) + 9)); sprintf(ncbircFilename, "%s/.ncbirc", homeDirectory); // Check for existence of NCBI file if ((ncbircFile = fopen(ncbircFilename, "r")) != NULL) { // Determine the location of the scoring matrix file by consulting the .ncbirc file parameters_scoringMatrixPath = (char*)global_malloc(sizeof(char) * 1024); if (!(fscanf(ncbircFile, "[NCBI]\nData=%s", parameters_scoringMatrixPath))) { fprintf(stderr, "Error reading scoring matrix path from %s file\n", ncbircFilename); fprintf(stderr, "BLAST requires the file .ncbirc in the user's home directory\n"); fprintf(stderr, "containing the text:\n\n"); fprintf(stderr, "[NCBI]\n"); fprintf(stderr, "Data=/home/user/fsa-blast/data\n\n"); fprintf(stderr, "Where the path specified contains scoring matrix files (ie. BLOSUM62)\n"); exit(-1); } fclose(ncbircFile); } else { // If not available guess location of scoring matrix files parameters_scoringMatrixPath = (char*)global_malloc(sizeof(char) * 1024); strcpy(parameters_scoringMatrixPath, "data"); } // Append scoring matrix filename then check for existance sprintf(parameters_scoringMatrixPath, "%s/%s", parameters_scoringMatrixPath, parameters_scoringMatrix); matrixFile = fopen(parameters_scoringMatrixPath, "r"); if (matrixFile == NULL) { fprintf(stderr, "%s\n", strerror(errno)); fprintf(stderr, "Error reading matrix file %s\n", parameters_scoringMatrixPath); fprintf(stderr, "BLAST requires the file .ncbirc in the user's home directory\n"); fprintf(stderr, "containing the text:\n\n"); fprintf(stderr, "[NCBI]\n"); fprintf(stderr, "Data=/home/user/fsa-blast/data\n\n"); fprintf(stderr, "Where the path specified contains scoring matrix files (ie. 
BLOSUM62)\n"); exit(-1); } fclose(matrixFile); free(ncbircFilename); }
// Initialize writing to formatted database void writedb_initialize(char *filename, uint4 alphabetType) { char *wildcardsFilename; writedb_filename = filename; writedb_alphabetType = alphabetType; writedb_maximumSequenceLength = 0; writedb_minimumSequenceLength = 0; writedb_numberOfLetters = 0; writedb_volume = 0; writedb_sequenceCount = 0; writedb_numberOfClusters = 0; // Construct sequence and description filenames writedb_sequenceFilename = (char *)global_malloc(strlen(filename) + 13); sprintf(writedb_sequenceFilename, "%s.sequences", filename); writedb_descriptionsFilename = (char *)global_malloc(strlen(filename) + 15); sprintf(writedb_descriptionsFilename, "%s.descriptions", filename); writedb_dataFilename = (char *)global_malloc(strlen(filename) + 8); sprintf(writedb_dataFilename, "%s.data", filename); wildcardsFilename = (char *)global_malloc(strlen(filename) + 12); sprintf(wildcardsFilename, "%s.wildcards", filename); // Delete the wildcards file if one exists rename(wildcardsFilename, writedb_sequenceFilename); // Open sequence file for writing if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_sequenceFilename); exit(-1); } // Write sentinal/padding byte at start if (alphabetType == encoding_protein) fputc(encoding_sentinalCode, writedb_sequenceFile); else fputc(0, writedb_sequenceFile); // Open descriptions file for writing if ((writedb_descriptionsFile = fopen(writedb_descriptionsFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", writedb_descriptionsFilename); exit(-1); } writedb_volumeSize = 1; writedb_sequenceData = memBlocks_initialize(sizeof(struct sequenceData), constants_initialSequenceData); }
// Initialize the hitMatrix by declaring memory for the maximum number // of diagonals required by a subject sequence void hitMatrix_initialize(int4 queryLength, int4 maximumSubjectLength, unsigned char* startAddress) { int4 numDiagonals; unsigned char **minOffset, **offset, **maxOffset; // Use more memory efficient but slower hit matrix for nucleotide if (encoding_alphabetType == encoding_nucleotide) { // Calculate number of diagonals that will be required during search numDiagonals = 1; while (numDiagonals < queryLength + parameters_wordSize) { numDiagonals <<= 1; } // Construct mask hitMatrix_diagonalMask = numDiagonals - 1; // Declare memory for diagonal slots hitMatrix_furthest = (unsigned char**)global_malloc(sizeof(unsigned char*) * numDiagonals); minOffset = hitMatrix_furthest; } // Use less memory efficient but faster hit matrix for protein else { // Maximum number of diagonals that will be required during search numDiagonals = queryLength + maximumSubjectLength - parameters_wordSize + 1; minOffset = (unsigned char**)global_malloc(sizeof(unsigned char*) * numDiagonals); // Advance array pointer to allow offset values ranging from // -queryLength to subjectLength - wordSize hitMatrix_furthest = minOffset + queryLength; } // Record query length hitMatrix_queryLength = queryLength; // Start from smallest possible offset value and iterate through to largest offset = minOffset; maxOffset = minOffset + numDiagonals; // For each diagonal, reset furthest to address at start of file while (offset < maxOffset) { *offset = startAddress; offset++; } }
void read_dbIdxBlock(FILE *read_dbIdxBlockFile1) { int4 totalNumEntries = (proteinLookup_numWords + 1) * proteinLookup_numBlocks; proteinLookup_db_b[0].subPositionOffset = (uint4 *)global_malloc(sizeof(uint4) * totalNumEntries); totalIndexSize += sizeof(uint4) * totalNumEntries; if (fread(proteinLookup_db_b[0].subPositionOffset, sizeof(uint4), totalNumEntries, read_dbIdxBlockFile1) != totalNumEntries) { fprintf(stderr, "Error reading LoopkupHeader to dbIdxBlock file\n"); exit(-1); } int ii; uint8 totalHits = 0; for(ii = 0; ii < proteinLookup_numBlocks; ii++) { proteinLookup_db_b[ii].subPositionOffset = proteinLookup_db_b[0].subPositionOffset + (proteinLookup_numWords + 1) * ii; totalHits += proteinLookup_db_b[ii].subPositionOffset[proteinLookup_numWords]; } //int totalHits = proteinLookup_db_b[blockNum].subPositionOffset[proteinLookup_numWords]; ASSERT(totalNumPositions == totalHits); proteinLookup_db_b[0].subSequencePositions = (subPos_t *)global_malloc(sizeof(subPos_t) * totalHits); totalIndexSize += sizeof(subPos_t) * totalHits; if (fread(proteinLookup_db_b[0].subSequencePositions, sizeof(subPos_t), totalHits, read_dbIdxBlockFile1) != totalHits) { fprintf(stderr, "Error reading numSubPositions to dbIdxBlock file\n"); exit(-1); } for(ii = 1; ii < proteinLookup_numBlocks; ii++) { proteinLookup_db_b[ii].subSequencePositions = proteinLookup_db_b[ii - 1].subSequencePositions + proteinLookup_db_b[ii - 1].subPositionOffset[proteinLookup_numWords]; } }
int main(int argc, char *argv[]) { char *filename, *sequence, *description, *sequenceCopy; int sequenceLength; // User must provide FASTA format file at command line if (argc < 2) { fprintf(stderr, "Useage: dust <FASTA file>\n"); exit(-1); } filename = argv[1]; // Initialize encoding routines encoding_initialize(encoding_nucleotide); // Open FASTA file for reading readFasta_open(filename); // Read each sequence from the file while (readFasta_readSequence()) { // Get sequence just read sequence = readFasta_sequenceBuffer; description = readFasta_descriptionBuffer; sequenceLength = readFasta_sequenceLength; // Make in-memory copy of it sequenceCopy = (char *)global_malloc(sequenceLength); strcpy(sequenceCopy, sequence); // Perform dust filtering dust_dustSequence(sequence); // Print description and filtering sequence printf(">%s\n%s\n", description, sequence); } }
// Allocate and clear the structures used while building the protein database
// index.
//
// numCodes   - alphabet size; stored in wordLookupDFA_numCodes.
// wordLength - word length; stored in wordLookupDFA_wordLength.
//
// The number of words per block is numCodes^wordLength and the number of
// blocks is readdb_numVolumeLetters / dbIdx_block_size rounded up, plus 20.
// One block descriptor is created per block (each with its own offset array
// of numWords + 1 entries) together with one empty word entry for every
// (block, word) pair.
void proteinLookup_db_initial(int4 numCodes, int wordLength) {
  struct initialWord_protein_db *wordTable;
  uint4 numEntries, numBlocks, totalWords, idx;

  // Reset index wide statistics and remember the lookup geometry
  totalNumPositions = 0;
  maxNumSeqBlk = 0;
  wordLookupDFA_numCodes = numCodes;
  wordLookupDFA_wordLength = wordLength;

  numEntries = ceil(pow(numCodes, wordLength));
  numBlocks = ceil((float)readdb_numVolumeLetters / dbIdx_block_size) + 20;
  proteinLookup_numBlocks = numBlocks;

  // Declare memory for initial DB index blocks
  proteinLookup_db_b = (struct proteinLookup_db_blk *)global_malloc(
      sizeof(struct proteinLookup_db_blk) * numBlocks);

  // Declare memory for initial lookup table
  wordTable = (struct initialWord_protein_db *)global_malloc(
      sizeof(struct initialWord_protein_db) * numEntries * numBlocks);
  proteinLookup_db = wordTable;
  proteinLookup_numWords = numEntries;

  // Every block starts empty, with its own offset array
  for (idx = 0; idx < numBlocks; idx++) {
    struct proteinLookup_db_blk *block = proteinLookup_db_b + idx;

    block->subPositionOffset =
        (uint4 *)global_malloc(sizeof(uint4) * (numEntries + 1));
    block->dbIdxblk_longestSeq = 0;
    block->numSeqBlk = 0;
    block->subSequencePositions = NULL;
  }

  // Every word of every block starts with an empty position list
  totalWords = numEntries * numBlocks;
  for (idx = 0; idx < totalWords; idx++) {
    struct initialWord_protein_db *word = wordTable + idx;

    word->numSubPositions = 0;
    word->allocSubPositions = 0;
    word->subSequencePositions = NULL;
  }
}
// Initialize for the construction of a new query position list void qPosList_initialize(int4 maxNumLists) { struct memSingleBlock* list; qPosList_qPosLists = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock) * maxNumLists); qPosList_numQPosLists = 0; qPosList_maxQPosLists = 0; // Initialize lists of query positions while (qPosList_maxQPosLists < maxNumLists) { list = qPosList_qPosLists + qPosList_maxQPosLists; memSingleBlock_initializeExisting(list, sizeof(struct queryPosition), 10); qPosList_maxQPosLists++; } qPosList_initialQPosLists = (struct initialQPosList*)global_malloc(sizeof(struct initialQPosList) * maxNumLists); qPosList_numInitialQPosLists = 0; }
void read_dbLookupAux(char *read_dbLookupFilename) { char *read_dbLookupAuxFilename; FILE *read_dbLookupAuxFile; read_dbLookupAuxFilename = (char *)global_malloc(strlen(read_dbLookupFilename) + 40); sprintf(read_dbLookupAuxFilename, "%s.sequence%d.dbLookupAux", read_dbLookupFilename, readdb_volume); // Open dbLookup file for writing if ((read_dbLookupAuxFile = fopen(read_dbLookupAuxFilename, "r")) == NULL) { fprintf(stderr, "Error opening file %s for reading\n", read_dbLookupAuxFilename); exit(-1); } struct dbLookupAux dbLookupAux; if(fread(&dbLookupAux, sizeof(struct dbLookupAux), 1, read_dbLookupAuxFile) != 1) { fprintf(stderr, "Error reading read_dbLookupAuxFile\n"); exit(0); } proteinLookup_numBlocks = dbLookupAux.proteinLookup_numBlocks; proteinLookup_numWords = dbLookupAux.proteinLookup_numWords; wordLookupDFA_wordLength = dbLookupAux.proteinLookup_wordLength; wordLookupDFA_numCodes = dbLookupAux.proteinLookup_numCodes; dbIdx_block_size = dbLookupAux.dbIdx_block_size; totalNumPositions = dbLookupAux.totalNumPositions; maxNumSeqBlk = dbLookupAux.maxNumSeqBlk; proteinLookup_db_b = (struct proteinLookup_db_blk *)malloc( sizeof(struct proteinLookup_db_blk) * proteinLookup_numBlocks); totalIndexSize += sizeof(struct proteinLookup_db_blk) * proteinLookup_numBlocks; if(fread(proteinLookup_db_b, sizeof(struct proteinLookup_db_blk), proteinLookup_numBlocks, read_dbLookupAuxFile) != proteinLookup_numBlocks) { fprintf(stderr, "Error reading read_dbLookupAuxFile\n"); exit(0); } //fprintf(stderr, "dbIdx: volumn: %d numBlocks: %d\n", readdb_volume, proteinLookup_numBlocks); proteinLookup_db = (struct initialWord_protein_db *)malloc( sizeof(struct initialWord_protein_db) * proteinLookup_numWords * proteinLookup_numBlocks); free(read_dbLookupAuxFilename); fclose(read_dbLookupAuxFile); }
void write_dbLookupAux(char *write_dbLookupFilename) { char *write_dbLookupAuxFilename; FILE *write_dbLookupAuxFile; write_dbLookupAuxFilename = (char *)global_malloc(strlen(write_dbLookupFilename) + 40); sprintf(write_dbLookupAuxFilename, "%s.sequence%d.dbLookupAux", write_dbLookupFilename, readdb_volume); // Open dbLookup file for writing if ((write_dbLookupAuxFile = fopen(write_dbLookupAuxFilename, "w")) == NULL) { fprintf(stderr, "Error opening file %s for writing\n", write_dbLookupAuxFilename); exit(-1); } struct dbLookupAux dbLookupAux; dbLookupAux.proteinLookup_numBlocks = proteinLookup_numBlocks; dbLookupAux.proteinLookup_numWords = proteinLookup_numWords; dbLookupAux.proteinLookup_wordLength = wordLookupDFA_wordLength; dbLookupAux.proteinLookup_numCodes = wordLookupDFA_numCodes; dbLookupAux.dbIdx_block_size = dbIdx_block_size; dbLookupAux.totalNumPositions = totalNumPositions; dbLookupAux.maxNumSeqBlk = maxNumSeqBlk; if (fwrite(&dbLookupAux, sizeof(struct dbLookupAux), 1, write_dbLookupAuxFile) != 1) { fprintf(stderr, "Error writing data to dbLookup aux file %s\n", write_dbLookupAuxFilename); exit(-1); } if (fwrite(proteinLookup_db_b, sizeof(struct proteinLookup_db_blk), proteinLookup_numBlocks, write_dbLookupAuxFile) != proteinLookup_numBlocks) { fprintf(stderr, "Error writing data to dbLookup aux file %s\n", write_dbLookupAuxFilename); exit(-1); } #if 1 fprintf(stderr, "dbIdxBlockSize:%d(KB) " "maxNumSeqPerBlk:%d longestSeqLength:%d maxBinSize: %d\n", dbIdx_block_size / 1024, maxNumSeqBlk, readdb_longestSequenceLength, maxBinSize); #endif free(write_dbLookupAuxFilename); fclose(write_dbLookupAuxFile); }
/// Allocate global blocks from the global heap. static hpx_addr_t _pgas_gas_alloc_local(size_t n, uint32_t bsize, uint32_t boundary, uint32_t attr) { size_t bytes = n * bsize; void *lva = NULL; if (boundary) { lva = global_memalign(boundary, bytes); } else { lva = global_malloc(bytes); } return (lva) ? pgas_lva_to_gpa(lva) : HPX_NULL; }
// Allocate the neighbour lookup table with one entry per word
// (proteinLookup_numWords entries) and mark every entry as having no
// neighbours yet.
void neighbourLookup_init() {
  int4 numEntries = proteinLookup_numWords;
  int4 word;

  neighborLookup = (struct initialWord_neighborLookup *)global_malloc(
      sizeof(struct initialWord_neighborLookup) * numEntries);

  for (word = 0; word < numEntries; word++) {
    neighborLookup[word].numNeighbours = 0;
    neighborLookup[word].neighbours = NULL;
  }
}
// Fill the neighbour lookup table for every word that occurs in the query.
//
// PSSMatrix   - query; its queryCodes and length fields are read.
// scoreMatrix - scoring matrix handed to wordLookupSM_getNeighbours.
// wordLength  - length of the words taken from the query.
//
// For each query position the codeword of the word starting there is
// computed. If that codeword has no neighbours recorded, its neighbours are
// generated and their codewords are stored in the table entry.
void neighbourLookup_build(struct PSSMatrix PSSMatrix, struct scoreMatrix scoreMatrix, int4 wordLength) {
  int4 numWords = proteinLookup_numWords;
  int4 queryPosition;
  int4 codeword;
  int4 numNeighbours;
  int4 n;
  struct initialWord_neighborLookup *entry;

  // Scratch list that receives the neighbours generated for one word
  struct neighbour *scratch =
      (struct neighbour *)global_malloc(sizeof(struct neighbour) * numWords);

  for (queryPosition = 0; queryPosition < PSSMatrix.length - wordLength + 1;
       queryPosition++) {
    codeword = getCodeword(PSSMatrix.queryCodes + queryPosition, wordLength);
    entry = &neighborLookup[codeword];

    // Nothing to do for a word whose neighbours are already recorded
    if (entry->numNeighbours != 0) {
      continue;
    }

    numNeighbours = 0;
    wordLookupSM_getNeighbours(PSSMatrix.queryCodes, scoreMatrix, queryPosition,
                               &numNeighbours, scratch);

    // Keep only the codeword of each neighbour
    entry->numNeighbours = numNeighbours;
    entry->neighbours = (int4 *)global_malloc(sizeof(int4) * numNeighbours);
    for (n = 0; n < numNeighbours; n++) {
      entry->neighbours[n] = scratch[n].codeword;
    }
  }

  free(scratch);
}
// Given the results of dynamic programming (a matrix of trace codes and a // highest scoring position in // the matrix) for finding the START of the alignment, performs the simple // operation of finding the path // from the highest scoring point back to the seed struct trace gappedExtension_traceBeforeSeed(struct dpResults beforeDpResults, struct coordinate seed) { int4 queryPosition, subjectPosition; unsigned char **traceback; unsigned char traceCode; unsigned char state = 0; struct trace trace; unsigned char *traceCodes; uint4 traceCount = 0; traceback = beforeDpResults.traceback; trace.queryStart = queryPosition = beforeDpResults.best.queryOffset; trace.subjectStart = subjectPosition = beforeDpResults.best.subjectOffset; // Declare memory for tracecodes; for maximum possible number of codes that // could // be generated by this trace traceCodes = (unsigned char *)global_malloc( sizeof(unsigned char) * (seed.queryOffset - queryPosition + seed.subjectOffset - subjectPosition)); while (queryPosition < seed.queryOffset && subjectPosition < seed.subjectOffset) { // Construct the trace traceCodes[traceCount] = state; traceCount++; // printf("(%p)", traceback[queryPosition]); // printf("(%d,%d)", queryPosition, subjectPosition); fflush(stdout); traceCode = traceback[queryPosition][subjectPosition]; // If we got to current cell through a MATCH if (state == 0) { // Move to cell before this one queryPosition++; subjectPosition++; // We are only interested in lowest 2 bits of tracecode traceCode = traceCode << 6; traceCode = traceCode >> 6; // Tracecode determines if we matched or inserted here state = traceCode; } // If we got to current cell through an Ix else if (state == 1) {
// Return sequence number seqId of the database as a newly allocated,
// NUL-terminated string of letters.
//
// Every code of the stored sequence is translated with encoding_getLetter.
// A resulting 'U' is reported on stderr and replaced by 'X'.
// The caller owns the returned buffer.
char* getSequence(uint4 seqId) {
  char *letters;
  char letter;
  int pos;

  // Room for every letter plus the terminating NUL
  letters = (char *)global_malloc(
      sizeof(char) * (readdb_sequenceData[seqId].sequenceLength + 1));

  for (pos = 0; pos < readdb_sequenceData[seqId].sequenceLength; pos++) {
    letter = encoding_getLetter(readdb_sequenceData[seqId].sequence[pos]);

    if (letter == 'U') {
      fprintf(stderr, "Selenocysteine (U) at position %d replaced by X\n", pos);
      letter = 'X';
    }

    letters[pos] = letter;
  }

  letters[pos] = '\0';
  return letters;
}
// Extend the start of a region if necessary void unpack_extendRegionStart(int4 position, struct unpackRegion *unpackRegion) { unsigned char *newUnpackedSubject; int4 newRegionStart, newRegionEnd; if (position < unpackRegion->startOffset) { // Extend the region start newRegionStart = unpackRegion->startOffset - constants_unpackRegionExtend; if (newRegionStart < 0) newRegionStart = 0; newRegionEnd = unpackRegion->endOffset; // Make start of region a multiple of 4 newRegionStart = (newRegionStart / 4) * 4; // Declare memory for the new region newUnpackedSubject = (unsigned char *)global_malloc( sizeof(char) * (newRegionEnd - newRegionStart)); newUnpackedSubject -= newRegionStart; // Copy unpacked subject from old region to new memcpy(newUnpackedSubject + unpackRegion->startOffset, unpackRegion->unpackedSubject + unpackRegion->startOffset, sizeof(char) * (unpackRegion->endOffset - unpackRegion->startOffset)); // Free old subject unpackRegion->unpackedSubject += unpackRegion->startOffset; free(unpackRegion->unpackedSubject); // Unpack the new part of the region encoding_byteUnpackRegion(newUnpackedSubject + newRegionStart, unpackRegion->subject + (newRegionStart / 4), unpackRegion->startOffset - newRegionStart); unpackRegion->unpackedSubject = newUnpackedSubject; unpackRegion->startOffset = newRegionStart; } }
// Get a run of consecutive entries from the block void *memBlocks_newEntries(struct memBlocks *memBlocks, uint4 numNewEntries) { void *newEntry; // Check if we need to create a new block of memory if (memBlocks->numEntries[memBlocks->numBlocks - 1] + numNewEntries > memBlocks->blockSizes) { // Declare memory for the new block memBlocks->lastBlock = (void *)global_malloc(memBlocks->entrySize * memBlocks->blockSizes); // Check if we need more memory for block pointers if (memBlocks->numBlocks >= memBlocks->maxNumBlocks) { // Allocate more memBlocks->maxNumBlocks *= 2; memBlocks->blocks = (void **)global_realloc( memBlocks->blocks, sizeof(void *) * memBlocks->maxNumBlocks); memBlocks->numEntries = (int4 *)global_realloc( memBlocks->numEntries, sizeof(int4) * memBlocks->maxNumBlocks); } // Store the address of this new block memBlocks->blocks[memBlocks->numBlocks] = memBlocks->lastBlock; // Reset number of entries in this block memBlocks->numEntries[memBlocks->numBlocks] = 0; memBlocks->numBlocks++; } // Use the next available slot in the latest block newEntry = ((char *)(memBlocks->lastBlock)) + memBlocks->numEntries[memBlocks->numBlocks - 1] * memBlocks->entrySize; memBlocks->numEntries[memBlocks->numBlocks - 1] += numNewEntries; memBlocks->numTotalEntries += numNewEntries; return newEntry; }
// Initialize the creation of a new index structure void index_initializeBuild(uint4 fromCodeword, uint4 toCodeword) { uint4 codeword; // index_numWords = pow(4, index_wordSize); index_words = (struct wordList*)global_malloc(sizeof(struct wordList) * (toCodeword - fromCodeword)); index_words -= fromCodeword; // For each word codeword = fromCodeword; while (codeword < toCodeword) { // Initialize list of occurrences index_words[codeword].offsets = NULL; index_words[codeword].length = 0; index_words[codeword].allocated = 0; index_words[codeword].lastOffset = 0; index_words[codeword].lastSequenceNumber = 0; codeword++; } index_subjectNumber = 0; }
// Unpack entire or sections of a subject sequence before gapped alignment.
//
// PSSMatrix - query matrix; only forwarded to unpack_getRegions.
// alignment - alignment whose subject is unpacked. On return its
//             unpackRegions / numUnpackRegions fields describe the regions
//             that are available in unpacked form.
//
// When parameters_ssearch is set or the alphabet is protein, a single region
// covering the whole subject is created that aliases alignment->subject (no
// copy is made). Otherwise the regions reported by unpack_getRegions are
// sorted by start position, each is unpacked into its own buffer, and the
// wildcard edits stored with the alignment are written into the buffers.
void unpack_unpackSubject(struct PSSMatrix PSSMatrix, struct alignment *alignment) {
  unsigned char *subject, *unpackedSubject, wildcard, *edits, *endEdits;
  uint4 wildcardPosition;
  struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion, *unpackRegion;
  int4 regionStart, regionEnd, numRegions;

  // No need to unpack a protein subject, or already unpacked nucleotide subject
  if (parameters_ssearch || encoding_alphabetType == encoding_protein) {
    // Just create a single region covering the entire sequence
    firstRegion = memBlocks_newEntry(unpack_unpackRegions);
    firstRegion->startOffset = 0;
    firstRegion->endOffset = alignment->subjectLength;
    firstRegion->subject = alignment->subject;
    // The "unpacked" view is the subject itself in this case
    firstRegion->unpackedSubject = alignment->subject;
    firstRegion->subjectLength = alignment->subjectLength;
    alignment->unpackRegions = firstRegion;
    alignment->numUnpackRegions = 1;
    return;
  }

  // Get the subject regions for this alignment
  numRegions = unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions);

  // The regions are taken to be the last numRegions entries of
  // unpack_unpackRegions, stored contiguously.
  // NOTE(review): this relies on unpack_getRegions adding them as one
  // contiguous run inside a single block - confirm.
  lastRegion = memBlocks_getLastEntry(unpack_unpackRegions);
  lastRegion++;
  firstRegion = lastRegion - numRegions;

  // Sort the regions in order of start position
  qsort(firstRegion, lastRegion - firstRegion, sizeof(struct unpackRegion),
        unpack_compareUnpackRegions);

  // Unpack each region
  currentRegion = firstRegion;
  while (currentRegion < lastRegion) {
    regionEnd = currentRegion->endOffset;
    regionStart = currentRegion->startOffset;

#ifdef VERBOSE
    if (parameters_verboseDloc == alignment->descriptionLocation) {
      printf("Unpack subject region %d to %d (length=%d)\n", regionStart, regionEnd,
             alignment->subjectLength);
      fflush(stdout);
    }
#endif

    // Get the subject region to be unpacked: the alignment's own packed
    // subject, or the one recorded in a previously created region
    if (alignment->unpackRegions == NULL) {
      subject = alignment->subject;
    } else {
      unpackRegion = unpack_selectRegion(
          alignment->unpackRegions, alignment->numUnpackRegions, regionStart);
      subject = unpackRegion->subject;
    }

    // Declare memory for the region
    unpackedSubject = (unsigned char *)global_malloc(sizeof(char) * (regionEnd - regionStart));

    // Unpack the region of interest. The packed subject is indexed with
    // regionStart / 4 here.
    encoding_byteUnpackRegion(unpackedSubject, subject + (regionStart / 4),
                              regionEnd - regionStart);

    // Bias the pointer so that unpackedSubject[i] refers to subject position i
    unpackedSubject -= regionStart;

    currentRegion->unpackedSubject = unpackedSubject;
    currentRegion->subject = subject;
    currentRegion->subjectLength = alignment->subjectLength;

    // Running total of unpacked bytes
    blast_totalUnpacked += (regionEnd - regionStart);

    currentRegion++;
  }

  currentRegion = firstRegion;

  // Get wildcard edits for the sequence: they occupy the part of the encoded
  // sequence that follows the (subjectLength + 3) / 4 bytes of packed subject
  edits = alignment->edits;
  endEdits = alignment->edits + alignment->encodedLength - ((alignment->subjectLength + 3) / 4);

  // If there are edits
  if (edits < endEdits) {
    // Read first wildcard
    wildcard = *edits;
    edits++;

    // Read its position
    // NOTE(review): the loops below only terminate if vbyte_getVbyte advances
    // edits, i.e. it is presumably a macro - confirm.
    vbyte_getVbyte(edits, &wildcardPosition);

    // For each region in order of position in the subject
    while (currentRegion < lastRegion) {
      // Skip past edits that are before current region
      while (edits < endEdits && wildcardPosition < currentRegion->startOffset) {
        // Read wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Process edits that are in the current region
      // NOTE(review): the wildcard held in wildcard/wildcardPosition is only
      // written while edits < endEdits, so if reading the final edit moves
      // edits to endEdits that last wildcard appears never to be inserted -
      // verify against the edit encoding.
      while (edits < endEdits && wildcardPosition < currentRegion->endOffset) {
        // Insert wildcard into sequence
        currentRegion->unpackedSubject[wildcardPosition] = wildcard;

        // Read next wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Advance to the next region
      currentRegion++;
    }
  }

  // Publish the regions on the alignment
  alignment->unpackRegions = firstRegion;
  alignment->numUnpackRegions = lastRegion - firstRegion;
}
// Perform dynamic programming to explore possible start points and alignments // that end at // the given seed and find the best score struct dpResults semiGappedScoring_dpBeforeSeed(unsigned char *subject, struct PSSMatrix PSSMatrix, struct coordinate seed, int4 dropoff) { int2 **queryPosition, **bestQueryPosition; int2 *matrixColumn; unsigned char *rowDropoff, *columnDropoff; unsigned char *subjectPosition, *bestSubjectPosition, *startSubjectPosition; int4 bestScore = 0; int4 *bestRow, *insertQrow, insertS, rowOffset; int4 subjectDistance; int4 oldBest, match, previousOldBest; unsigned char rightOfDropoff; int4 queryCount, subjectCount; struct dpResults dpResults; // Declare processing rows for storing match, insert-subject and insert-query // values // If current malloced rows aren't big enough if (seed.subjectOffset >= semiGappedScoring_rowSizes) { // Free existing rows free(semiGappedScoring_bestRow); free(semiGappedScoring_insertQrow); // Set size to double current needed length semiGappedScoring_rowSizes = (seed.subjectOffset) * 2; // Malloc new rows semiGappedScoring_bestRow = (int4 *)global_malloc(sizeof(int4) * semiGappedScoring_rowSizes); semiGappedScoring_insertQrow = (int4 *)global_malloc(sizeof(int4) * semiGappedScoring_rowSizes); } bestSubjectPosition = subjectPosition = startSubjectPosition = subject + seed.subjectOffset - 1; bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1; // Initialize row pointers rowOffset = (subjectPosition - subject); // printf("rowOffset=%d Dloc=%d\n", rowOffset, dloc); fflush(stdout); bestRow = semiGappedScoring_bestRow + rowOffset; insertQrow = semiGappedScoring_insertQrow + rowOffset; // Set initial row dropoff and column dropoff rowDropoff = subject; columnDropoff = subject + seed.subjectOffset; // Using first column of query matrix matrixColumn = *queryPosition; // -----FIRST ROW----- // -----FIRST CELL----- // Set M value for bottom-right cell match = matrixColumn[*subjectPosition]; // M 
must be the best *bestRow = match; // Only gap opens possible *insertQrow = insertS = match - parameters_semiGappedOpenGap; // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } subjectDistance = 0; subjectPosition--; bestRow--; insertQrow--; // ----- REMAINING CELLS ----- // For each remaining column in the bottom row, scanning from right-to-left while (subjectPosition >= subject) { // Set value for M match = matrixColumn[*subjectPosition] - parameters_semiGappedOpenGap - subjectDistance * parameters_semiGappedExtendGap; // Determine the best of M and Iy if (match > insertS) { *bestRow = match; // Calculate new Iy insertS = maximum(match - parameters_semiGappedOpenGap, insertS - parameters_semiGappedExtendGap); } else { *bestRow = insertS; // Since M <= Iy, new Iy must derive from Iy insertS -= parameters_semiGappedExtendGap; } // Set DUMMY Ix value, which should never be used *insertQrow = constants_gappedExtensionDummyValue; // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } // If score at current cell is below dropoff if (bestScore > *bestRow + dropoff) { // Record dropoff position rowDropoff = subjectPosition; // And stop processing row break; } subjectPosition--; bestRow--; insertQrow--; subjectDistance++; } // if (dloc == 746829265) // print(semiGappedScoring_bestRow, subject, rowDropoff, columnDropoff); // Start queryCount at N. Only allow insertS for every Nth row when queryCount // reaches 0 queryCount = parameters_semiGappedExtensionN; // -----REMAINING ROWS----- while (queryPosition > PSSMatrix.matrix && rowDropoff < columnDropoff) { queryPosition--; queryCount--; subjectPosition = columnDropoff - 1; // Determine subjectCount for initial subjectPosition. 
Is used to only allow // insertQ when subjectOffset % parameters_semiGappedExtensionN == 0 subjectCount = (int4)(startSubjectPosition - subjectPosition) % parameters_semiGappedExtensionN; if (subjectCount) subjectCount = parameters_semiGappedExtensionN - subjectCount; // Reset row pointers to start of rows rowOffset = (subjectPosition - subject); bestRow = semiGappedScoring_bestRow + rowOffset; insertQrow = semiGappedScoring_insertQrow + rowOffset; // Using next column of query matrix matrixColumn = *queryPosition; // ************ All rows we are not allowing insertS if (queryCount) { // ** No insertQ allowed this column, this cell will only get a DUMMY // score if (subjectCount) { previousOldBest = *bestRow; *bestRow = constants_gappedExtensionDummyValue; // Score at this cell is below dropoff columnDropoff = subjectPosition; rightOfDropoff = 1; } // ** We are allowing insertQ this column else { // -----FAR RIGHT CELL----- // Record some old values previousOldBest = *bestRow; // Set Ix value *bestRow = *insertQrow; *insertQrow -= parameters_semiGappedExtendGap; // If score at current cell is below dropoff if (bestScore > *bestRow + dropoff) { // Record dropoff position columnDropoff = subjectPosition; rightOfDropoff = 1; } else { // We are left of the column dropoff for this row rightOfDropoff = 0; } // Reset subjectCount subjectCount = parameters_semiGappedExtensionN; } subjectPosition--; bestRow--; insertQrow--; subjectCount--; // -----CELLS RIGHT OF ROW DROPOFF----- while (subjectPosition >= rowDropoff) { // ** We are not allowing insertQ this column if (subjectCount) { // Calculate new M value, which is also the best oldBest = *bestRow; match = *bestRow = matrixColumn[*subjectPosition] + previousOldBest; previousOldBest = oldBest; // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } } // We are allowing insertQ this column 
else { // Calculate new M value oldBest = *bestRow; match = matrixColumn[*subjectPosition] + previousOldBest; previousOldBest = oldBest; // Determine the best of M and Ix if (match > *insertQrow) { *bestRow = match; // Calculate new Ix *insertQrow = maximum(match - parameters_semiGappedOpenGap, *insertQrow - parameters_semiGappedExtendGap); } else { *bestRow = *insertQrow; // Since M <= Ix, new Ix must derive from Ix *insertQrow -= parameters_semiGappedExtendGap; } // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } // Reset subjectCount subjectCount = parameters_semiGappedExtensionN; } subjectPosition--; bestRow--; insertQrow--; subjectCount--; } // -----SINGLE CELL LEFT OF ROW DROPOFF ----- if (!(bestScore > previousOldBest + dropoff) && (subjectPosition >= subject)) { // Set value for best *bestRow = match = previousOldBest + matrixColumn[*subjectPosition]; // Set DUMMY values for Ix *insertQrow = constants_gappedExtensionDummyValue; if (match + dropoff >= bestScore) { // Record dropoff position rowDropoff = subjectPosition; } } } // ************ Every Nth row we allow insertS else { // -----FAR RIGHT CELL----- // ** No insertQ allowed this column, this cell will only get a DUMMY // score if (subjectCount) { previousOldBest = *bestRow; *bestRow = constants_gappedExtensionDummyValue; // Score at this cell is below dropoff columnDropoff = subjectPosition; rightOfDropoff = 1; } // ** We are allowing insertQ this column else { // Record some old values previousOldBest = *bestRow; // Set Ix value *bestRow = *insertQrow; *insertQrow -= parameters_semiGappedExtendGap; // Set DUMMY value for Iy, which should never be used insertS = constants_gappedExtensionDummyValue; // If score at current cell is below dropoff if (bestScore > *bestRow + dropoff) { // Record dropoff position columnDropoff = subjectPosition; rightOfDropoff = 1; } 
else { // We are left of the column dropoff for this row rightOfDropoff = 0; } // Reset subjectCount subjectCount = parameters_semiGappedExtensionN; } subjectPosition--; bestRow--; insertQrow--; subjectCount--; // -----CELLS RIGHT OF ROW DROPOFF----- while (subjectPosition >= rowDropoff) { // ** We are not allowing insertQ this column if (subjectCount) { // Remember old M value (for cell below this one) oldBest = *bestRow; match = matrixColumn[*subjectPosition] + previousOldBest; previousOldBest = oldBest; // Determine the best of M and Iy if (match > insertS) { *bestRow = match; // Calculate new Iy insertS = maximum(match - parameters_semiGappedOpenGap, insertS - parameters_semiGappedExtendGap); } else { *bestRow = insertS; // Since M <= Iy, new Iy must derive from Iy insertS -= parameters_semiGappedExtendGap; } // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } // If score at current cell (and cells to its right) are below dropoff if (rightOfDropoff) { if (bestScore > *bestRow + dropoff) { // Record dropoff position columnDropoff = subjectPosition; } else { // We are left of the column dropoff for this row rightOfDropoff = 0; } } } // ** We are allowing insertQ this column else { // Remember old M value (for cell below this one) oldBest = *bestRow; match = matrixColumn[*subjectPosition] + previousOldBest; previousOldBest = oldBest; // Determine the best of M, Ix and Iy if (match > insertS) { if (match > *insertQrow) { // Match is largest *bestRow = match; // Calculate new Ix *insertQrow = maximum(match - parameters_semiGappedOpenGap, *insertQrow - parameters_semiGappedExtendGap); // Calculate new Iy insertS = maximum(match - parameters_semiGappedOpenGap, insertS - parameters_semiGappedExtendGap); // If this is the best-yet scoring cell if (match > bestScore) { // Update best start cell data bestScore = match; 
bestQueryPosition = queryPosition; bestSubjectPosition = subjectPosition; } } else { // insertQ is largest *bestRow = *insertQrow; // Calculate new Ix *insertQrow -= parameters_semiGappedExtendGap; // Dummy Iy insertS = constants_gappedExtensionDummyValue; } } else { if (insertS > *insertQrow) { // insertS is largest *bestRow = insertS; // Dummy Ix *insertQrow = constants_gappedExtensionDummyValue; // Calculate new Iy insertS -= parameters_semiGappedExtendGap; } else { // insertQ is largest *bestRow = *insertQrow; // Calculate new Ix *insertQrow -= parameters_semiGappedExtendGap; // Dummy Iy insertS = constants_gappedExtensionDummyValue; } } // If score at current cell (and cells to its right) are below dropoff if (rightOfDropoff) { if (bestScore > *bestRow + dropoff) { // Record dropoff position columnDropoff = subjectPosition; } else { // We are left of the column dropoff for this row rightOfDropoff = 0; } } // Reset subjectCount subjectCount = parameters_semiGappedExtensionN; } subjectPosition--; bestRow--; insertQrow--; subjectCount--; } // -----SINGLE CELL LEFT OF ROW DROPOFF ----- if (!(bestScore > previousOldBest + dropoff) && (subjectPosition >= subject)) { // Calculate match value match = previousOldBest + matrixColumn[*subjectPosition]; // Set value for best *bestRow = maximum(match, insertS); // Calculate new Iy insertS = maximum(match - parameters_semiGappedOpenGap, insertS - parameters_semiGappedExtendGap); // Set DUMMY values for Ix *insertQrow = constants_gappedExtensionDummyValue; subjectPosition--; bestRow--; insertQrow--; } // -----CELLS LEFT OF ROW DROPOFF ----- if (!(bestScore > *(bestRow + 1) + dropoff)) { while (subjectPosition >= subject) { // Set value for Iy and best *bestRow = insertS; insertS = insertS - parameters_semiGappedExtendGap; // Set DUMMY values for Ix *insertQrow = constants_gappedExtensionDummyValue; // If score at current cell is below dropoff if (bestScore > *bestRow + dropoff) { // Stop processing row subjectPosition--; 
break; } subjectPosition--; bestRow--; insertQrow--; } } // Record dropoff position rowDropoff = subjectPosition + 1; // Clear insertS for next row insertS = constants_gappedExtensionDummyValue; // Reset queryCount queryCount = parameters_semiGappedExtensionN; } // if (dloc == 746829265) // print(semiGappedScoring_bestRow, subject, rowDropoff, // columnDropoff); } dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix; dpResults.best.subjectOffset = bestSubjectPosition - subject; dpResults.bestScore = bestScore; dpResults.traceback = NULL; return dpResults; }
// Convert a CAFE scoring matrix into a BLAST score matrix.
//
// cafeMatrix     - square matrix of scores; row/column i corresponds to the
//                  letter 'A' + i
// cafeMatrixSize - number of rows/columns in cafeMatrix
//
// Returns a scoreMatrix whose rows live in a single global_malloc'd allocation
// (row pointers first, then the row data). Cells that receive no CAFE score
// are given 1 on the diagonal and the lowest CAFE score elsewhere; anything
// paired with the sentinal code gets constants_sentinalScore.
struct scoreMatrix scoreMatrix_convertCafe(int4 **cafeMatrix,
                                           int4 cafeMatrixSize) {
  struct scoreMatrix scoreMatrix;
  char *tempAddress;
  int4 x, y;
  int4 rowCode, columnCode;

  scoreMatrix.highestValue = 0;
  scoreMatrix.lowestValue = 0;
  // No average is calculated for converted matrices; give the field a defined
  // value rather than returning an indeterminate one
  scoreMatrix.averageMatchScore = 0;

  // Declare memory used by scoreMatrix: an array of row pointers followed
  // immediately by the rows themselves
  scoreMatrix.matrix = (int2 **)global_malloc(
      sizeof(int2 *) * encoding_numCodes +
      sizeof(int2) * encoding_numCodes * encoding_numCodes);
  tempAddress = (char *)scoreMatrix.matrix;

  for (x = 0; x < encoding_numCodes; x++) {
    scoreMatrix.matrix[x] =
        (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes +
                 sizeof(int2) * encoding_numCodes * x);

    // Initialize the score matrix, by setting all values to sentinal score
    for (y = 0; y < encoding_numCodes; y++) {
      scoreMatrix.matrix[x][y] = constants_sentinalScore;
    }
  }

  // Go through the CAFE scoring matrix
  for (x = 0; x < cafeMatrixSize; x++) {
    rowCode = encoding_getCode('A' + x);

    // Rows for letters that are not valid amino acids are ignored
    if (rowCode == encoding_unknownCode)
      continue;

    for (y = 0; y < cafeMatrixSize; y++) {
      columnCode = encoding_getCode('A' + y);

      // Likewise for columns
      if (columnCode == encoding_unknownCode)
        continue;

      // Copy value to BLAST scoring matrix
      scoreMatrix.matrix[rowCode][columnCode] = cafeMatrix[x][y];

      // Update highest and lowest values
      if (cafeMatrix[x][y] > scoreMatrix.highestValue)
        scoreMatrix.highestValue = cafeMatrix[x][y];
      if (cafeMatrix[x][y] < scoreMatrix.lowestValue)
        scoreMatrix.lowestValue = cafeMatrix[x][y];
    }
  }

  // For cells in the score matrix that did not receive a score, use 1 and
  // lowestValue instead
  for (x = 0; x < encoding_numCodes; x++) {
    for (y = 0; y < encoding_numCodes; y++) {
      if (scoreMatrix.matrix[x][y] == constants_sentinalScore) {
        scoreMatrix.matrix[x][y] = (x == y) ? 1 : scoreMatrix.lowestValue;
      }
    }
  }

  // Every letter scores poorly against the sentinal code
  for (x = 0; x < encoding_numCodes; x++) {
    scoreMatrix.matrix[x][encoding_sentinalCode] = constants_sentinalScore;
    scoreMatrix.matrix[encoding_sentinalCode][x] = constants_sentinalScore;
  }

  return scoreMatrix;
}
// Load the score matrix (eg. BLOSUM62) from disk and return contents in an // array // 25 by 25 for the 25 possible amino acids (actually 20 plus 3 wilds, 1 // unknown, // and a sentinal code which scores poorly, and flanks sequences) struct scoreMatrix scoreMatrix_load(char *filename) { FILE *matrixFile; int4 MAXLINELENGTH = 8096; int4 lineNumber = 0; int4 tokenCount; char line[MAXLINELENGTH]; char *token, *tempAddress; unsigned char columnHeadings[24]; unsigned char rowHeadings[24]; int2 value; struct scoreMatrix scoreMatrix; int4 x, y; scoreMatrix.highestValue = 0; scoreMatrix.lowestValue = 0; // Declare memory used by scoreMatrix scoreMatrix.matrix = (int2 **)global_malloc( sizeof(int2 *) * encoding_numCodes + sizeof(int2) * encoding_numCodes * encoding_numCodes); tempAddress = (char *)scoreMatrix.matrix; x = 0; while (x < encoding_numCodes) { scoreMatrix.matrix[x] = (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes + sizeof(int2) * encoding_numCodes * x); // Initialize the score matrix, by setting all values to sentinal score y = 0; while (y < encoding_numCodes) { scoreMatrix.matrix[x][y] = constants_sentinalScore; y++; } x++; } // Open file for reading if ((matrixFile = fopen(filename, "r")) == NULL) { fprintf(stderr, "%s\n", strerror(errno)); fprintf(stderr, "Error opening matrix file %s for reading\n", filename); exit(-1); } // Read each line in turn while (fgets(line, MAXLINELENGTH, matrixFile) != NULL) { // Check we didn't max out the buffer if (strlen(line) >= MAXLINELENGTH - 1) { fprintf(stderr, "%s\n", strerror(errno)); fprintf(stderr, "Error reading file %s: maximum line length %d exceeded\n", filename, MAXLINELENGTH); exit(-1); } // Check not a comment or blank line if (line[0] != '\0' && line[0] != '#' && lineNumber < 25) { // Read each of the space seperated tokens from the line tokenCount = 0; token = strtok(line, " \n"); while (token != NULL && tokenCount < 25) { // First line - tokens are column headings if (lineNumber == 0) { 
columnHeadings[tokenCount] = token[0]; } // Subsequent lines, first token is row heading else if (tokenCount == 0) { rowHeadings[lineNumber - 1] = token[0]; } // Subsequent lines, subsequent tokens are array values else { // Get integer value of token value = atoi(token); // Add to scoring matrix scoreMatrix.matrix[encoding_getCode(rowHeadings[lineNumber - 1])] [encoding_getCode(columnHeadings[tokenCount - 1])] = value; // Determine the highest and lowest values in the matrix if (value > scoreMatrix.highestValue) { scoreMatrix.highestValue = value; } if (value < scoreMatrix.lowestValue) { scoreMatrix.lowestValue = value; } } token = strtok(NULL, " \n"); tokenCount++; } lineNumber++; } } fclose(matrixFile); // For cells in the score matrix that did not recieve a score, use 1 and // lowestValue instead x = 0; while (x < encoding_numCodes) { y = 0; while (y < encoding_numCodes) { if (scoreMatrix.matrix[x][y] == constants_sentinalScore) { if (x == y) { scoreMatrix.matrix[x][y] = 1; } else { scoreMatrix.matrix[x][y] = scoreMatrix.lowestValue; } } y++; } x++; } // Every letter scores well against the wildcard x = 0; while (x < encoding_numCodes) { scoreMatrix.matrix[x][encoding_aaStartWildcards] = 1; scoreMatrix.matrix[encoding_aaStartWildcards][x] = 1; x++; } // Every letter scores poorly against the sentinal code x = 0; while (x < encoding_numCodes) { scoreMatrix.matrix[x][encoding_sentinalCode] = constants_sentinalScore; scoreMatrix.matrix[encoding_sentinalCode][x] = constants_sentinalScore; x++; } // Process wildcard scores y = 0; while (y < wildcards_numClusterWildcards) { x = 0; while (x < encoding_numLetters) { scoreMatrix.matrix[y + encoding_aaStartWildcards][x] = wildcards_clusterWildcards[y].scoreMatrixRow[x]; scoreMatrix.matrix[x][y + encoding_aaStartWildcards] = wildcards_clusterWildcards[y].scoreMatrixRow[x]; x++; } y++; } // Calculate average match score for two residues scoreMatrix.averageMatchScore = 0; y = 0; while (y < encoding_numLetters) { x = 0; 
while (x < encoding_numLetters) { scoreMatrix.averageMatchScore += scoreMatrix.matrix[y][x] * Robinson_prob[x] * Robinson_prob[y]; x++; } y++; } scoreMatrix.averageMatchScore /= 1000000; return scoreMatrix; }
// Create a nucleotide scoring matrix use match and mismatch penalties struct scoreMatrix scoreMatrix_create(int2 match, int2 mismatch) { struct scoreMatrix scoreMatrix; char *tempAddress; int4 x, y, numXcodes, numYcodes, count, xCode, yCode, total; scoreMatrix.highestValue = match; scoreMatrix.lowestValue = mismatch; // Declare memory used by scoreMatrix scoreMatrix.matrix = (int2 **)global_malloc( sizeof(int2 *) * encoding_numCodes + sizeof(int2) * encoding_numCodes * encoding_numCodes); tempAddress = (char *)scoreMatrix.matrix; // For each row in the matrix x = 0; while (x < encoding_numCodes) { // Initialize memory scoreMatrix.matrix[x] = (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes + sizeof(int2) * encoding_numCodes * x); // For each column, determine value y = 0; while (y < encoding_numCodes) { // If either is the sentinal code, use sentinal score if (x == encoding_sentinalCode || y == encoding_sentinalCode) { scoreMatrix.matrix[x][y] = constants_sentinalScore; } // If both characters are wilds, calculate score for match else if (x >= encoding_numRegularLetters && y >= encoding_numRegularLetters) { // For each possible letter for x total = 0; count = 0; xCode = 0; numXcodes = encoding_wildcards[x].numCodes; while (xCode < numXcodes) { // For each possible letter for y yCode = 0; numYcodes = encoding_wildcards[y].numCodes; while (yCode < numYcodes) { // If the letters match if (encoding_wildcards[x].replacementCodes[xCode] == encoding_wildcards[y].replacementCodes[yCode]) { count++; } total++; yCode++; } xCode++; } // Calculate frequency of the letters matching and probably score scoreMatrix.matrix[x][y] = (int4)ceilf( ((float)match * (float)count / (float)total) + ((float)mismatch * (float)(total - count) / (float)total)); } // If the characters match else if (x == y) { scoreMatrix.matrix[x][y] = match; } // Mismatch else { scoreMatrix.matrix[x][y] = mismatch; // If y is in x's list of ambigious codes count = 0; numXcodes = 
encoding_wildcards[x].numCodes; while (count < numXcodes) { if (encoding_wildcards[x].replacementCodes[count] == y) { // Give score based on probability of a match scoreMatrix.matrix[x][y] = (int4)ceilf( ((float)match / (float)numXcodes) + ((float)mismatch * (float)(numXcodes - 1) / (float)numXcodes)); } count++; } // Similarly if x is in y's list of ambigious codes count = 0; numYcodes = encoding_wildcards[y].numCodes; while (count < numYcodes) { if (encoding_wildcards[y].replacementCodes[count] == x) { // Give score based on probability of a match scoreMatrix.matrix[x][y] = (int4)ceilf( ((float)match / (float)numYcodes) + ((float)mismatch * (float)(numYcodes - 1) / (float)numYcodes)); } count++; } } y++; } x++; } return scoreMatrix; }
// Perform dynamic programming to explore possible start points and alignments
// that end at the given seed and find the best score.
//
// subject   - byte-packed subject sequence; positions are addressed as a byte
//             plus a bytePosition in 0..3 (see the /4 and %4 below)
// PSSMatrix - query matrix; (*queryPosition)[subjectChar] is the score of a
//             query column against a subject base
// seed      - cell the alignment must end at; processing starts at the cell
//             (seed.queryOffset - 1, seed.subjectOffset - 1) and moves
//             right-to-left through the query and backwards through the subject
// dropoff   - a cell is "below dropoff" when bestScore exceeds its score by
//             more than this amount
//
// Returns the best score found and the cell it was found at; traceback is
// always NULL. If either seed offset is 0 a zero-score result at (0, 0) is
// returned immediately.
//
// NOTE(review): best.subjectOffset is computed as a difference of byte
// pointers into the packed subject, so it counts bytes rather than bases -
// confirm callers expect that.
//
// Side effects: uses (and may free and reallocate) the file-level row buffers
// nuGappedScoring_bestRow / nuGappedScoring_insertQrow, which are indexed by
// query offset.
struct dpResults nuGappedScoring_dpBeforeSeed(unsigned char *subject,
                                              struct PSSMatrix PSSMatrix,
                                              struct coordinate seed,
                                              int4 dropoff) {
  int2 **queryPosition, **bestQueryPosition;
  int2 **rowDropoff, **columnDropoff;
  unsigned char *subjectPosition, *bestSubjectPosition, subjectChar;
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 queryDistance;
  int4 oldBest, match, previousOldBest;
  unsigned char rightOfDropoff;
  struct dpResults dpResults;
  int4 bytePosition;

  // Nothing lies before a seed on the first row or column
  if (seed.queryOffset == 0 || seed.subjectOffset == 0) {
    dpResults.best.queryOffset = 0;
    dpResults.best.subjectOffset = 0;
    dpResults.bestScore = bestScore;
    dpResults.traceback = NULL;
    return dpResults;
  }

  // Declare processing rows for storing match, insert-subject and insert-query
  // values
  // If current malloced rows aren't big enough
  if (seed.queryOffset >= nuGappedScoring_rowSizes) {
    // Free existing rows
    free(nuGappedScoring_bestRow);
    free(nuGappedScoring_insertQrow);

    // Set size to double current needed length
    nuGappedScoring_rowSizes = (seed.queryOffset) * 2;

    // Malloc new rows
    nuGappedScoring_bestRow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
    nuGappedScoring_insertQrow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
  }

  // Convert subject offset to point to bytepacked subject: the byte holding
  // the base, and the position of the base within that byte
  bytePosition = (seed.subjectOffset - 1) % 4;
  bestSubjectPosition = subjectPosition =
      subject + ((seed.subjectOffset - 1) / 4);
  bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1;

  // Initialize row pointers so they track queryPosition
  rowOffset = (queryPosition - PSSMatrix.matrix);
  bestRow = nuGappedScoring_bestRow + rowOffset;
  insertQrow = nuGappedScoring_insertQrow + rowOffset;

  // Set initial row dropoff and column dropoff. Each later row is processed
  // from columnDropoff - 1 leftwards down to rowDropoff.
  rowDropoff = PSSMatrix.matrix;
  columnDropoff = PSSMatrix.matrix + seed.queryOffset;

  // -----FIRST ROW-----
  subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

  // -----FIRST CELL-----
  // Set M value for bottom-right cell
  match = (*queryPosition)[subjectChar];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_openGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Update best start cell data
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  queryDistance = 0;
  queryPosition--;
  bestRow--;
  insertQrow--;

  // ----- REMAINING CELLS -----
  // For each remaining column in the bottom row, scanning from right-to-left
  while (queryPosition >= PSSMatrix.matrix) {
    // Set value for M: the cell's own score less the cost of a gap that grows
    // by one extension for each column already passed
    match = (*queryPosition)[subjectChar] - parameters_openGap -
            queryDistance * parameters_extendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS = maximum(match - parameters_openGap,
                        insertS - parameters_extendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_extendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Update best start cell data
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = queryPosition;
      // And stop processing row
      break;
    }

    queryPosition--;
    bestRow--;
    insertQrow--;
    queryDistance++;
  }

  // Clear insertS for next row
  insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
  if (parameters_verboseDloc == blast_dloc)
    nuGappedScoring_printBeforeRow(nuGappedScoring_bestRow, PSSMatrix.matrix,
                                   rowDropoff, columnDropoff);
#endif

  // -----REMAINING ROWS-----
  // Continue until the two dropoff positions meet, or the subject runs out
  while (rowDropoff < columnDropoff) {
    // Move to next subject characters
    if (bytePosition) {
      // Next char in current byte
      bytePosition--;
    } else {
      // Involves moving to next byte
      bytePosition = 3;
      subjectPosition--;

      // Stop once we have moved before the first byte of the subject
      if (subjectPosition < subject)
        break;
    }

    // Extract the subject characters
    subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

    // Start this row at the rightmost column not yet below dropoff
    queryPosition = columnDropoff - 1;

    // Reset row pointers to start of rows
    rowOffset = (queryPosition - PSSMatrix.matrix);
    bestRow = nuGappedScoring_bestRow + rowOffset;
    insertQrow = nuGappedScoring_insertQrow + rowOffset;

    // -----FAR RIGHT CELL-----
    // Record some old values; previousOldBest carries the previous row's best
    // score for the column to the right of the one being calculated
    previousOldBest = *bestRow;

    // Ix is the best
    *bestRow = *insertQrow;

    // Calculate new Ix value
    *insertQrow -= parameters_extendGap;

    // Set DUMMY value for Iy, which should never be used
    insertS = constants_gappedExtensionDummyValue;

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      columnDropoff = queryPosition;
      rightOfDropoff = 1;
    } else {
      // We are left of the column dropoff for this row
      rightOfDropoff = 0;
    }

    queryPosition--;
    bestRow--;
    insertQrow--;

  // -----CELLS RIGHT OF ROW DROPOFF-----
  // The cells are handled by two loops: Loop 1 runs while insertS holds the
  // dummy value (so only M and Ix are compared), Loop 2 while insertS holds a
  // real value. Loop 2 jumps back here when insertS is reset to the dummy.
  start1:
    // Loop 1 when insertS has no value
    while (queryPosition >= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M and Ix
      if (match > *insertQrow) {
        // Match is largest
        *bestRow = match;

        // Calculate new Ix
        *insertQrow = maximum(match - parameters_openGap,
                              *insertQrow - parameters_extendGap);

        // Calculate new Iy
        insertS = maximum(match - parameters_openGap,
                          insertS - parameters_extendGap);

        // If this is the best-yet scoring cell
        if (match > bestScore) {
          // Update best start cell data
          bestScore = match;
          bestQueryPosition = queryPosition;
          bestSubjectPosition = subjectPosition;
        }

        // If score at current cell (and cells to its right) are below dropoff
        if (rightOfDropoff) {
          if (bestScore > *bestRow + dropoff) {
            // Record dropoff position
            columnDropoff = queryPosition;
          } else {
            // We are left of the column dropoff for this row
            rightOfDropoff = 0;
          }
        }

        queryPosition--;
        bestRow--;
        insertQrow--;

        // InsertS now has a value, so continue with Loop 2
        break;
      } else {
        // insertQ is largest
        *bestRow = *insertQrow;

        // Calculate new Ix
        *insertQrow -= parameters_extendGap;
      }

      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;
    }

    // Loop2 whilst insertS does have a value
    while (queryPosition >= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M, Ix and Iy
      if (match > insertS) {
        if (match > *insertQrow) {
          // Match is largest
          *bestRow = match;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap,
                                *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap,
                            insertS - parameters_extendGap);

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // insertS loses its value; leave Loop 2 without the dropoff check
          // or pointer updates, which are done after the loop
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      } else {
        if (insertS > *insertQrow) {
          // insertS is largest
          *bestRow = insertS;

          // Dummy Ix
          *insertQrow = constants_gappedExtensionDummyValue;

          // Calculate new Iy
          insertS -= parameters_extendGap;
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // insertS loses its value; leave Loop 2 without the dropoff check
          // or pointer updates, which are done after the loop
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      }

      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;
    }

    // Only true when Loop 2 was left through one of its breaks: finish the
    // cell it stopped on, then resume with Loop 1
    if (queryPosition >= rowDropoff) {
      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;

      goto start1;
    }

    // -----CELLS LEFT OF ROW DROPOFF -----
    // Only entered when the last cell processed is not below dropoff; these
    // cells can only be reached through Iy
    if (!(bestScore > *(bestRow + 1) + dropoff)) {
      while (queryPosition >= PSSMatrix.matrix) {
        // Set value for Iy and best
        *bestRow = insertS;
        insertS = insertS - parameters_extendGap;

        // Set DUMMY values for Ix
        *insertQrow = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // Stop processing row
          queryPosition--;
          break;
        }

        queryPosition--;
        bestRow--;
        insertQrow--;
      }
    }

    // Record dropoff position
    rowDropoff = queryPosition + 1;

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
    if (parameters_verboseDloc == blast_dloc)
      nuGappedScoring_printBeforeRow(nuGappedScoring_bestRow, PSSMatrix.matrix,
                                     rowDropoff, columnDropoff);
#endif
  }

  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;
  return dpResults;
}
// Perform dynamic programming to explore possible END points and alignments
// that start at the given seed and find the best score.
//
// subject       - byte-packed subject sequence; positions are addressed as a
//                 byte plus a bytePosition in 0..3
// PSSMatrix     - query matrix; PSSMatrix.length columns are available and
//                 (*queryPosition)[subjectChar] is the score of a query column
//                 against a subject base
// dropoff       - a cell is "below dropoff" when bestScore exceeds its score
//                 by more than this amount
// subjectLength - number of subject bases; processing stops at byte
//                 subjectLength / 4, byte position subjectLength % 4
//
// Processing starts at query column 1 and at byte position 1 of the first
// subject byte, and moves left-to-right through the query and forwards through
// the subject.
// NOTE(review): presumably the caller passes subject and PSSMatrix adjusted so
// that position 0 is the seed itself - confirm against the caller.
//
// Returns the best score found and the cell it was found at; traceback is
// always NULL.
//
// NOTE(review): best.subjectOffset is computed as a difference of byte
// pointers into the packed subject, so it counts bytes rather than bases -
// confirm callers expect that.
//
// Side effects: uses (and may free and reallocate) the file-level row buffers
// nuGappedScoring_bestRow / nuGappedScoring_insertQrow, which are indexed by
// query offset.
struct dpResults nuGappedScoring_dpAfterSeed(unsigned char *subject,
                                             struct PSSMatrix PSSMatrix,
                                             int4 dropoff,
                                             int4 subjectLength) {
  int2 **queryPosition, **bestQueryPosition, **queryEnd;
  int2 **rowDropoff, **columnDropoff;
  unsigned char *subjectPosition, *bestSubjectPosition, *subjectEnd,
      subjectChar;
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 queryDistance;
  int4 oldBest, match, previousOldBest;
  unsigned char leftOfDropoff;
  int4 queryLength;
  struct dpResults dpResults;
  int4 bytePosition, subjectEndBytePosition;

  // Work out where the query and the packed subject end
  queryLength = PSSMatrix.length;
  queryEnd = PSSMatrix.matrix + queryLength;
  subjectEnd = subject + (subjectLength / 4);
  subjectEndBytePosition = subjectLength % 4;

  // Declare processing rows for storing match, insert-subject and insert-query
  // values
  // If current malloced rows aren't big enough
  if (queryLength >= nuGappedScoring_rowSizes) {
    // Free existing rows
    free(nuGappedScoring_bestRow);
    free(nuGappedScoring_insertQrow);

    // Set size to double current needed length
    nuGappedScoring_rowSizes = queryLength * 2;

    // Malloc new rows
    nuGappedScoring_bestRow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
    nuGappedScoring_insertQrow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
  }

  // Start at the second base of the first subject byte and the second query
  // column
  bestSubjectPosition = subjectPosition = subject;
  bytePosition = 1;
  bestQueryPosition = queryPosition = PSSMatrix.matrix + 1;

  // Initialize rows so they track queryPosition
  bestRow = nuGappedScoring_bestRow + 1;
  insertQrow = nuGappedScoring_insertQrow + 1;

  // Set initial row dropoff and column dropoff. Each later row is processed
  // from columnDropoff + 1 rightwards up to rowDropoff.
  rowDropoff = PSSMatrix.matrix + queryLength - 1;
  columnDropoff = PSSMatrix.matrix;

  // -----FIRST ROW-----
  subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

  // -----FIRST CELL-----
  // Set M value for top-left cell
  match = (*queryPosition)[subjectChar];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_openGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Update best end cell data
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  queryDistance = 0;
  queryPosition++;
  bestRow++;
  insertQrow++;

  // ----- REMAINING CELLS -----
  // For each remaining columns in the top row, scanning from left-to-right
  while (queryPosition < queryEnd) {
    // Set value for M: the cell's own score less the cost of a gap that grows
    // by one extension for each column already passed
    match = (*queryPosition)[subjectChar] - parameters_openGap -
            queryDistance * parameters_extendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS = maximum(match - parameters_openGap,
                        insertS - parameters_extendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_extendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Update best end cell data
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = queryPosition;
      // And stop processing row
      break;
    }

    queryPosition++;
    bestRow++;
    insertQrow++;
    queryDistance++;
  }

  // Clear insertS for next row
  insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
  if (parameters_verboseDloc == blast_dloc)
    nuGappedScoring_printAfterRow(nuGappedScoring_bestRow + 1, PSSMatrix.matrix,
                                  rowDropoff, columnDropoff);
#endif

  // Move to next subject character
  if (bytePosition == 3) {
    bytePosition = 0;
    subjectPosition++;
  } else {
    bytePosition++;
  }

  // -----REMAINING ROWS-----
  // Continue until the two dropoff positions meet, or the subject runs out
  while (rowDropoff > columnDropoff) {
    // Stop at end of subject
    if (subjectPosition == subjectEnd &&
        bytePosition == subjectEndBytePosition)
      break;

    subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

    // Start this row at the leftmost column not yet below dropoff
    queryPosition = columnDropoff + 1;

    // Reset rows
    rowOffset = (queryPosition - PSSMatrix.matrix);
    bestRow = nuGappedScoring_bestRow + rowOffset;
    insertQrow = nuGappedScoring_insertQrow + rowOffset;

    // -----FAR LEFT CELL-----
    // Record some old values; previousOldBest carries the previous row's best
    // score for the column to the left of the one being calculated
    previousOldBest = *bestRow;

    // Ix is the best
    *bestRow = *insertQrow;

    // Calculate new Ix value
    *insertQrow -= parameters_extendGap;

    // Set DUMMY value for Iy, which should never be used
    insertS = constants_gappedExtensionDummyValue;

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      columnDropoff = queryPosition;
      leftOfDropoff = 1;
    } else {
      // This cell is within dropoff, so the column dropoff stays where it is
      // for the rest of this row
      leftOfDropoff = 0;
    }

    queryPosition++;
    bestRow++;
    insertQrow++;

  // -----CELLS LEFT OF ROW DROPOFF-----
  // The cells are handled by two loops: Loop 1 runs while insertS holds the
  // dummy value (so only M and Ix are compared), Loop 2 while insertS holds a
  // real value. Loop 2 jumps back here when insertS is reset to the dummy.
  start2:
    // Loop 1 when insertS has no value
    while (queryPosition <= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M and Ix
      if (match > *insertQrow) {
        // Match is largest
        *bestRow = match;

        // Calculate new Ix
        *insertQrow = maximum(match - parameters_openGap,
                              *insertQrow - parameters_extendGap);

        // Calculate new Iy
        insertS = maximum(match - parameters_openGap,
                          insertS - parameters_extendGap);

        // If this is the best-yet scoring cell
        if (match > bestScore) {
          // Update best end cell data
          bestScore = match;
          bestQueryPosition = queryPosition;
          bestSubjectPosition = subjectPosition;
        }

        // If score at current cell (and cells to its left) are below dropoff
        if (leftOfDropoff) {
          if (bestScore > *bestRow + dropoff) {
            // Record dropoff position
            columnDropoff = queryPosition;
          } else {
            // This cell is within dropoff; stop advancing the column dropoff
            leftOfDropoff = 0;
          }
        }

        queryPosition++;
        bestRow++;
        insertQrow++;

        // InsertS now has a value, so continue with Loop 2
        break;
      } else {
        // insertQ is largest
        *bestRow = *insertQrow;

        // Calculate new Ix
        *insertQrow -= parameters_extendGap;
      }

      // If score at current cell (and cells to its left) are below dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;
    }

    // Loop2 whilst insertS does have a value
    while (queryPosition <= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M, Ix and Iy
      if (match > insertS) {
        if (match > *insertQrow) {
          // Match is largest
          *bestRow = match;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap,
                                *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap,
                            insertS - parameters_extendGap);

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best end cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // insertS loses its value; leave Loop 2 without the dropoff check
          // or pointer updates, which are done after the loop
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      } else {
        if (insertS > *insertQrow) {
          // insertS is largest
          *bestRow = insertS;

          // Dummy Ix
          *insertQrow = constants_gappedExtensionDummyValue;

          // Calculate new Iy
          insertS -= parameters_extendGap;
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // insertS loses its value; leave Loop 2 without the dropoff check
          // or pointer updates, which are done after the loop
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      }

      // If score at current cell (and cells to its left) are below dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;
    }

    // Only true when Loop 2 was left through one of its breaks: finish the
    // cell it stopped on, then resume with Loop 1
    if (queryPosition <= rowDropoff) {
      // If score at current cell (and cells to its left) are below dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;

      goto start2;
    }

    // -----CELLS RIGHT OF ROW DROPOFF -----
    // Only entered when the last cell processed is not below dropoff; these
    // cells can only be reached through Iy
    if (!(bestScore > *(bestRow - 1) + dropoff)) {
      while (queryPosition < queryEnd) {
        // Set value for Iy and best
        *bestRow = insertS;
        insertS = insertS - parameters_extendGap;

        // Set DUMMY value for Ix, which should never be used
        *insertQrow = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // And stop processing row
          queryPosition++;
          break;
        }

        queryPosition++;
        bestRow++;
        insertQrow++;
      }
    }

    // Record dropoff position
    rowDropoff = queryPosition - 1;

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

    // Move to next subject character
    if (bytePosition == 3) {
      bytePosition = 0;
      subjectPosition++;
    } else {
      bytePosition++;
    }

#ifdef VERBOSE
    if (parameters_verboseDloc == blast_dloc)
      nuGappedScoring_printAfterRow(nuGappedScoring_bestRow + 1,
                                    PSSMatrix.matrix, rowDropoff,
                                    columnDropoff);
#endif
  }

  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;
  return dpResults;
}
// Load a single subject into memory int4 unpack_loadSubject(struct PSSMatrix PSSMatrix, struct alignment *alignment) { uint4 totalCopied = 0; unsigned char *subject, *edits, *endEdits; struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion; int4 numRegions, regionStart, regionEnd; // If protein search if (encoding_alphabetType == encoding_protein) { // Make copy of sequence subject = (unsigned char *)global_malloc(sizeof(unsigned char) * alignment->encodedLength); subject++; memcpy(subject - 1, alignment->subject - 1, alignment->encodedLength); alignment->subject = subject; blast_totalCopied += alignment->encodedLength; } // If a nucleotide search else { // Get a list of regions to copy numRegions = unpack_getRegions(PSSMatrix, alignment, 1, unpack_subjectRegions); lastRegion = memBlocks_getLastEntry(unpack_subjectRegions); lastRegion++; firstRegion = lastRegion - numRegions; #ifdef VERBOSE if (parameters_verboseDloc == alignment->descriptionLocation) { printf("%d regions for subject\n", lastRegion - firstRegion); fflush(stdout); } #endif // Copy each region into memory currentRegion = firstRegion; while (currentRegion < lastRegion) { #ifdef VERBOSE if (parameters_verboseDloc == alignment->descriptionLocation) { printf("Load region %d to %d into memory\n", currentRegion->startOffset, currentRegion->endOffset); fflush(stdout); fflush(stdout); } #endif regionStart = currentRegion->startOffset / 4; regionEnd = (currentRegion->endOffset + 3) / 4; currentRegion->unpackedSubject = NULL; currentRegion->subject = (unsigned char *)global_malloc( sizeof(unsigned char) * (regionEnd - regionStart)); totalCopied += regionEnd - regionStart; memcpy(currentRegion->subject, alignment->subject + regionStart, regionEnd - regionStart); currentRegion->subject -= regionStart; currentRegion->subjectLength = alignment->subjectLength; blast_totalCopied += (regionEnd - regionStart); currentRegion++; } // Store new alignment regions alignment->unpackRegions = firstRegion; 
alignment->numUnpackRegions = lastRegion - firstRegion; // If there are edits for this subject if (alignment->edits != NULL) { edits = alignment->edits; endEdits = alignment->subject + alignment->encodedLength; // Make an in-memory copy of them alignment->edits = (unsigned char *)malloc(sizeof(char) * (endEdits - edits)); memcpy(alignment->edits, edits, endEdits - edits); } alignment->subject = NULL; } alignment->inMemorySubject = 1; return totalCopied; }
// Score-only gapped dynamic programming over the region that precedes a seed.
//
// Explores possible start points for alignments that end at the given seed and
// reports the best-scoring one. Processing starts at the cell
// (seed.queryOffset - 1, seed.subjectOffset - 1). Rows correspond to query
// positions and are visited towards the start of the query; within a row the
// subject is scanned right-to-left. Per cell three values are maintained:
//   M  - best score ending in a match (stored in bestRow as the cell's best),
//   Ix - insertQrow, carried from the row processed before this one,
//   Iy - insertS, carried from the cell to the right in the same row.
// A cell is "below dropoff" when bestScore > cell score + dropoff; rowDropoff
// and columnDropoff shrink the band that is processed on later rows.
//
// Parameters:
//   subject   - subject bytes; each byte is used directly as an index into a
//               PSSM column (matrixColumn[*subjectPosition])
//   PSSMatrix - only PSSMatrix.matrix is read: one column pointer per query
//               position
//   seed      - query/subject offsets of the seed
//   dropoff   - maximum allowed drop below the best score seen so far
//
// Returns a dpResults whose best.queryOffset / best.subjectOffset identify the
// best cell (the starting corner cell if no cell scores above 0), bestScore is
// its score, and traceback is NULL.
//
// Side effects: may free and reallocate the file-level work rows
// oldGappedScoring_bestRow / oldGappedScoring_insertQrow and update
// oldGappedScoring_rowSizes.
//
// NOTE(review): the first cell is addressed at row index seed.subjectOffset - 1,
// so this appears to assume seed.subjectOffset >= 1 (and seed.queryOffset >= 1)
// - confirm against callers.
struct dpResults oldGappedScoring_dpBeforeSeed(unsigned char* subject, struct PSSMatrix PSSMatrix,
                                               struct coordinate seed, int4 dropoff) {
  int2 **queryPosition, **bestQueryPosition;
  int2* matrixColumn;
  unsigned char *rowDropoff, *columnDropoff;
  unsigned char* subjectPosition, *bestSubjectPosition;
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 subjectDistance;
  int4 oldBest, match, previousOldBest;
  unsigned char rightOfDropoff;
  struct dpResults dpResults;

  // The processing rows hold the best (M) and insert-query (Ix) value for each
  // subject position; insert-subject (Iy) only needs the scalar insertS.
  // If the currently allocated rows aren't big enough
  if (seed.subjectOffset >= oldGappedScoring_rowSizes) {
    // Free existing rows (contents are not preserved)
    free(oldGappedScoring_bestRow);
    free(oldGappedScoring_insertQrow);

    // Set size to double the currently needed length
    oldGappedScoring_rowSizes = (seed.subjectOffset) * 2;

    // Allocate new rows
    oldGappedScoring_bestRow = (int4*)global_malloc(sizeof(int4) * oldGappedScoring_rowSizes);
    oldGappedScoring_insertQrow = (int4*)global_malloc(sizeof(int4) * oldGappedScoring_rowSizes);
  }

  // Start at the cell diagonally before the seed; it is also the default "best"
  bestSubjectPosition = subjectPosition = subject + seed.subjectOffset - 1;
  bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1;

  // Initialize row pointers so they track subjectPosition
  rowOffset = (subjectPosition - subject);
  bestRow = oldGappedScoring_bestRow + rowOffset;
  insertQrow = oldGappedScoring_insertQrow + rowOffset;

  // Set initial row dropoff (leftmost processed cell) and column dropoff
  // (one past the rightmost processed cell)
  rowDropoff = subject;
  columnDropoff = subject + seed.subjectOffset;

  // Using first column of query matrix
  matrixColumn = *queryPosition;

  // -----FIRST ROW-----

  // -----FIRST CELL-----
  // Set M value for bottom-right cell
  match = matrixColumn[*subjectPosition];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_openGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Update best start cell data
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  subjectDistance = 0;
  subjectPosition--;
  bestRow--;
  insertQrow--;

  // ----- REMAINING CELLS -----
  // For each remaining column in the bottom row, scanning from right-to-left
  while (subjectPosition >= subject) {
    // Set value for M: substitution score less one gap open and
    // subjectDistance gap extensions
    match = matrixColumn[*subjectPosition] - parameters_openGap -
            subjectDistance * parameters_extendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS = maximum(match - parameters_openGap, insertS - parameters_extendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_extendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Update best start cell data
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = subjectPosition;
      // And stop processing row
      break;
    }

    subjectPosition--;
    bestRow--;
    insertQrow--;
    subjectDistance++;
  }

  // Clear insertS for next row
  insertS = constants_gappedExtensionDummyValue;

  // Disabled debug hook:
  // if (dloc == 19063576)
  //   print(oldGappedScoring_bestRow, subject, rowDropoff, columnDropoff);

  // -----REMAINING ROWS-----
  // Continue until the query start is reached or the band has closed
  while (queryPosition > PSSMatrix.matrix && rowDropoff < columnDropoff) {
    queryPosition--;
    subjectPosition = columnDropoff - 1;

    // Reset row pointers to start of rows
    rowOffset = (subjectPosition - subject);
    bestRow = oldGappedScoring_bestRow + rowOffset;
    insertQrow = oldGappedScoring_insertQrow + rowOffset;

    // Using next column of query matrix
    matrixColumn = *queryPosition;

    // -----FAR RIGHT CELL-----
    // Remember the previous row's best here; it is the diagonal predecessor
    // of the next cell to the left
    previousOldBest = *bestRow;

    // Ix is the best
    *bestRow = *insertQrow;

    // Calculate new Ix value
    *insertQrow -= parameters_extendGap;

    // Set DUMMY value for Iy, which should never be used
    insertS = constants_gappedExtensionDummyValue;

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      columnDropoff = subjectPosition;
      rightOfDropoff = 1;
    } else {
      // We are left of the column dropoff for this row
      rightOfDropoff = 0;
    }

    subjectPosition--;
    bestRow--;
    insertQrow--;

    // -----CELLS RIGHT OF ROW DROPOFF-----
    while (subjectPosition >= rowDropoff) {
      // Remember old M value (for cell below this one)
      oldBest = *bestRow;

      // Calculate new M value from the diagonal predecessor
      match = matrixColumn[*subjectPosition] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M, Ix and Iy
      if (match > insertS) {
        if (match > *insertQrow) {
          // Match is largest
          *bestRow = match;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap, *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap, insertS - parameters_extendGap);

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap, insertS - parameters_extendGap);
        }
      } else {
        if (insertS > *insertQrow) {
          // insertS is largest
          *bestRow = insertS;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap, *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS -= parameters_extendGap;
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap, insertS - parameters_extendGap);
        }
      }

      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = subjectPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      subjectPosition--;
      bestRow--;
      insertQrow--;
    }

    // -----CELLS LEFT OF ROW DROPOFF -----
    // Only extend past the previous row's dropoff if the last cell written
    // (bestRow + 1) is not itself below dropoff
    if (!(bestScore > *(bestRow + 1) + dropoff)) {
      while (subjectPosition >= subject) {
        // Set value for Iy and best
        *bestRow = insertS;
        insertS = insertS - parameters_extendGap;

        // Set DUMMY values for Ix
        *insertQrow = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // Stop processing row; step once more so that the rowDropoff
          // computed below lands on this cell
          subjectPosition--;
          break;
        }

        subjectPosition--;
        bestRow--;
        insertQrow--;
        // NOTE(review): subjectDistance is not read again after the first row
        subjectDistance++;
      }
    }

    // Record dropoff position
    rowDropoff = subjectPosition + 1;

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

    // Disabled debug hook:
    // if (dloc == 20877970)
    //   print(oldGappedScoring_bestRow, subject, rowDropoff, columnDropoff);
  }

  // Convert the best cell's pointers back into offsets
  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;

  return dpResults;
}
// Build a gapped extension with a trace and nominal score from the seed point // of an ungapped // extension using dynamic programming struct gappedExtension * gappedExtension_build(struct ungappedExtension *ungappedExtension, struct PSSMatrix PSSMatrix, int4 subjectSize, unsigned char *subject, struct unpackRegion *unpackRegion, int4 dropoff) { struct coordinate seed; unsigned char *choppedSubject; struct dpResults beforeDpResults, afterDpResults; struct trace beforeTrace, afterTrace, trace; struct PSSMatrix choppedPSSMatrix; int4 choppedSubjectSize; struct gappedExtension *gappedExtension; int4 strandOffset = 0; // Perform dynamic programming for points before the seed seed = ungappedExtension->seed; if (seed.queryOffset > PSSMatrix.strandLength) { // If query position is in the second strand, remove first strand from PSSM strandOffset = PSSMatrix.strandLength; seed.queryOffset -= PSSMatrix.strandLength; PSSMatrix = PSSMatrix_chop(PSSMatrix, PSSMatrix.strandLength); } else { // Otherwise remove second strand PSSMatrix.length = PSSMatrix.strandLength; } beforeDpResults = gappedExtension_dpBeforeSeed(PSSMatrix, dropoff, seed, unpackRegion); // Trace back and create the trace which specifies the first half of the // alignment beforeTrace = gappedExtension_traceBeforeSeed(beforeDpResults, seed); // Chop the start off the query and subject so they begin at the seed choppedPSSMatrix = PSSMatrix_chop(PSSMatrix, seed.queryOffset); choppedSubject = subject + seed.subjectOffset; choppedSubjectSize = subjectSize - (seed.subjectOffset); // Perform dynamic programming for points after the seed afterDpResults = gappedExtension_dpAfterSeed(choppedPSSMatrix, dropoff, unpackRegion, choppedSubjectSize, seed.subjectOffset); // Trace back to get the trace for the seed onwards afterTrace = gappedExtension_traceAfterSeed(afterDpResults, choppedPSSMatrix.length); // Join afterTrace to the end of beforeTrace trace = gappedExtension_joinTraces(beforeTrace, afterTrace); 
free(afterTrace.traceCodes); // Adjust coordinates if extension was performed in the second strand afterDpResults.best.queryOffset += strandOffset; beforeDpResults.best.queryOffset += strandOffset; trace.queryStart += strandOffset; // Create gapped extension gappedExtension = (struct gappedExtension *)global_malloc(sizeof(struct gappedExtension)); gappedExtension->trace = trace; gappedExtension->next = NULL; // Start of afterTrace is end of the gapped extension, but we need to add seed // position // to get correct offset gappedExtension->queryEnd = seed.queryOffset + afterTrace.queryStart + strandOffset; gappedExtension->subjectEnd = seed.subjectOffset + afterTrace.subjectStart; // if (dloc == 88197331) // printf("final[%d,%d,%d](%d)\n", beforeDpResults.bestScore, // afterDpResults.bestScore, // choppedPSSMatrix.matrix[0][unpackRegion->unpackedSubject[seed.subjectOffset]], // seed.queryOffset); // Determine score by combining score from the two traces, and the match score // at // the seed position gappedExtension->nominalScore = beforeDpResults.bestScore + afterDpResults.bestScore + choppedPSSMatrix.matrix [0][unpackRegion->unpackedSubject[seed.subjectOffset]]; // Update ungappedExtension start/end ungappedExtension->start.queryOffset = trace.queryStart; ungappedExtension->end.queryOffset = gappedExtension->queryEnd; ungappedExtension->start.subjectOffset = trace.subjectStart; ungappedExtension->end.subjectOffset = gappedExtension->subjectEnd; ungappedExtension->nominalScore = gappedExtension->nominalScore; #ifdef VERBOSE if (parameters_verboseDloc == blast_dloc) { printf("Gapped Extension from %d,%d to %d,%d score %d\n", trace.queryStart, trace.subjectStart, gappedExtension->queryEnd, gappedExtension->subjectEnd, gappedExtension->nominalScore); } #endif return gappedExtension; }
// Process a given query position list void qPosList_processList(int2* queryPositions, int2 numQueryPositions, int4 codeword) { int4 listCount = 0, queryPositionCount, subset, present; struct memSingleBlock* list; struct queryPosition* queryPosition = NULL; struct codeword* newCodeword; // Iterative through existing query positions lists (ordered from longest to shortest) while (listCount < qPosList_numQPosLists) { // Check for one that contains a subset of to-be-added query positions list = qPosList_qPosLists + listCount; // Start by assuming it is subset = 1; // Iterate through each query position in the current existing list memSingleBlock_resetCurrent(list); while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset) { // Iterate through each query position in the new list (which is sorted) queryPositionCount = 0; while (queryPositionCount < numQueryPositions) { // Found a match, break out and proceed to next position in current list if (queryPosition->queryPosition == queryPositions[queryPositionCount]) { break; } // The query position is not present in the new list, then existing list // is not a subset of the new one else if (queryPosition->queryPosition < queryPositions[queryPositionCount]) { subset = 0; break; } // Otherwise keep going queryPositionCount++; } // If we got to the end of the list, and didn't find a match, not a subset if (queryPositionCount == numQueryPositions) subset = 0; // If the query positions in the existing list processed so far match all of // the positions in the new list if (list->currentEntry == numQueryPositions && subset) { // We have a match, starting here newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->codeword = codeword; newCodeword->next = queryPosition->codewords; queryPosition->codewords = newCodeword; return; } } if (subset) { // If this existing list is a subset of the new list then add the new/additional // query positions to the end of it queryPosition = 
memSingleBlock_getLastEntry(list); // Iterate through each query position in the new list while (numQueryPositions > 0) { numQueryPositions--; present = 0; // Check if present in the existing list memSingleBlock_resetCurrent(list); while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset) { // Found it if (queryPosition->queryPosition == queryPositions[numQueryPositions]) { present = 1; break; } } // Not present - add to the existing list with a null reference codeword if (!present) { queryPosition = memSingleBlock_newEntry(list); queryPosition->queryPosition = queryPositions[numQueryPositions]; // No refering codeword for any of the positions except the last queryPosition->codewords = NULL; } queryPositionCount++; } // Get the last, new query position queryPosition = memSingleBlock_getLastEntry(list); // Add reference codeword to the last query position (will become first) newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->next = NULL; newCodeword->codeword = codeword; queryPosition->codewords = newCodeword; // Re-sort the lists of query positions from longest to shortest qsort(qPosList_qPosLists, qPosList_numQPosLists, sizeof(struct memSingleBlock), qPosList_compareList); return; } listCount++; } // Instead use a new list of query positions list = qPosList_qPosLists + qPosList_numQPosLists; list->numEntries = 0; qPosList_numQPosLists++; // And copy values into it while (numQueryPositions > 0) { numQueryPositions--; queryPosition = memSingleBlock_newEntry(list); queryPosition->queryPosition = queryPositions[numQueryPositions]; // No refering codeword for any of the positions except the last queryPosition->codewords = NULL; } // Reference at the last query position (will become the first) to the // new query position list's codeword newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->next = NULL; newCodeword->codeword = codeword; queryPosition->codewords = newCodeword; // Sort the lists from longest to shortest 
qsort(qPosList_qPosLists, qPosList_numQPosLists, sizeof(struct memSingleBlock), qPosList_compareList); }
// chooseWilds: choose a set of cluster wildcards for a sequence database.
//
// Usage: chooseWilds <database> <Wildcard score constant> <Wildcards output file>
//
// Steps:
//   1. Open the database, print its statistics and load the scoring matrix.
//   2. For every protein sequence cluster, fetch its children and count
//      wildcard occurrences.
//   3. Greedily pick wildcards_numClusterWildcards - 1 candidates, each time
//      adding the wildcard that gives the highest wildcards_scoreCandidates().
//   4. Hill-climb: repeatedly try replacing each candidate with every known
//      wildcard and keep the best single replacement while it improves the
//      score.
//   5. Append the default wildcard (all letters set), score the final set and
//      write the result with wildcards_outputWildcards().
//
// Exits with -1 when fewer than three arguments are supplied; returns 0
// otherwise.
int4 main(int4 argc, char *argv[]) {
  unsigned char *filename, *sequence, code, *wildcardsFilename;
  uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
  uint4 encodedLength, numChildren, count;
  struct child *children;
  uint4 candidateNum, change, childNum;
  uint4 numWilds = 0;
  struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
  struct wild *wildSubset, *newCandidates, *bestNewCandidates;
  uint4 sizeWildSubset, numOccurences, numCandidates;
  float defaultWildscore, candidatesScore, bestScore;
  size_t candidateBytes;

  // User must provide FASTA format file at command line
  if (argc < 4) {
    fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                    "<Wildcards output file>\n");
    exit(-1);
  }
  filename = (unsigned char *)argv[1];
  wildcards_scoringConstant = atof(argv[2]);
  wildcardsFilename = (unsigned char *)argv[3];

  readdb_open(filename);

  printf("Number of clusters = %u\n", readdb_numberOfClusters);
  printf("Number of sequences = %u\n", readdb_numberOfSequences);
  printf("Number of volumes = %u\n", readdb_numberOfVolumes);
  printf("Total number of letters = %llu\n", readdb_numberOfLetters);
  printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
  printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

  // Initialize codes array
  encoding_initialize(readdb_dbAlphabetType);

  // Load score matrix
  parameters_findScoringMatrix();
  wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

  // Count occurences of each wildcard set
  wildcards_initializeCountOccurences(readdb_longestSequenceLength);
  do {
    // Read each sequence in the collection
    while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                               &descriptionLength, &encodedLength)) {
      // If a protein sequence cluster (encoded form is longer than the plain
      // sequence plus its two extra bytes)
      if (encoding_alphabetType == encoding_protein &&
          sequenceLength + 2 != encodedLength) {
        // Get the children
        children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                      descriptionStart, &numChildren);

        // Add to list of occurences
        wildcards_countOccurences(children, numChildren, sequenceLength);

        // Release the children; each sequence pointer is one byte past the
        // start of its allocation
        childNum = 0;
        while (childNum < numChildren) {
          free(children[childNum].edits);
          free(children[childNum].sequence - 1);
          childNum++;
        }
        free(children);
      }
    }
  } while (readdb_nextVolume());

  // Get final list of number of occurences of each wild
  wilds = wildcards_getOccurences(&numWilds);

  chooseWilds_printOccurenceMatrix(wilds, numWilds);

  // Build default wildcard: every letter of the alphabet set
  defaultWild.code = 0;
  defaultWild.count = 0;
  code = 0;
  while (code < encoding_numLetters) {
    setbit(defaultWild.code, code);
    code++;
  }

  // Get average score for default wildcard
  wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds, &sizeWildSubset,
                                   &numOccurences);
  defaultWildscore =
      wildcards_averageResidueWildMatch(defaultWild, wildSubset, sizeWildSubset);
  printf("defaultWildScore=%f occurences=%u\n", defaultWildscore, numOccurences);

  // Build up list of wildcard candidates
  candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                            wildcards_numClusterWildcards);

  // Defined fallback so that bestNewCandidate is never read uninitialised when
  // no wildcard scores above zero (or there are no wildcards at all)
  bestNewCandidate = defaultWild;

  numCandidates = 0;
  while (numCandidates < wildcards_numClusterWildcards - 1) {
    // Explore each possible option to add to list of candidates
    count = 0;
    bestScore = 0;
    while (count < numWilds) {
      // Tentatively place this wildcard in the next free slot
      candidates[numCandidates] = wilds[count];

      // Score a set of candidates
      candidatesScore = wildcards_scoreCandidates(candidates, numCandidates + 1,
                                                  wilds, numWilds, defaultWildscore);

      if (candidatesScore > bestScore) {
        bestScore = candidatesScore;
        bestNewCandidate = wilds[count];
      }

      count++;
    }

    printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
    wildcards_printWildcard(bestNewCandidate.code);
    candidates[numCandidates] = bestNewCandidate;

    numCandidates++;
  }

  newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                               wildcards_numClusterWildcards);
  bestNewCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                                   wildcards_numClusterWildcards);

  // Bytes occupied by the candidates chosen so far. The previous expression
  // `sizeof(struct wild) * wildcards_numClusterWildcards - 1` subtracted one
  // BYTE rather than one element, copying most of the still-unset last slot.
  candidateBytes = sizeof(struct wild) * numCandidates;

  // Perform hill climbing; consider changing each position
  // NOTE(review): if no replacement scores above zero while the current set
  // scores below zero, bestNewCandidates is copied without having been
  // written in this round - confirm scores are never negative.
  change = 1;
  while (change) {
    change = 0;
    candidateNum = 0;
    bestScore = 0;
    while (candidateNum < numCandidates) {
      // Start with current candidates
      memcpy(newCandidates, candidates, candidateBytes);

      // Change current position to every possible candidate
      count = 0;
      while (count < numWilds) {
        newCandidates[candidateNum] = wilds[count];

        // Score a possible new set of candidates
        candidatesScore = wildcards_scoreCandidates(newCandidates, numCandidates,
                                                    wilds, numWilds, defaultWildscore);

        // Check if best new candidates
        if (candidatesScore > bestScore) {
          bestScore = candidatesScore;
          memcpy(bestNewCandidates, newCandidates, candidateBytes);
        }

        count++;
      }

      candidateNum++;
    }

    // Update candidates if the best single replacement beats the current set
    if (bestScore > wildcards_scoreCandidates(candidates, numCandidates, wilds,
                                              numWilds, defaultWildscore)) {
      printf("New bestScore=%f\n", bestScore);
      memcpy(candidates, bestNewCandidates, candidateBytes);
      change = 1;
    }

    candidateNum = 0;
    while (candidateNum < numCandidates) {
      wildcards_printWildcard(candidates[candidateNum].code);
      candidateNum++;
    }
  }

  // Print out final set of clusters with default wild added in the last slot
  candidates[numCandidates] = defaultWild;
  numCandidates++;
  // Result is discarded; the call is made before the wildcards are written out
  wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                            defaultWildscore);

  wildcards_outputWildcards(wildcardsFilename);

  printf("%u sequences read.\n", readdb_numberOfSequences);
  fflush(stdout);

  free(candidates);
  free(newCandidates);
  free(bestNewCandidates);

  return 0;
}