Ejemplo n.º 1
0
// Build an empty memBlocks container with its first block already allocated
//
// entrySize:  size in bytes of a single entry
// blockSizes: number of entries held by each block
// Returns the newly allocated container.
struct memBlocks *memBlocks_initialize(size_t entrySize, int4 blockSizes) {
  struct memBlocks *container;
  void *firstBlock;

  // Allocate the container structure itself
  container = (struct memBlocks *)global_malloc(sizeof(struct memBlocks));

  container->blockSizes = blockSizes;
  container->entrySize = entrySize;
  container->numTotalEntries = 0;
  container->maxNumBlocks = 10;

  // Allocate storage for the entries of the first block
  firstBlock =
      (void *)global_malloc(container->entrySize * container->blockSizes);
  container->lastBlock = firstBlock;

  // Allocate the per-block tables: block addresses and per-block entry counts
  container->blocks =
      (void **)global_malloc(sizeof(void *) * container->maxNumBlocks);
  container->numEntries =
      (int4 *)global_malloc(sizeof(int4) * container->maxNumBlocks);

  // Register the first (still empty) block in slot 0
  container->blocks[0] = firstBlock;
  container->numEntries[0] = 0;
  container->numBlocks = 1;

  return container;
}
Ejemplo n.º 2
0
// Locate the BLOSUM or PAM scoring matrix file and record its full path in
// parameters_scoringMatrixPath.
//
// The directory is taken from the "Data=" entry of $HOME/.ncbirc when that
// file can be opened. Otherwise (including when HOME is not set) the relative
// directory "data" is used. The matrix name held in parameters_scoringMatrix
// is appended to the directory, and the resulting file must be openable for
// reading; if it is not, a diagnostic is printed and the process exits with -1.
void parameters_findScoringMatrix()
{
    FILE *matrixFile, *ncbircFile = NULL;
    char *homeDirectory, *ncbircFilename = NULL;
    char *matrixDirectory;
    size_t pathLength;

    // Get home user's directory; getenv returns NULL when HOME is not set
    homeDirectory = getenv("HOME");

    if (homeDirectory != NULL)
    {
        // Construct name of .ncbirc file ("/.ncbirc" plus terminator = 9 bytes)
        ncbircFilename = (char*)global_malloc(sizeof(char) * (strlen(homeDirectory) + 9));
        sprintf(ncbircFilename, "%s/.ncbirc", homeDirectory);

        // Check for existence of NCBI file
        ncbircFile = fopen(ncbircFilename, "r");
    }

    // Temporary buffer holding only the directory part of the path
    matrixDirectory = (char*)global_malloc(sizeof(char) * 1024);

    if (ncbircFile != NULL)
    {
        // Determine the location of the scoring matrix file by consulting the
        // .ncbirc file. The field width keeps the read inside the 1024 byte
        // buffer, and comparing against 1 also catches an EOF (-1) result.
        if (fscanf(ncbircFile, "[NCBI]\nData=%1023s", matrixDirectory) != 1)
        {
            fprintf(stderr, "Error reading scoring matrix path from %s file\n", ncbircFilename);
            fprintf(stderr, "BLAST requires the file .ncbirc in the user's home directory\n");
            fprintf(stderr, "containing the text:\n\n");
            fprintf(stderr, "[NCBI]\n");
            fprintf(stderr, "Data=/home/user/fsa-blast/data\n\n");
            fprintf(stderr, "Where the path specified contains scoring matrix files (ie. BLOSUM62)\n");
            exit(-1);
        }
        fclose(ncbircFile);
    }
    else
    {
        // If not available guess location of scoring matrix files
        strcpy(matrixDirectory, "data");
    }

    // Append scoring matrix filename. The result is written to a separate
    // buffer because sprintf must not read from the buffer it is writing to.
    // The buffer is never smaller than the 1024 bytes it used to have.
    pathLength = strlen(matrixDirectory) + strlen(parameters_scoringMatrix) + 2;
    if (pathLength < 1024)
    {
        pathLength = 1024;
    }
    parameters_scoringMatrixPath = (char*)global_malloc(sizeof(char) * pathLength);
    sprintf(parameters_scoringMatrixPath, "%s/%s", matrixDirectory,
            parameters_scoringMatrix);
    free(matrixDirectory);

    // Check that the matrix file exists and is readable
    matrixFile = fopen(parameters_scoringMatrixPath, "r");
    if (matrixFile == NULL)
    {
        fprintf(stderr, "%s\n", strerror(errno));
        fprintf(stderr, "Error reading matrix file %s\n", parameters_scoringMatrixPath);
        fprintf(stderr, "BLAST requires the file .ncbirc in the user's home directory\n");
        fprintf(stderr, "containing the text:\n\n");
        fprintf(stderr, "[NCBI]\n");
        fprintf(stderr, "Data=/home/user/fsa-blast/data\n\n");
        fprintf(stderr, "Where the path specified contains scoring matrix files (ie. BLOSUM62)\n");
        exit(-1);
    }
    fclose(matrixFile);

    // free(NULL) is a no-op, so this is safe when HOME was not set
    free(ncbircFilename);
}
Ejemplo n.º 3
0
// Initialize writing to formatted database
//
// filename:     base name of the database; the pointer itself is stored in
//               writedb_filename (the string is not copied)
// alphabetType: stored in writedb_alphabetType; selects the first byte written
//               to the sequence file
//
// Resets the writedb_* counters, builds the ".sequences", ".descriptions" and
// ".data" filenames, opens the sequence and descriptions files for writing and
// creates the sequence data store. Exits with -1 if either file cannot be
// opened.
void writedb_initialize(char *filename, uint4 alphabetType) {
    char *wildcardsFilename;

    writedb_filename = filename;
    writedb_alphabetType = alphabetType;
    writedb_maximumSequenceLength = 0;
    writedb_minimumSequenceLength = 0;
    writedb_numberOfLetters = 0;
    writedb_volume = 0;
    writedb_sequenceCount = 0;
    writedb_numberOfClusters = 0;

    // Construct sequence and description filenames
    writedb_sequenceFilename = (char *)global_malloc(strlen(filename) + 13);
    sprintf(writedb_sequenceFilename, "%s.sequences", filename);
    writedb_descriptionsFilename = (char *)global_malloc(strlen(filename) + 15);
    sprintf(writedb_descriptionsFilename, "%s.descriptions", filename);
    writedb_dataFilename = (char *)global_malloc(strlen(filename) + 8);
    sprintf(writedb_dataFilename, "%s.data", filename);
    wildcardsFilename = (char *)global_malloc(strlen(filename) + 12);
    sprintf(wildcardsFilename, "%s.wildcards", filename);

    // Delete the wildcards file if one exists: it is renamed onto the sequence
    // file, which is then truncated when opened with mode "w" below. The
    // result is deliberately ignored since the file may simply not exist.
    rename(wildcardsFilename, writedb_sequenceFilename);

    // The wildcards filename is only needed for the rename above
    free(wildcardsFilename);

    // Open sequence file for writing
    if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                writedb_sequenceFilename);
        exit(-1);
    }

    // Write sentinal/padding byte at start
    if (alphabetType == encoding_protein)
        fputc(encoding_sentinalCode, writedb_sequenceFile);
    else
        fputc(0, writedb_sequenceFile);

    // Open descriptions file for writing
    if ((writedb_descriptionsFile = fopen(writedb_descriptionsFilename, "w")) ==
            NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                writedb_descriptionsFilename);
        exit(-1);
    }

    writedb_volumeSize = 1;
    writedb_sequenceData = memBlocks_initialize(sizeof(struct sequenceData),
                           constants_initialSequenceData);
}
Ejemplo n.º 4
0
// Set up the hit matrix: allocate one "furthest" slot per diagonal and point
// every slot at startAddress.
//
// queryLength:          length of the query, recorded in hitMatrix_queryLength
// maximumSubjectLength: longest subject length; only used to size the protein
//                       variant of the matrix
// startAddress:         initial value stored in every diagonal slot
void hitMatrix_initialize(int4 queryLength, int4 maximumSubjectLength, unsigned char* startAddress)
{
    int4 diagonalCount, slot;
    unsigned char **slots;

    if (encoding_alphabetType == encoding_nucleotide)
    {
        // Nucleotide: smaller table addressed through a mask. Round the number
        // of diagonals up to a power of two so that (count - 1) is a bit mask.
        for (diagonalCount = 1;
             diagonalCount < queryLength + parameters_wordSize;
             diagonalCount <<= 1)
        {
        }
        hitMatrix_diagonalMask = diagonalCount - 1;

        slots = (unsigned char**)global_malloc(sizeof(unsigned char*) * diagonalCount);
        hitMatrix_furthest = slots;
    }
    else
    {
        // Protein: one slot for every diagonal that can occur during search
        diagonalCount = queryLength + maximumSubjectLength - parameters_wordSize + 1;
        slots = (unsigned char**)global_malloc(sizeof(unsigned char*) * diagonalCount);

        // Bias the public pointer so it can be indexed with offsets ranging
        // from -queryLength to subjectLength - wordSize
        hitMatrix_furthest = slots + queryLength;
    }

    // Record query length
    hitMatrix_queryLength = queryLength;

    // Reset every diagonal, from the smallest offset to the largest
    for (slot = 0; slot < diagonalCount; slot++)
    {
        slots[slot] = startAddress;
    }
}
Ejemplo n.º 5
0
void read_dbIdxBlock(FILE *read_dbIdxBlockFile1) {

    int4 totalNumEntries = 
                (proteinLookup_numWords + 1) * 
                proteinLookup_numBlocks;

    proteinLookup_db_b[0].subPositionOffset = 
        (uint4 *)global_malloc(sizeof(uint4) * totalNumEntries); 

    totalIndexSize += sizeof(uint4) * totalNumEntries; 

    if (fread(proteinLookup_db_b[0].subPositionOffset, sizeof(uint4),
            totalNumEntries,
            read_dbIdxBlockFile1) != totalNumEntries) {
        fprintf(stderr, "Error reading LoopkupHeader to dbIdxBlock file\n");
        exit(-1);
    }

    int ii;
    uint8 totalHits = 0;
    for(ii = 0; ii < proteinLookup_numBlocks; ii++)
    {
        proteinLookup_db_b[ii].subPositionOffset = 
            proteinLookup_db_b[0].subPositionOffset + (proteinLookup_numWords + 1) * ii;
        totalHits += proteinLookup_db_b[ii].subPositionOffset[proteinLookup_numWords]; 
    }

    //int totalHits = proteinLookup_db_b[blockNum].subPositionOffset[proteinLookup_numWords];

    ASSERT(totalNumPositions == totalHits);
    proteinLookup_db_b[0].subSequencePositions = (subPos_t *)global_malloc(sizeof(subPos_t) * totalHits);

    totalIndexSize += sizeof(subPos_t) * totalHits;

    if (fread(proteinLookup_db_b[0].subSequencePositions, sizeof(subPos_t), totalHits,
                read_dbIdxBlockFile1) != totalHits) {
        fprintf(stderr, "Error reading numSubPositions to dbIdxBlock file\n");
        exit(-1);
    }

    for(ii = 1; ii < proteinLookup_numBlocks; ii++)
    {
        proteinLookup_db_b[ii].subSequencePositions = 
            proteinLookup_db_b[ii - 1].subSequencePositions +
            proteinLookup_db_b[ii - 1].subPositionOffset[proteinLookup_numWords];
    }

}
Ejemplo n.º 6
0
// Entry point of the dust tool: reads every sequence of the FASTA file named
// on the command line, runs dust filtering on it in place and prints the
// description followed by the filtered sequence.
//
// Exits with -1 when no file name is given; returns 0 otherwise.
int main(int argc, char *argv[]) {
  char *filename, *sequence, *description;

  // User must provide FASTA format file at command line
  if (argc < 2) {
    fprintf(stderr, "Useage: dust <FASTA file>\n");
    exit(-1);
  }
  filename = argv[1];

  // Initialize encoding routines
  encoding_initialize(encoding_nucleotide);

  // Open FASTA file for reading
  readFasta_open(filename);

  // Read each sequence from the file
  while (readFasta_readSequence()) {
    // Get sequence just read
    sequence = readFasta_sequenceBuffer;
    description = readFasta_descriptionBuffer;

    // No separate copy of the sequence is made: the previous copy was never
    // read, was never freed, and was one byte too small for strcpy's
    // terminating NUL.

    // Perform dust filtering
    dust_dustSequence(sequence);

    // Print description and filtering sequence
    printf(">%s\n%s\n", description, sequence);
  }

  return 0;
}
Ejemplo n.º 7
0
// Prepare the empty protein database lookup structures.
//
// numCodes:   stored in wordLookupDFA_numCodes; base of the word count
// wordLength: stored in wordLookupDFA_wordLength; exponent of the word count
//
// Allocates one proteinLookup_db_blk per index block, one offset table of
// (numCodes^wordLength + 1) entries per block, and one initial lookup entry
// per (word, block) pair, all of them in their empty state.
void proteinLookup_db_initial(int4 numCodes, int wordLength) {
    struct initialWord_protein_db *lookupTable;
    uint4 idx, numEntries;

    totalNumPositions = 0;
    maxNumSeqBlk = 0;
    wordLookupDFA_numCodes = numCodes;
    wordLookupDFA_wordLength = wordLength;
    numEntries = ceil(pow(numCodes, wordLength));
    // NOTE(review): this local is never read; confirm whether a variable with
    // wider scope was meant to be assigned here.
    int4 proteinLookup_wordLength = wordLength;

    uint4 numBlocks = ceil((float)readdb_numVolumeLetters / dbIdx_block_size) + 20;
    proteinLookup_numBlocks = numBlocks;

    // Array of index block descriptors
    proteinLookup_db_b = (struct proteinLookup_db_blk *)global_malloc(
            sizeof(struct proteinLookup_db_blk) * numBlocks);

    // Initial lookup table: numEntries words for each of the blocks
    lookupTable = (struct initialWord_protein_db *)global_malloc(
            sizeof(struct initialWord_protein_db) * numEntries * numBlocks);
    proteinLookup_db = lookupTable;

    proteinLookup_numWords = numEntries;

    // Give every block its own offset table and clear its statistics
    for (idx = 0; idx < numBlocks; idx++) {
        struct proteinLookup_db_blk *block = proteinLookup_db_b + idx;

        block->subPositionOffset =
            (uint4 *)global_malloc(sizeof(uint4) * (numEntries + 1));
        block->dbIdxblk_longestSeq = 0;
        block->numSeqBlk = 0;
        block->subSequencePositions = NULL;
    }

    // Start every (word, block) entry with an empty position list
    for (idx = 0; idx < numEntries * numBlocks; idx++) {
        lookupTable[idx].numSubPositions = 0;
        lookupTable[idx].allocSubPositions = 0;
        lookupTable[idx].subSequencePositions = NULL;
    }
}
Ejemplo n.º 8
0
// Prepare the storage used while building query position lists
//
// maxNumLists: number of list slots to allocate, both for the query position
//              lists and for the initial query position lists
void qPosList_initialize(int4 maxNumLists)
{
    qPosList_qPosLists = (struct memSingleBlock*)global_malloc(sizeof(struct memSingleBlock) * maxNumLists);
    qPosList_numQPosLists = 0;

    // Initialize each list in place; qPosList_maxQPosLists counts how many
    // lists have been initialized so far
    for (qPosList_maxQPosLists = 0; qPosList_maxQPosLists < maxNumLists; qPosList_maxQPosLists++)
    {
        memSingleBlock_initializeExisting(qPosList_qPosLists + qPosList_maxQPosLists,
                                          sizeof(struct queryPosition), 10);
    }

    qPosList_initialQPosLists = (struct initialQPosList*)global_malloc(sizeof(struct initialQPosList) * maxNumLists);
    qPosList_numInitialQPosLists = 0;
}
Ejemplo n.º 9
0
// Read the auxiliary lookup file "<name>.sequence<volume>.dbLookupAux".
//
// read_dbLookupFilename: base name of the lookup files; the current
//                        readdb_volume is used to build the actual file name
//
// Restores the lookup parameters stored in the file header, loads the array of
// index block descriptors into proteinLookup_db_b and allocates (without
// filling) the proteinLookup_db table. The size of the descriptor array is
// added to totalIndexSize. Any failure prints a message and exits with -1.
void read_dbLookupAux(char *read_dbLookupFilename) {
    char *read_dbLookupAuxFilename;
    FILE *read_dbLookupAuxFile;

    read_dbLookupAuxFilename =
        (char *)global_malloc(strlen(read_dbLookupFilename) + 40);
    sprintf(read_dbLookupAuxFilename, "%s.sequence%d.dbLookupAux", read_dbLookupFilename, readdb_volume);

    // Open dbLookup aux file for reading
    if ((read_dbLookupAuxFile = fopen(read_dbLookupAuxFilename, "r")) == NULL) {
        fprintf(stderr, "Error opening file %s for reading\n",
                read_dbLookupAuxFilename);
        exit(-1);
    }

    struct dbLookupAux dbLookupAux;

    // A failed read is an error, so report a failure status like the other
    // error paths do
    if(fread(&dbLookupAux, sizeof(struct dbLookupAux), 1, read_dbLookupAuxFile) != 1)
    {
        fprintf(stderr, "Error reading read_dbLookupAuxFile\n");
        exit(-1);
    }

    proteinLookup_numBlocks = dbLookupAux.proteinLookup_numBlocks;
    proteinLookup_numWords = dbLookupAux.proteinLookup_numWords;
    wordLookupDFA_wordLength = dbLookupAux.proteinLookup_wordLength;
    wordLookupDFA_numCodes = dbLookupAux.proteinLookup_numCodes;
    dbIdx_block_size = dbLookupAux.dbIdx_block_size;
    totalNumPositions = dbLookupAux.totalNumPositions;
    maxNumSeqBlk = dbLookupAux.maxNumSeqBlk;

    proteinLookup_db_b = (struct proteinLookup_db_blk *)malloc(
            sizeof(struct proteinLookup_db_blk) * proteinLookup_numBlocks);

    // malloc may legitimately return NULL for a zero-sized request
    if (proteinLookup_db_b == NULL && proteinLookup_numBlocks != 0)
    {
        fprintf(stderr, "Error allocating memory for dbLookup index blocks\n");
        exit(-1);
    }

    totalIndexSize += sizeof(struct proteinLookup_db_blk) * proteinLookup_numBlocks;

    if(fread(proteinLookup_db_b, sizeof(struct proteinLookup_db_blk),
            proteinLookup_numBlocks, read_dbLookupAuxFile) != proteinLookup_numBlocks)
    {
        fprintf(stderr, "Error reading read_dbLookupAuxFile\n");
        exit(-1);
    }

    proteinLookup_db = (struct initialWord_protein_db *)malloc(
            sizeof(struct initialWord_protein_db) * proteinLookup_numWords *
            proteinLookup_numBlocks);

    if (proteinLookup_db == NULL && proteinLookup_numWords != 0 &&
            proteinLookup_numBlocks != 0)
    {
        fprintf(stderr, "Error allocating memory for dbLookup table\n");
        exit(-1);
    }

    free(read_dbLookupAuxFilename);
    fclose(read_dbLookupAuxFile);
}
Ejemplo n.º 10
0
// Write the auxiliary lookup file "<name>.sequence<volume>.dbLookupAux".
//
// write_dbLookupFilename: base name of the lookup files; the current
//                         readdb_volume is used to build the actual file name
//
// The file holds a struct dbLookupAux header filled from the current lookup
// parameters, followed by the proteinLookup_db_b array of index block
// descriptors. A summary line is printed to stderr. Any failure to open, write
// or close the file prints a message and exits with -1.
void write_dbLookupAux(char *write_dbLookupFilename) {
    char *write_dbLookupAuxFilename;
    FILE *write_dbLookupAuxFile;

    write_dbLookupAuxFilename =
        (char *)global_malloc(strlen(write_dbLookupFilename) + 40);

    sprintf(write_dbLookupAuxFilename, "%s.sequence%d.dbLookupAux", write_dbLookupFilename, readdb_volume);

    // Open dbLookup aux file for writing
    if ((write_dbLookupAuxFile = fopen(write_dbLookupAuxFilename, "w")) == NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                write_dbLookupAuxFilename);
        exit(-1);
    }

    // Header: snapshot of the current lookup parameters
    struct dbLookupAux dbLookupAux;
    dbLookupAux.proteinLookup_numBlocks = proteinLookup_numBlocks;
    dbLookupAux.proteinLookup_numWords = proteinLookup_numWords;
    dbLookupAux.proteinLookup_wordLength = wordLookupDFA_wordLength;
    dbLookupAux.proteinLookup_numCodes = wordLookupDFA_numCodes;
    dbLookupAux.dbIdx_block_size = dbIdx_block_size;
    dbLookupAux.totalNumPositions = totalNumPositions;
    dbLookupAux.maxNumSeqBlk = maxNumSeqBlk;

    if (fwrite(&dbLookupAux, sizeof(struct dbLookupAux), 1,
                write_dbLookupAuxFile) != 1) {
        fprintf(stderr, "Error writing data to dbLookup aux file %s\n",
                write_dbLookupAuxFilename);
        exit(-1);
    }

    // Body: the index block descriptors
    if (fwrite(proteinLookup_db_b, sizeof(struct proteinLookup_db_blk),
                proteinLookup_numBlocks,
                write_dbLookupAuxFile) != proteinLookup_numBlocks) {
        fprintf(stderr, "Error writing data to dbLookup aux file %s\n",
                write_dbLookupAuxFilename);
        exit(-1);
    }


#if 1
    fprintf(stderr,
            "dbIdxBlockSize:%d(KB) "
            "maxNumSeqPerBlk:%d longestSeqLength:%d maxBinSize: %d\n",
            dbIdx_block_size / 1024, 
            maxNumSeqBlk, 
            readdb_longestSequenceLength,
            maxBinSize);
#endif

    // fclose flushes buffered data, so a failure here means the file on disk
    // may be incomplete
    if (fclose(write_dbLookupAuxFile) != 0) {
        fprintf(stderr, "Error writing data to dbLookup aux file %s\n",
                write_dbLookupAuxFilename);
        exit(-1);
    }

    free(write_dbLookupAuxFilename);
}
Ejemplo n.º 11
0
/// Allocate global blocks from the global heap.
///
/// @param        n The number of blocks to allocate.
/// @param    bsize The size of each block, in bytes.
/// @param boundary The alignment passed to global_memalign; 0 selects a plain
///                 global_malloc.
/// @param     attr Unused by this implementation.
///
/// @returns The global address for the allocation, or HPX_NULL if the
///          allocation failed or n * bsize does not fit in a size_t.
static hpx_addr_t _pgas_gas_alloc_local(size_t n, uint32_t bsize,
                                        uint32_t boundary, uint32_t attr) {
  (void)attr;

  // Reject requests whose total size would wrap around, which would otherwise
  // yield an allocation smaller than the caller expects.
  if (bsize != 0 && n > (size_t)-1 / bsize) {
    return HPX_NULL;
  }

  size_t bytes = n * bsize;
  void *lva = NULL;
  if (boundary) {
    lva = global_memalign(boundary, bytes);
  } else {
    lva = global_malloc(bytes);
  }
  return (lva) ? pgas_lva_to_gpa(lva) : HPX_NULL;
}
Ejemplo n.º 12
0
// Allocate the neighbour lookup table, one entry per word (as counted by
// proteinLookup_numWords), with every entry starting out without neighbours.
void neighbourLookup_init() {
    int4 wordCount = proteinLookup_numWords;
    int4 word;

    neighborLookup = (struct initialWord_neighborLookup *)global_malloc(
            sizeof(struct initialWord_neighborLookup) * wordCount);

    for (word = 0; word < wordCount; word++) {
        neighborLookup[word].numNeighbours = 0;
        neighborLookup[word].neighbours = NULL;
    }
}
Ejemplo n.º 13
0
// Fill the neighbour lookup table for every word that occurs in the query.
//
// PSSMatrix:   supplies the query codes and the query length
// scoreMatrix: passed through to wordLookupSM_getNeighbours
// wordLength:  number of codes per word
//
// For each query position whose codeword has no neighbours recorded yet, the
// neighbours are computed and their codewords are stored in a freshly
// allocated array in neighborLookup.
void neighbourLookup_build(struct PSSMatrix PSSMatrix,
        struct scoreMatrix scoreMatrix, int4 wordLength) {
    int4 position, found, word, idx;
    int4 scratchSize = proteinLookup_numWords;
    // Scratch buffer receiving the neighbours of one word at a time
    struct neighbour *scratch =
        (struct neighbour *)global_malloc(sizeof(struct neighbour) * scratchSize);

    for (position = 0; position < PSSMatrix.length - wordLength + 1; position++) {
        word = getCodeword(PSSMatrix.queryCodes + position, wordLength);

        // Skip words that already have neighbours recorded
        if (neighborLookup[word].numNeighbours != 0) {
            continue;
        }

        found = 0;
        wordLookupSM_getNeighbours(PSSMatrix.queryCodes, scoreMatrix,
                position, &found, scratch);

        neighborLookup[word].numNeighbours = found;
        neighborLookup[word].neighbours =
            (int4 *)global_malloc(sizeof(int4) * found);

        // Keep only the codeword of each neighbour
        for (idx = 0; idx < found; idx++) {
            neighborLookup[word].neighbours[idx] = scratch[idx].codeword;
        }
    }

    free(scratch);
}
Ejemplo n.º 14
0
// Given the results of dynamic programming (a matrix of trace codes and a
// highest scoring position in
// the matrix) for finding the START of the alignment, performs the simple
// operation of finding the path
// from the highest scoring point back to the seed
struct trace gappedExtension_traceBeforeSeed(struct dpResults beforeDpResults,
                                             struct coordinate seed) {
  int4 queryPosition, subjectPosition;
  unsigned char **traceback;
  unsigned char traceCode;
  unsigned char state = 0;
  struct trace trace;
  unsigned char *traceCodes;
  uint4 traceCount = 0;

  traceback = beforeDpResults.traceback;
  trace.queryStart = queryPosition = beforeDpResults.best.queryOffset;
  trace.subjectStart = subjectPosition = beforeDpResults.best.subjectOffset;

  // Declare memory for tracecodes; for maximum possible number of codes that
  // could
  // be generated by this trace
  traceCodes = (unsigned char *)global_malloc(
      sizeof(unsigned char) * (seed.queryOffset - queryPosition +
                               seed.subjectOffset - subjectPosition));

  while (queryPosition < seed.queryOffset &&
         subjectPosition < seed.subjectOffset) {
    // Construct the trace
    traceCodes[traceCount] = state;
    traceCount++;

    //        printf("(%p)", traceback[queryPosition]);
    //        printf("(%d,%d)", queryPosition, subjectPosition); fflush(stdout);
    traceCode = traceback[queryPosition][subjectPosition];

    // If we got to current cell through a MATCH
    if (state == 0) {
      // Move to cell before this one
      queryPosition++;
      subjectPosition++;

      // We are only interested in lowest 2 bits of tracecode
      traceCode = traceCode << 6;
      traceCode = traceCode >> 6;

      // Tracecode determines if we matched or inserted here
      state = traceCode;
    }
    // If we got to current cell through an Ix
    else if (state == 1) {
Ejemplo n.º 15
0
// Produce a NUL-terminated text copy of a database sequence.
//
// seqId: index into readdb_sequenceData
//
// Every code is translated with encoding_getLetter; a resulting 'U' is
// reported on stderr and replaced by 'X'. Returns a newly allocated string.
char* getSequence(uint4 seqId)
{
    char* letters;
    char letter;
    int pos;

    // One extra byte for the terminator
    letters = (char*)global_malloc(sizeof(char) * (readdb_sequenceData[seqId].sequenceLength + 1));

    for (pos = 0; pos < readdb_sequenceData[seqId].sequenceLength; pos++)
    {
        letter = encoding_getLetter(readdb_sequenceData[seqId].sequence[pos]);
        if (letter == 'U')
        {
            fprintf(stderr, "Selenocysteine (U) at position %d replaced by X\n", pos);
            letter = 'X';
        }
        letters[pos] = letter;
    }
    letters[pos] = '\0';

    return letters;
}
Ejemplo n.º 16
0
// Grow an unpacked region towards the start of the subject when the requested
// position lies before the region's current start.
//
// position:     subject offset that must be covered
// unpackRegion: region to extend; its unpackedSubject and startOffset are
//               replaced when an extension takes place
void unpack_extendRegionStart(int4 position,
                              struct unpackRegion *unpackRegion) {
  unsigned char *grown;
  int4 grownStart, grownEnd;

  // Nothing to do when the position is already inside the region
  if (position >= unpackRegion->startOffset)
    return;

  // Move the start back by the extension constant, clamped at zero
  grownStart = unpackRegion->startOffset - constants_unpackRegionExtend;
  if (grownStart < 0)
    grownStart = 0;
  grownEnd = unpackRegion->endOffset;

  // Round the start down to a multiple of 4
  grownStart = (grownStart / 4) * 4;

  // Allocate the larger buffer and bias the pointer so that it is indexed by
  // subject offset rather than by offset within the buffer
  grown = (unsigned char *)global_malloc(sizeof(char) *
                                         (grownEnd - grownStart));
  grown -= grownStart;

  // Carry over the part that was already unpacked
  memcpy(grown + unpackRegion->startOffset,
         unpackRegion->unpackedSubject + unpackRegion->startOffset,
         sizeof(char) * (unpackRegion->endOffset - unpackRegion->startOffset));

  // Undo the bias on the old pointer to recover the allocated address, then
  // release it
  unpackRegion->unpackedSubject += unpackRegion->startOffset;
  free(unpackRegion->unpackedSubject);

  // Unpack the newly covered prefix of the region
  encoding_byteUnpackRegion(grown + grownStart,
                            unpackRegion->subject + (grownStart / 4),
                            unpackRegion->startOffset - grownStart);

  unpackRegion->unpackedSubject = grown;
  unpackRegion->startOffset = grownStart;
}
Ejemplo n.º 17
0
// Reserve a run of consecutive entries and return the address of the first
//
// memBlocks:     container to take the entries from
// numNewEntries: number of consecutive entries wanted
//
// When the run does not fit in the remaining space of the latest block, a new
// block is started and the run is placed at its beginning.
// NOTE(review): a new block always holds blockSizes entries; presumably
// callers never ask for more than that in one call - confirm.
void *memBlocks_newEntries(struct memBlocks *memBlocks, uint4 numNewEntries) {
  int4 *lastCount;
  void *firstNewEntry;

  // Does the run overflow the latest block?
  if (memBlocks->numEntries[memBlocks->numBlocks - 1] + numNewEntries >
      memBlocks->blockSizes) {
    // Start a fresh block
    memBlocks->lastBlock =
        (void *)global_malloc(memBlocks->entrySize * memBlocks->blockSizes);

    // Double the per-block tables when they are full
    if (memBlocks->numBlocks >= memBlocks->maxNumBlocks) {
      memBlocks->maxNumBlocks *= 2;
      memBlocks->blocks = (void **)global_realloc(
          memBlocks->blocks, sizeof(void *) * memBlocks->maxNumBlocks);
      memBlocks->numEntries = (int4 *)global_realloc(
          memBlocks->numEntries, sizeof(int4) * memBlocks->maxNumBlocks);
    }

    // Register the new, empty block
    memBlocks->blocks[memBlocks->numBlocks] = memBlocks->lastBlock;
    memBlocks->numEntries[memBlocks->numBlocks] = 0;
    memBlocks->numBlocks++;
  }

  // The run starts at the first unused slot of the latest block
  lastCount = memBlocks->numEntries + (memBlocks->numBlocks - 1);
  firstNewEntry =
      ((char *)(memBlocks->lastBlock)) + *lastCount * memBlocks->entrySize;

  *lastCount += numNewEntries;
  memBlocks->numTotalEntries += numNewEntries;

  return firstNewEntry;
}
Ejemplo n.º 18
0
// Start building a new index covering codewords fromCodeword (inclusive) to
// toCodeword (exclusive)
void index_initializeBuild(uint4 fromCodeword, uint4 toCodeword)
{
    uint4 codeword;
    struct wordList* word;

    // Allocate one list per codeword in the range, then bias the pointer so
    // that the table is indexed directly by codeword
    index_words = (struct wordList*)global_malloc(sizeof(struct wordList) * (toCodeword - fromCodeword));
    index_words -= fromCodeword;

    // Every word starts with an empty list of occurrences
    for (codeword = fromCodeword; codeword < toCodeword; codeword++)
    {
        word = index_words + codeword;
        word->offsets = NULL;
        word->length = 0;
        word->allocated = 0;
        word->lastOffset = 0;
        word->lastSequenceNumber = 0;
    }

    index_subjectNumber = 0;
}
Ejemplo n.º 19
0
// Unpack entire or sections of a subject sequence before gapped alignment
//
// PSSMatrix: passed through to unpack_getRegions
// alignment: alignment whose subject is to be unpacked; on return its
//            unpackRegions / numUnpackRegions fields describe the regions
//
// For ssearch mode or a protein alphabet a single region is created that
// simply aliases alignment->subject. Otherwise the regions reported by
// unpack_getRegions are sorted, each one is unpacked into its own buffer, and
// the wildcard edits stored after the packed sequence are written into the
// unpacked buffers.
void unpack_unpackSubject(struct PSSMatrix PSSMatrix,
                          struct alignment *alignment) {
  unsigned char *subject, *unpackedSubject, wildcard, *edits, *endEdits;
  uint4 wildcardPosition;
  struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion,
                      *unpackRegion;
  int4 regionStart, regionEnd, numRegions;

  // No need to unpack a protein subject, or already unpacked nucleotide subject
  if (parameters_ssearch || encoding_alphabetType == encoding_protein) {
    // Just create a single region covering the entire sequence
    firstRegion = memBlocks_newEntry(unpack_unpackRegions);
    firstRegion->startOffset = 0;
    firstRegion->endOffset = alignment->subjectLength;
    firstRegion->subject = alignment->subject;
    // The "unpacked" view is the subject itself; no new buffer is allocated
    firstRegion->unpackedSubject = alignment->subject;
    firstRegion->subjectLength = alignment->subjectLength;
    alignment->unpackRegions = firstRegion;
    alignment->numUnpackRegions = 1;
    return;
  }

  // Get the subject regions for this alignment
  numRegions = unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions);
  // NOTE(review): assumes the numRegions entries just added are contiguous and
  // end at the last entry of unpack_unpackRegions - confirm against
  // unpack_getRegions
  lastRegion = memBlocks_getLastEntry(unpack_unpackRegions);
  // Turn lastRegion into a one-past-the-end pointer
  lastRegion++;
  firstRegion = lastRegion - numRegions;

  // Sort the regions in order of start position
  qsort(firstRegion, lastRegion - firstRegion, sizeof(struct unpackRegion),
        unpack_compareUnpackRegions);

  // Unpack each region
  currentRegion = firstRegion;
  while (currentRegion < lastRegion) {
    regionEnd = currentRegion->endOffset;
    regionStart = currentRegion->startOffset;

#ifdef VERBOSE
    if (parameters_verboseDloc == alignment->descriptionLocation) {
      printf("Unpack subject region %d to %d (length=%d)\n", regionStart,
             regionEnd, alignment->subjectLength);
      fflush(stdout);
    }
#endif

    // Get the subject region to be unpacked
    if (alignment->unpackRegions == NULL) {
      // First unpack for this alignment: read from the packed subject
      subject = alignment->subject;
    } else {
      // Regions already exist: reuse the packed subject of the region selected
      // for this start position
      unpackRegion = unpack_selectRegion(
          alignment->unpackRegions, alignment->numUnpackRegions, regionStart);
      subject = unpackRegion->subject;
    }

    // Declare memory for the region
    unpackedSubject = (unsigned char *)global_malloc(sizeof(char) *
                                                     (regionEnd - regionStart));

    // Unpack the region of interest
    // (the packed source is addressed at regionStart / 4)
    encoding_byteUnpackRegion(unpackedSubject, subject + (regionStart / 4),
                              regionEnd - regionStart);
    // Bias the pointer so the buffer is indexed by subject offset, as done
    // with wildcardPosition below
    unpackedSubject -= regionStart;
    currentRegion->unpackedSubject = unpackedSubject;

    currentRegion->subject = subject;
    currentRegion->subjectLength = alignment->subjectLength;

    // Running total of unpacked characters
    blast_totalUnpacked += (regionEnd - regionStart);

    currentRegion++;
  }

  currentRegion = firstRegion;

  // Get wildcard edits for the sequence
  edits = alignment->edits;
  // The edit data is encodedLength - ceil(subjectLength / 4) bytes long
  endEdits = alignment->edits + alignment->encodedLength -
             ((alignment->subjectLength + 3) / 4);

  // If there are edits
  if (edits < endEdits) {
    // Read first wildcard
    wildcard = *edits;
    edits++;

    // Read its position
    // NOTE(review): edits is passed by value, so vbyte_getVbyte is presumably
    // a macro that advances it past the encoded position - confirm
    vbyte_getVbyte(edits, &wildcardPosition);

    // For each region in order of position in the subject
    while (currentRegion < lastRegion) {
      // Skip past edits that are before current region
      while (edits < endEdits &&
             wildcardPosition < currentRegion->startOffset) {
        // Read wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Process edits that are in the current region
      // NOTE(review): the wildcard/position pair last read is only applied
      // while edits < endEdits still holds; verify that the final edit of the
      // stream is not dropped
      while (edits < endEdits && wildcardPosition < currentRegion->endOffset) {
        // Insert wildcard into sequence
        currentRegion->unpackedSubject[wildcardPosition] = wildcard;

        // Read next wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Advance to the next region
      currentRegion++;
    }
  }

  // Publish the new regions on the alignment
  alignment->unpackRegions = firstRegion;
  alignment->numUnpackRegions = lastRegion - firstRegion;
}
Ejemplo n.º 20
0
// Perform semi-gapped dynamic programming to explore possible start points of
// alignments that end at the given seed, and find the best-scoring one.
//
// The matrix is walked backwards from the cell just before the seed
// (query seed.queryOffset - 1, subject seed.subjectOffset - 1): each row is
// scanned right-to-left along the subject, and rows are visited in order of
// decreasing query position.
//
// Parameters:
//   subject   - subject sequence; each byte is used directly as the index into
//               a PSSM column (matrixColumn[*subjectPosition]).
//   PSSMatrix - PSSMatrix.matrix[q] is the score column for query position q.
//   seed      - query/subject offsets at which the alignment must end.
//   dropoff   - a cell is pruned once bestScore exceeds its score by more than
//               this amount.
//
// Returns a dpResults whose `best` holds the query/subject offsets of the
// highest-scoring cell found, `bestScore` its score (never below 0, the initial
// value), and `traceback` always NULL.
//
// Side effects: uses the file-level scratch rows semiGappedScoring_bestRow and
// semiGappedScoring_insertQrow (indexed by subject offset), freeing and
// reallocating them when seed.subjectOffset >= semiGappedScoring_rowSizes.
//
// "Semi-gapped" restriction as implemented here: insertS (Iy) is only
// considered on every parameters_semiGappedExtensionN-th row (when queryCount
// reaches 0), and insertQ (Ix) only in columns where subjectCount is 0.
//
// NOTE(review): unlike nuGappedScoring_dpBeforeSeed there is no early return
// for seed.queryOffset == 0 or seed.subjectOffset == 0; in that case the
// starting pointers are one element before `subject` / PSSMatrix.matrix.
// Presumably callers never pass 0 or the buffers are sentinel-flanked - confirm.
struct dpResults semiGappedScoring_dpBeforeSeed(unsigned char *subject,
                                                struct PSSMatrix PSSMatrix,
                                                struct coordinate seed,
                                                int4 dropoff) {
  int2 **queryPosition, **bestQueryPosition;
  int2 *matrixColumn;
  unsigned char *rowDropoff, *columnDropoff;
  unsigned char *subjectPosition, *bestSubjectPosition, *startSubjectPosition;
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 subjectDistance;
  int4 oldBest, match, previousOldBest;
  unsigned char rightOfDropoff;
  int4 queryCount, subjectCount;
  struct dpResults dpResults;

  // Declare processing rows for storing match, insert-subject and insert-query
  // values
  // If current malloced rows aren't big enough
  if (seed.subjectOffset >= semiGappedScoring_rowSizes) {
    // Free existing rows (their previous contents are not needed)
    free(semiGappedScoring_bestRow);
    free(semiGappedScoring_insertQrow);
    // Set size to double current needed length
    semiGappedScoring_rowSizes = (seed.subjectOffset) * 2;
    // Malloc new rows
    semiGappedScoring_bestRow =
        (int4 *)global_malloc(sizeof(int4) * semiGappedScoring_rowSizes);
    semiGappedScoring_insertQrow =
        (int4 *)global_malloc(sizeof(int4) * semiGappedScoring_rowSizes);
  }

  // Start at the cell immediately before the seed in both sequences
  bestSubjectPosition = subjectPosition = startSubjectPosition =
      subject + seed.subjectOffset - 1;
  bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1;

  // Initialize row pointers; scratch rows are indexed by subject offset
  rowOffset = (subjectPosition - subject);
  //    printf("rowOffset=%d Dloc=%d\n", rowOffset, dloc); fflush(stdout);
  bestRow = semiGappedScoring_bestRow + rowOffset;
  insertQrow = semiGappedScoring_insertQrow + rowOffset;

  // Set initial row dropoff and column dropoff: the band of subject positions
  // processed in a row is [rowDropoff, columnDropoff)
  rowDropoff = subject;
  columnDropoff = subject + seed.subjectOffset;

  // Using first column of query matrix
  matrixColumn = *queryPosition;

  // -----FIRST ROW-----

  // -----FIRST CELL-----
  // Set M value for bottom-right cell
  match = matrixColumn[*subjectPosition];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_semiGappedOpenGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Update best start cell data
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  subjectDistance = 0;
  subjectPosition--;
  bestRow--;
  insertQrow--;

  // ----- REMAINING CELLS -----
  // For each remaining column in the bottom row, scanning from right-to-left
  while (subjectPosition >= subject) {
    // Set value for M: the substitution score minus the cost of a gap that
    // spans the subjectDistance + 1 columns skipped to the right
    match = matrixColumn[*subjectPosition] - parameters_semiGappedOpenGap -
            subjectDistance * parameters_semiGappedExtendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS = maximum(match - parameters_semiGappedOpenGap,
                        insertS - parameters_semiGappedExtendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_semiGappedExtendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Update best start cell data
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = subjectPosition;
      // And stop processing row
      break;
    }

    subjectPosition--;
    bestRow--;
    insertQrow--;
    subjectDistance++;
  }

  //    if (dloc == 746829265)
  //    print(semiGappedScoring_bestRow, subject, rowDropoff, columnDropoff);

  // Start queryCount at N. Only allow insertS for every Nth row when queryCount
  // reaches 0
  queryCount = parameters_semiGappedExtensionN;

  // -----REMAINING ROWS-----
  // Continue while query positions remain and the band is non-empty
  while (queryPosition > PSSMatrix.matrix && rowDropoff < columnDropoff) {
    queryPosition--;
    queryCount--;
    subjectPosition = columnDropoff - 1;

    // Determine subjectCount for initial subjectPosition. Is used to only allow
    // insertQ when subjectOffset % parameters_semiGappedExtensionN == 0
    subjectCount = (int4)(startSubjectPosition - subjectPosition) %
                   parameters_semiGappedExtensionN;
    if (subjectCount)
      subjectCount = parameters_semiGappedExtensionN - subjectCount;

    // Reset row pointers to start of rows
    rowOffset = (subjectPosition - subject);
    bestRow = semiGappedScoring_bestRow + rowOffset;
    insertQrow = semiGappedScoring_insertQrow + rowOffset;

    // Using next column of query matrix
    matrixColumn = *queryPosition;

    // ************ All rows we are not allowing insertS
    if (queryCount) {
      // NOTE(review): rightOfDropoff is assigned in this branch but never read
      // before the next row reassigns it; only the insertS branch below uses it.

      // ** No insertQ allowed this column, this cell will only get a DUMMY
      // score
      if (subjectCount) {
        // Keep the previous row's value: it is the diagonal predecessor of the
        // next cell to the left
        previousOldBest = *bestRow;
        *bestRow = constants_gappedExtensionDummyValue;

        // Score at this cell is below dropoff
        columnDropoff = subjectPosition;
        rightOfDropoff = 1;
      }
      // ** We are allowing insertQ this column
      else {
        // -----FAR RIGHT CELL-----
        // Record some old values
        previousOldBest = *bestRow;

        // Set Ix value
        *bestRow = *insertQrow;
        *insertQrow -= parameters_semiGappedExtendGap;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = subjectPosition;
          rightOfDropoff = 1;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }

        // Reset subjectCount
        subjectCount = parameters_semiGappedExtensionN;
      }

      subjectPosition--;
      bestRow--;
      insertQrow--;
      subjectCount--;

      // -----CELLS RIGHT OF ROW DROPOFF-----
      while (subjectPosition >= rowDropoff) {
        // ** We are not allowing insertQ this column
        if (subjectCount) {
          // Calculate new M value, which is also the best
          oldBest = *bestRow;
          match = *bestRow = matrixColumn[*subjectPosition] + previousOldBest;
          previousOldBest = oldBest;

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        }
        // We are allowing insertQ this column
        else {
          // Calculate new M value
          oldBest = *bestRow;
          match = matrixColumn[*subjectPosition] + previousOldBest;
          previousOldBest = oldBest;

          // Determine the best of M and Ix
          if (match > *insertQrow) {
            *bestRow = match;

            // Calculate new Ix
            *insertQrow = maximum(match - parameters_semiGappedOpenGap,
                                  *insertQrow - parameters_semiGappedExtendGap);
          } else {
            *bestRow = *insertQrow;

            // Since M <= Ix, new Ix must derive from Ix
            *insertQrow -= parameters_semiGappedExtendGap;
          }

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }

          // Reset subjectCount
          subjectCount = parameters_semiGappedExtensionN;
        }

        subjectPosition--;
        bestRow--;
        insertQrow--;
        subjectCount--;
      }

      // -----SINGLE CELL LEFT OF ROW DROPOFF -----
      // Only extend one cell further left if its diagonal predecessor is
      // within dropoff of the best score and we are still inside the subject
      if (!(bestScore > previousOldBest + dropoff) &&
          (subjectPosition >= subject)) {
        // Set value for best
        *bestRow = match = previousOldBest + matrixColumn[*subjectPosition];

        // Set DUMMY values for Ix
        *insertQrow = constants_gappedExtensionDummyValue;

        if (match + dropoff >= bestScore) {
          // Record dropoff position
          rowDropoff = subjectPosition;
        }
      }
    }

    // ************ Every Nth row we allow insertS
    else {
      // -----FAR RIGHT CELL-----

      // ** No insertQ allowed this column, this cell will only get a DUMMY
      // score
      if (subjectCount) {
        previousOldBest = *bestRow;
        *bestRow = constants_gappedExtensionDummyValue;

        // Score at this cell is below dropoff
        columnDropoff = subjectPosition;
        rightOfDropoff = 1;
      }
      // ** We are allowing insertQ this column
      else {
        // Record some old values
        previousOldBest = *bestRow;

        // Set Ix value
        *bestRow = *insertQrow;
        *insertQrow -= parameters_semiGappedExtendGap;

        // Set DUMMY value for Iy, which should never be used
        insertS = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = subjectPosition;
          rightOfDropoff = 1;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }

        // Reset subjectCount
        subjectCount = parameters_semiGappedExtensionN;
      }

      subjectPosition--;
      bestRow--;
      insertQrow--;
      subjectCount--;

      // -----CELLS RIGHT OF ROW DROPOFF-----
      while (subjectPosition >= rowDropoff) {
        // ** We are not allowing insertQ this column
        if (subjectCount) {
          // Remember old M value (for cell below this one)
          oldBest = *bestRow;
          match = matrixColumn[*subjectPosition] + previousOldBest;
          previousOldBest = oldBest;

          // Determine the best of M and Iy
          if (match > insertS) {
            *bestRow = match;

            // Calculate new Iy
            insertS = maximum(match - parameters_semiGappedOpenGap,
                              insertS - parameters_semiGappedExtendGap);
          } else {
            *bestRow = insertS;

            // Since M <= Iy, new Iy must derive from Iy
            insertS -= parameters_semiGappedExtendGap;
          }

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }

          // If score at current cell (and cells to its right) are below dropoff
          if (rightOfDropoff) {
            if (bestScore > *bestRow + dropoff) {
              // Record dropoff position
              columnDropoff = subjectPosition;
            } else {
              // We are left of the column dropoff for this row
              rightOfDropoff = 0;
            }
          }
        }
        // ** We are allowing insertQ this column
        else {
          // Remember old M value (for cell below this one)
          oldBest = *bestRow;
          match = matrixColumn[*subjectPosition] + previousOldBest;
          previousOldBest = oldBest;

          // Determine the best of M, Ix and Iy
          if (match > insertS) {
            if (match > *insertQrow) {
              // Match is largest
              *bestRow = match;

              // Calculate new Ix
              *insertQrow =
                  maximum(match - parameters_semiGappedOpenGap,
                          *insertQrow - parameters_semiGappedExtendGap);

              // Calculate new Iy
              insertS = maximum(match - parameters_semiGappedOpenGap,
                                insertS - parameters_semiGappedExtendGap);

              // If this is the best-yet scoring cell
              if (match > bestScore) {
                // Update best start cell data
                bestScore = match;
                bestQueryPosition = queryPosition;
                bestSubjectPosition = subjectPosition;
              }
            } else {
              // insertQ is largest
              *bestRow = *insertQrow;

              // Calculate new Ix
              *insertQrow -= parameters_semiGappedExtendGap;

              // Dummy Iy
              insertS = constants_gappedExtensionDummyValue;
            }
          } else {
            if (insertS > *insertQrow) {
              // insertS is largest
              *bestRow = insertS;

              // Dummy Ix
              *insertQrow = constants_gappedExtensionDummyValue;

              // Calculate new Iy
              insertS -= parameters_semiGappedExtendGap;

            } else {
              // insertQ is largest
              *bestRow = *insertQrow;

              // Calculate new Ix
              *insertQrow -= parameters_semiGappedExtendGap;

              // Dummy Iy
              insertS = constants_gappedExtensionDummyValue;
            }
          }

          // If score at current cell (and cells to its right) are below dropoff
          if (rightOfDropoff) {
            if (bestScore > *bestRow + dropoff) {
              // Record dropoff position
              columnDropoff = subjectPosition;
            } else {
              // We are left of the column dropoff for this row
              rightOfDropoff = 0;
            }
          }

          // Reset subjectCount
          subjectCount = parameters_semiGappedExtensionN;
        }

        subjectPosition--;
        bestRow--;
        insertQrow--;
        subjectCount--;
      }

      // -----SINGLE CELL LEFT OF ROW DROPOFF -----
      if (!(bestScore > previousOldBest + dropoff) &&
          (subjectPosition >= subject)) {
        // Calculate match value
        match = previousOldBest + matrixColumn[*subjectPosition];

        // Set value for best
        *bestRow = maximum(match, insertS);

        // Calculate new Iy
        insertS = maximum(match - parameters_semiGappedOpenGap,
                          insertS - parameters_semiGappedExtendGap);

        // Set DUMMY values for Ix
        *insertQrow = constants_gappedExtensionDummyValue;

        subjectPosition--;
        bestRow--;
        insertQrow--;
      }

      // -----CELLS LEFT OF ROW DROPOFF -----
      // Only reachable by extending Iy leftwards; do so while the last cell
      // written (bestRow + 1) is still within dropoff of the best score
      if (!(bestScore > *(bestRow + 1) + dropoff)) {
        while (subjectPosition >= subject) {
          // Set value for Iy and best
          *bestRow = insertS;
          insertS = insertS - parameters_semiGappedExtendGap;

          // Set DUMMY values for Ix
          *insertQrow = constants_gappedExtensionDummyValue;

          // If score at current cell is below dropoff
          if (bestScore > *bestRow + dropoff) {
            // Stop processing row
            subjectPosition--;
            break;
          }

          subjectPosition--;
          bestRow--;
          insertQrow--;
        }
      }

      // Record dropoff position
      rowDropoff = subjectPosition + 1;

      // Clear insertS for next row
      insertS = constants_gappedExtensionDummyValue;

      // Reset queryCount
      queryCount = parameters_semiGappedExtensionN;
    }
    //        if (dloc == 746829265)
    //                print(semiGappedScoring_bestRow, subject, rowDropoff,
    // columnDropoff);
  }

  // Convert the best cell's pointers back into offsets for the caller
  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;
  return dpResults;
}
Ejemplo n.º 21
0
// Convert a CAFE scoring matrix into a BLAST score matrix.
//
// cafeMatrix is read as a cafeMatrixSize x cafeMatrixSize table in which
// row/column index i stands for the character 'A' + i. Entries whose row and
// column characters both map to a known code (encoding_getCode() !=
// encoding_unknownCode) are copied into the result; all other cells receive 1
// on the diagonal and the lowest copied value elsewhere. Finally, every cell
// in the sentinel row and column is set to constants_sentinalScore.
//
// Returns the new matrix together with the highest and lowest values copied
// (both start at 0). The matrix is a single global_malloc() allocation holding
// the row pointers followed by the cells.
struct scoreMatrix scoreMatrix_convertCafe(int4 **cafeMatrix,
                                           int4 cafeMatrixSize) {
  struct scoreMatrix scoreMatrix;
  char *tempAddress;
  int4 x, y;
  int4 rowCode, columnCode;

  scoreMatrix.highestValue = 0;
  scoreMatrix.lowestValue = 0;
  // This routine does not compute an average match score (scoreMatrix_load
  // does); zero the field so callers never read an indeterminate value.
  scoreMatrix.averageMatchScore = 0;

  // Declare memory used by scoreMatrix: row pointers first, then the cells
  scoreMatrix.matrix = (int2 **)global_malloc(
      sizeof(int2 *) * encoding_numCodes +
      sizeof(int2) * encoding_numCodes * encoding_numCodes);
  tempAddress = (char *)scoreMatrix.matrix;

  for (x = 0; x < encoding_numCodes; x++) {
    scoreMatrix.matrix[x] =
        (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes +
                 sizeof(int2) * encoding_numCodes * x);

    // Initialize the score matrix, by setting all values to sentinal score
    for (y = 0; y < encoding_numCodes; y++) {
      scoreMatrix.matrix[x][y] = constants_sentinalScore;
    }
  }

  // Go through the CAFE scoring matrix
  for (x = 0; x < cafeMatrixSize; x++) {
    // Rows that do not represent a valid amino acid contribute nothing
    rowCode = encoding_getCode('A' + x);
    if (rowCode == encoding_unknownCode)
      continue;

    for (y = 0; y < cafeMatrixSize; y++) {
      columnCode = encoding_getCode('A' + y);
      if (columnCode == encoding_unknownCode)
        continue;

      // Copy value to BLAST scoring matrix
      scoreMatrix.matrix[rowCode][columnCode] = cafeMatrix[x][y];

      // Update highest and lowest values
      if (cafeMatrix[x][y] > scoreMatrix.highestValue)
        scoreMatrix.highestValue = cafeMatrix[x][y];

      if (cafeMatrix[x][y] < scoreMatrix.lowestValue)
        scoreMatrix.lowestValue = cafeMatrix[x][y];
    }
  }

  // For cells in the score matrix that did not receive a score, use 1 on the
  // diagonal and lowestValue elsewhere
  for (x = 0; x < encoding_numCodes; x++) {
    for (y = 0; y < encoding_numCodes; y++) {
      if (scoreMatrix.matrix[x][y] == constants_sentinalScore) {
        if (x == y) {
          scoreMatrix.matrix[x][y] = 1;
        } else {
          scoreMatrix.matrix[x][y] = scoreMatrix.lowestValue;
        }
      }
    }
  }

  // Every letter scores poorly against the sentinal code
  for (x = 0; x < encoding_numCodes; x++) {
    scoreMatrix.matrix[x][encoding_sentinalCode] = constants_sentinalScore;
    scoreMatrix.matrix[encoding_sentinalCode][x] = constants_sentinalScore;
  }

  return scoreMatrix;
}
Ejemplo n.º 22
0
// Load the score matrix (eg. BLOSUM62) from disk and return its contents in an
// encoding_numCodes by encoding_numCodes array (the original comment describes
// this as 25 by 25: 20 amino acids plus 3 wilds, 1 unknown, and a sentinal
// code which scores poorly and flanks sequences).
//
// File format, as parsed here: lines beginning with '#' are comments and lines
// with no tokens are ignored. The first remaining line lists up to 24
// single-character column headings; each following line (up to 24 of them)
// starts with a row heading followed by one integer per column. Tokens may be
// separated by spaces, tabs, CR or LF.
//
// After reading the file:
//   - cells that received no score get 1 on the diagonal and lowestValue
//     elsewhere,
//   - the encoding_aaStartWildcards row/column is set to 1,
//   - the sentinal row/column is set to constants_sentinalScore,
//   - cluster wildcard rows/columns are filled from wildcards_clusterWildcards,
//   - averageMatchScore is computed from Robinson_prob over the
//     encoding_numLetters regular letters.
//
// Exits the process with status -1 if the file cannot be opened or a line
// fills the read buffer.
struct scoreMatrix scoreMatrix_load(char *filename) {
  // Buffer size for one line, and capacity of the heading arrays
  enum { MAXLINELENGTH = 8096, MAXHEADINGS = 24 };
  const char *delimiters = " \t\r\n";
  FILE *matrixFile;
  int4 lineNumber = 0;
  int4 tokenCount, tokenLimit;
  int4 numColumns = 0;
  char line[MAXLINELENGTH];
  char *token, *tempAddress;
  unsigned char columnHeadings[MAXHEADINGS];
  unsigned char rowHeadings[MAXHEADINGS];
  int2 value;
  struct scoreMatrix scoreMatrix;
  int4 x, y;

  scoreMatrix.highestValue = 0;
  scoreMatrix.lowestValue = 0;

  // Declare memory used by scoreMatrix: row pointers first, then the cells
  scoreMatrix.matrix = (int2 **)global_malloc(
      sizeof(int2 *) * encoding_numCodes +
      sizeof(int2) * encoding_numCodes * encoding_numCodes);
  tempAddress = (char *)scoreMatrix.matrix;
  x = 0;
  while (x < encoding_numCodes) {
    scoreMatrix.matrix[x] =
        (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes +
                 sizeof(int2) * encoding_numCodes * x);
    // Initialize the score matrix, by setting all values to sentinal score
    y = 0;
    while (y < encoding_numCodes) {
      scoreMatrix.matrix[x][y] = constants_sentinalScore;
      y++;
    }
    x++;
  }

  // Open file for reading
  if ((matrixFile = fopen(filename, "r")) == NULL) {
    fprintf(stderr, "%s\n", strerror(errno));
    fprintf(stderr, "Error opening matrix file %s for reading\n", filename);
    exit(-1);
  }

  // Read each line in turn
  while (fgets(line, MAXLINELENGTH, matrixFile) != NULL) {
    // Check we didn't max out the buffer. errno is not meaningful here (no
    // call has failed), so only the descriptive message is printed.
    if (strlen(line) >= (size_t)(MAXLINELENGTH - 1)) {
      fprintf(stderr,
              "Error reading file %s: maximum line length %d exceeded\n",
              filename, MAXLINELENGTH);
      exit(-1);
    }

    // Skip comments, and everything after the header plus MAXHEADINGS rows
    if (line[0] == '#' || lineNumber > MAXHEADINGS)
      continue;

    // The header may fill columnHeadings but not overrun it; a data row has
    // one row heading followed by at most one value per known column, so that
    // columnHeadings is never read beyond what the header supplied.
    tokenLimit = (lineNumber == 0) ? MAXHEADINGS : numColumns + 1;

    // Read each of the whitespace separated tokens from the line
    tokenCount = 0;
    token = strtok(line, delimiters);
    while (token != NULL && tokenCount < tokenLimit) {
      // First line - tokens are column headings
      if (lineNumber == 0) {
        columnHeadings[tokenCount] = token[0];
        numColumns = tokenCount + 1;
      }
      // Subsequent lines, first token is row heading
      else if (tokenCount == 0) {
        rowHeadings[lineNumber - 1] = token[0];
      }
      // Subsequent lines, subsequent tokens are array values
      else {
        // Get integer value of token
        value = atoi(token);

        // Add to scoring matrix
        scoreMatrix.matrix[encoding_getCode(rowHeadings[lineNumber - 1])]
                          [encoding_getCode(columnHeadings[tokenCount - 1])] =
            value;

        // Determine the highest and lowest values in the matrix
        if (value > scoreMatrix.highestValue) {
          scoreMatrix.highestValue = value;
        }
        if (value < scoreMatrix.lowestValue) {
          scoreMatrix.lowestValue = value;
        }
      }

      token = strtok(NULL, delimiters);
      tokenCount++;
    }

    // Blank lines (no tokens) must not count as the header or as a row,
    // otherwise every later row would be matched to the wrong heading
    if (tokenCount > 0)
      lineNumber++;
  }

  fclose(matrixFile);

  // For cells in the score matrix that did not receive a score, use 1 on the
  // diagonal and lowestValue elsewhere
  x = 0;
  while (x < encoding_numCodes) {
    y = 0;
    while (y < encoding_numCodes) {
      if (scoreMatrix.matrix[x][y] == constants_sentinalScore) {
        if (x == y) {
          scoreMatrix.matrix[x][y] = 1;
        } else {
          scoreMatrix.matrix[x][y] = scoreMatrix.lowestValue;
        }
      }
      y++;
    }
    x++;
  }

  // Every letter scores well against the wildcard
  x = 0;
  while (x < encoding_numCodes) {
    scoreMatrix.matrix[x][encoding_aaStartWildcards] = 1;
    scoreMatrix.matrix[encoding_aaStartWildcards][x] = 1;
    x++;
  }

  // Every letter scores poorly against the sentinal code
  x = 0;
  while (x < encoding_numCodes) {
    scoreMatrix.matrix[x][encoding_sentinalCode] = constants_sentinalScore;
    scoreMatrix.matrix[encoding_sentinalCode][x] = constants_sentinalScore;
    x++;
  }

  // Process wildcard scores: each cluster wildcard supplies its own row, which
  // is mirrored into the matching column
  y = 0;
  while (y < wildcards_numClusterWildcards) {
    x = 0;
    while (x < encoding_numLetters) {
      scoreMatrix.matrix[y + encoding_aaStartWildcards][x] =
          wildcards_clusterWildcards[y].scoreMatrixRow[x];
      scoreMatrix.matrix[x][y + encoding_aaStartWildcards] =
          wildcards_clusterWildcards[y].scoreMatrixRow[x];
      x++;
    }
    y++;
  }

  // Calculate average match score for two residues, weighting each pair by
  // the product of its Robinson_prob entries
  scoreMatrix.averageMatchScore = 0;
  y = 0;
  while (y < encoding_numLetters) {
    x = 0;
    while (x < encoding_numLetters) {
      scoreMatrix.averageMatchScore +=
          scoreMatrix.matrix[y][x] * Robinson_prob[x] * Robinson_prob[y];
      x++;
    }
    y++;
  }
  // NOTE(review): the 1000000 divisor presumably reflects Robinson_prob being
  // scaled so each entry is out of 1000 - confirm against its definition.
  scoreMatrix.averageMatchScore /= 1000000;

  return scoreMatrix;
}
Ejemplo n.º 23
0
// Create a nucleotide scoring matrix using match and mismatch penalties.
//
// For every pair of codes (x, y):
//   - if either is the sentinal code, the score is constants_sentinalScore;
//   - if both are at or above encoding_numRegularLetters (wildcards), the
//     score is the ceiling of the expected score over all pairs of their
//     replacement codes;
//   - if x == y, the score is `match`;
//   - otherwise the score is `mismatch`, unless one code appears in the other's
//     replacement list, in which case it is the ceiling of the expected score
//     of matching one of that wildcard's replacements.
//
// Returns the matrix with highestValue = match and lowestValue = mismatch. The
// matrix is a single global_malloc() allocation holding the row pointers
// followed by the cells.
struct scoreMatrix scoreMatrix_create(int2 match, int2 mismatch) {
  struct scoreMatrix scoreMatrix;
  char *tempAddress;
  int4 x, y, numXcodes, numYcodes, count, xCode, yCode, total;

  scoreMatrix.highestValue = match;
  scoreMatrix.lowestValue = mismatch;
  // No average match score is computed here (scoreMatrix_load does that for
  // protein matrices); zero the field so it is never read uninitialised.
  scoreMatrix.averageMatchScore = 0;

  // Declare memory used by scoreMatrix: row pointers first, then the cells
  scoreMatrix.matrix = (int2 **)global_malloc(
      sizeof(int2 *) * encoding_numCodes +
      sizeof(int2) * encoding_numCodes * encoding_numCodes);
  tempAddress = (char *)scoreMatrix.matrix;

  // For each row in the matrix
  for (x = 0; x < encoding_numCodes; x++) {
    // Initialize memory
    scoreMatrix.matrix[x] =
        (int2 *)(tempAddress + sizeof(int2 *) * encoding_numCodes +
                 sizeof(int2) * encoding_numCodes * x);

    // For each column, determine value
    for (y = 0; y < encoding_numCodes; y++) {
      // If either is the sentinal code, use sentinal score
      if (x == encoding_sentinalCode || y == encoding_sentinalCode) {
        scoreMatrix.matrix[x][y] = constants_sentinalScore;
      }
      // If both characters are wilds, calculate score for match
      else if (x >= encoding_numRegularLetters &&
               y >= encoding_numRegularLetters) {
        // Count how many (x replacement, y replacement) pairs are identical
        total = 0;
        count = 0;
        numXcodes = encoding_wildcards[x].numCodes;
        numYcodes = encoding_wildcards[y].numCodes;
        for (xCode = 0; xCode < numXcodes; xCode++) {
          for (yCode = 0; yCode < numYcodes; yCode++) {
            // If the letters match
            if (encoding_wildcards[x].replacementCodes[xCode] ==
                encoding_wildcards[y].replacementCodes[yCode]) {
              count++;
            }
            total++;
          }
        }

        if (total > 0) {
          // Calculate frequency of the letters matching and probable score
          scoreMatrix.matrix[x][y] = (int4)ceilf(
              ((float)match * (float)count / (float)total) +
              ((float)mismatch * (float)(total - count) / (float)total));
        } else {
          // A code without replacement codes can never match; avoid dividing
          // by zero (converting the resulting NaN to an integer is undefined)
          scoreMatrix.matrix[x][y] = mismatch;
        }
      }
      // If the characters match
      else if (x == y) {
        scoreMatrix.matrix[x][y] = match;
      }
      // Mismatch
      else {
        scoreMatrix.matrix[x][y] = mismatch;

        // If y is in x's list of ambiguous codes
        numXcodes = encoding_wildcards[x].numCodes;
        for (count = 0; count < numXcodes; count++) {
          if (encoding_wildcards[x].replacementCodes[count] == y) {
            // Give score based on probability of a match
            scoreMatrix.matrix[x][y] = (int4)ceilf(
                ((float)match / (float)numXcodes) +
                ((float)mismatch * (float)(numXcodes - 1) / (float)numXcodes));
          }
        }

        // Similarly if x is in y's list of ambiguous codes
        numYcodes = encoding_wildcards[y].numCodes;
        for (count = 0; count < numYcodes; count++) {
          if (encoding_wildcards[y].replacementCodes[count] == x) {
            // Give score based on probability of a match
            scoreMatrix.matrix[x][y] = (int4)ceilf(
                ((float)match / (float)numYcodes) +
                ((float)mismatch * (float)(numYcodes - 1) / (float)numYcodes));
          }
        }
      }
    }
  }

  return scoreMatrix;
}
Ejemplo n.º 24
0
// Perform dynamic programming to explore possible start points and alignments
// that end at
// the given seed and find the best score
struct dpResults nuGappedScoring_dpBeforeSeed(unsigned char *subject,
                                              struct PSSMatrix PSSMatrix,
                                              struct coordinate seed,
                                              int4 dropoff) {
  int2 **queryPosition, **bestQueryPosition;
  int2 **rowDropoff, **columnDropoff;
  unsigned char *subjectPosition, *bestSubjectPosition, subjectChar;
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 queryDistance;
  int4 oldBest, match, previousOldBest;
  unsigned char rightOfDropoff;
  struct dpResults dpResults;
  int4 bytePosition;

  if (seed.queryOffset == 0 || seed.subjectOffset == 0) {
    dpResults.best.queryOffset = 0;
    dpResults.best.subjectOffset = 0;
    dpResults.bestScore = bestScore;
    dpResults.traceback = NULL;
    return dpResults;
  }

  // Declare processing rows for storing match, insert-subject and insert-query
  // values
  // If current malloced rows aren't big enough
  if (seed.queryOffset >= nuGappedScoring_rowSizes) {
    // Free existing rows
    free(nuGappedScoring_bestRow);
    free(nuGappedScoring_insertQrow);
    // Set size to double current needed length
    nuGappedScoring_rowSizes = (seed.queryOffset) * 2;
    // Malloc new rows
    nuGappedScoring_bestRow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
    nuGappedScoring_insertQrow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
  }

  // Convert subject offset to point to bytepacked subject
  bytePosition = (seed.subjectOffset - 1) % 4;
  bestSubjectPosition = subjectPosition =
      subject + ((seed.subjectOffset - 1) / 4);
  bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1;

  // Initialize row pointers
  rowOffset = (queryPosition - PSSMatrix.matrix);
  bestRow = nuGappedScoring_bestRow + rowOffset;
  insertQrow = nuGappedScoring_insertQrow + rowOffset;

  // Set initial row dropoff and column dropoff
  rowDropoff = PSSMatrix.matrix;
  columnDropoff = PSSMatrix.matrix + seed.queryOffset;

  // -----FIRST ROW-----
  subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

  // -----FIRST CELL-----
  // Set M value for bottom-right cell
  match = (*queryPosition)[subjectChar];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_openGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Update best start cell data
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  queryDistance = 0;
  queryPosition--;
  bestRow--;
  insertQrow--;

  // ----- REMAINING CELLS -----
  // For each remaining column in the bottom row, scanning from right-to-left
  while (queryPosition >= PSSMatrix.matrix) {
    // Set value for M
    match = (*queryPosition)[subjectChar] - parameters_openGap -
            queryDistance * parameters_extendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS =
          maximum(match - parameters_openGap, insertS - parameters_extendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_extendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Update best start cell data
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = queryPosition;
      // And stop processing row
      break;
    }

    queryPosition--;
    bestRow--;
    insertQrow--;
    queryDistance++;
  }

  // Clear insertS for next row
  insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
  if (parameters_verboseDloc == blast_dloc)
    nuGappedScoring_printBeforeRow(nuGappedScoring_bestRow, PSSMatrix.matrix,
                                   rowDropoff, columnDropoff);
#endif

  // -----REMAINING ROWS-----
  while (rowDropoff < columnDropoff) {
    // Move to next subject characters
    if (bytePosition) {
      // Next char in current byte
      bytePosition--;
    } else {
      // Involves moving to next byte
      bytePosition = 3;
      subjectPosition--;
      if (subjectPosition < subject)
        break;
    }

    // Extract the subject characters
    subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

    //    	printf("[%d/%d]", subjectPosition - subject, bytePosition);

    queryPosition = columnDropoff - 1;

    // Reset row pointers to start of rows
    rowOffset = (queryPosition - PSSMatrix.matrix);
    bestRow = nuGappedScoring_bestRow + rowOffset;
    insertQrow = nuGappedScoring_insertQrow + rowOffset;

    // -----FAR RIGHT CELL-----
    // Record some old values
    previousOldBest = *bestRow;

    // Ix is the best
    *bestRow = *insertQrow;

    // Calculate new Ix value
    *insertQrow -= parameters_extendGap;

    // Set DUMMY value for Iy, which should never be used
    insertS = constants_gappedExtensionDummyValue;

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      columnDropoff = queryPosition;
      rightOfDropoff = 1;
    } else {
      // We are left of the column dropoff for this row
      rightOfDropoff = 0;
    }

    queryPosition--;
    bestRow--;
    insertQrow--;

  // -----CELLS RIGHT OF ROW DROPOFF-----
  start1:
    // Loop 1 when insertS has no value
    while (queryPosition >= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M and Ix
      if (match > *insertQrow) {
        // Match is largest
        *bestRow = match;

        // Calculate new Ix
        *insertQrow = maximum(match - parameters_openGap,
                              *insertQrow - parameters_extendGap);

        // Calculate new Iy
        insertS =
            maximum(match - parameters_openGap, insertS - parameters_extendGap);

        // If this is the best-yet scoring cell
        if (match > bestScore) {
          // Update best start cell data
          bestScore = match;
          bestQueryPosition = queryPosition;
          bestSubjectPosition = subjectPosition;
        }

        // If score at current cell (and cells to its right) are below dropoff
        if (rightOfDropoff) {
          if (bestScore > *bestRow + dropoff) {
            // Record dropoff position
            columnDropoff = queryPosition;
          } else {
            // We are left of the column dropoff for this row
            rightOfDropoff = 0;
          }
        }
        queryPosition--;
        bestRow--;
        insertQrow--;

        // InsertS now has a value
        break;
      } else {
        // insertQ is largest
        *bestRow = *insertQrow;

        // Calculate new Ix
        *insertQrow -= parameters_extendGap;
      }

      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;
    }

    // Loop2 whilst insertS does have a value
    while (queryPosition >= rowDropoff) {
      // Calculate new M value
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M, Ix and Iy
      if (match > insertS) {
        if (match > *insertQrow) {
          // Match is largest
          *bestRow = match;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap,
                                *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap,
                            insertS - parameters_extendGap);

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      } else {
        if (insertS > *insertQrow) {
          // insertS is largest
          *bestRow = insertS;

          // Dummy Ix
          *insertQrow = constants_gappedExtensionDummyValue;

          // Calculate new Iy
          insertS -= parameters_extendGap;

        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      }

      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;
    }

    if (queryPosition >= rowDropoff) {
      // If score at current cell (and cells to its right) are below dropoff
      if (rightOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // We are left of the column dropoff for this row
          rightOfDropoff = 0;
        }
      }

      queryPosition--;
      bestRow--;
      insertQrow--;
      goto start1;
    }

    // -----CELLS LEFT OF ROW DROPOFF -----
    if (!(bestScore > *(bestRow + 1) + dropoff)) {
      while (queryPosition >= PSSMatrix.matrix) {
        // Set value for Iy and best
        *bestRow = insertS;
        insertS = insertS - parameters_extendGap;

        // Set DUMMY values for Ix
        *insertQrow = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // Stop processing row
          queryPosition--;
          break;
        }

        queryPosition--;
        bestRow--;
        insertQrow--;
      }
    }

    // Record dropoff position
    rowDropoff = queryPosition + 1;

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
    if (parameters_verboseDloc == blast_dloc)
      nuGappedScoring_printBeforeRow(nuGappedScoring_bestRow, PSSMatrix.matrix,
                                     rowDropoff, columnDropoff);
#endif
  }

  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;
  return dpResults;
}
Ejemplo n.º 25
0
// Perform dynamic programming to explore possible END points of alignments
// that start at the given seed, and find the best score.
//
// The scan runs left-to-right over the query (columns of PSSMatrix) and
// forwards through the subject, one subject base per row. Only one row of
// scores is kept: nuGappedScoring_bestRow holds the best score per column and
// nuGappedScoring_insertQrow the Ix (insert-in-query) score per column; Iy is
// carried along the row in the scalar insertS.
//
// subject:       subject bases, read through encoding_extractBase() with a
//                byte position in 0..3
// PSSMatrix:     query scoring matrix; (*column)[base] is the match score
// dropoff:       a cell is considered dead once bestScore exceeds its score by
//                more than this amount
// subjectLength: split as subjectLength / 4 whole bytes plus subjectLength % 4
//                to locate the end of the subject
//
// Returns a dpResults holding the best score found and the query/subject
// offsets of the cell that achieved it; traceback is always NULL.
//
// NOTE(review): best.subjectOffset is a byte offset (bestSubjectPosition -
// subject); the position within the byte is not recorded here - confirm the
// caller does not need it.
// NOTE(review): the scan starts at query column 1 and byte position 1,
// presumably because position 0 is the seed itself - confirm with the caller.
struct dpResults nuGappedScoring_dpAfterSeed(unsigned char *subject,
                                             struct PSSMatrix PSSMatrix,
                                             int4 dropoff, int4 subjectLength) {
  int2 **queryPosition, **bestQueryPosition, **queryEnd;
  // The live window of columns for the current row is bounded by these two
  int2 **rowDropoff, **columnDropoff;
  unsigned char *subjectPosition, *bestSubjectPosition, *subjectEnd,
      subjectChar;
  // Starts at zero, so only strictly positive cells can become the best
  int4 bestScore = 0;
  int4 *bestRow, *insertQrow, insertS, rowOffset;
  int4 queryDistance;
  int4 oldBest, match, previousOldBest;
  // Set while every cell processed so far in the current row is below dropoff
  unsigned char leftOfDropoff;
  int4 queryLength;
  struct dpResults dpResults;
  int4 bytePosition, subjectEndBytePosition;

  queryLength = PSSMatrix.length;
  queryEnd = PSSMatrix.matrix + queryLength;

  // End of subject expressed as (byte, position-within-byte)
  subjectEnd = subject + (subjectLength / 4);
  subjectEndBytePosition = subjectLength % 4;

  // Declare processing rows for storing match, insert-subject and insert-query
  // values
  // If current malloced rows aren't big enough
  if (queryLength >= nuGappedScoring_rowSizes) {
    // Free existing rows (their contents are not needed across calls)
    free(nuGappedScoring_bestRow);
    free(nuGappedScoring_insertQrow);
    // Set size to double current needed length
    nuGappedScoring_rowSizes = queryLength * 2;
    // Malloc new rows
    nuGappedScoring_bestRow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
    nuGappedScoring_insertQrow =
        (int4 *)global_malloc(sizeof(int4) * nuGappedScoring_rowSizes);
  }

  // Start at the first subject byte, base 1, and query column 1
  bestSubjectPosition = subjectPosition = subject;
  bytePosition = 1;
  bestQueryPosition = queryPosition = PSSMatrix.matrix + 1;

  // Initialize rows; row index equals the query column index
  bestRow = nuGappedScoring_bestRow + 1;
  insertQrow = nuGappedScoring_insertQrow + 1;

  // Set initial row dropoff and column dropoff
  rowDropoff = PSSMatrix.matrix + queryLength - 1;
  columnDropoff = PSSMatrix.matrix;

  // -----FIRST ROW-----
  subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

  // -----FIRST CELL-----
  // Set M value for top-left cell
  match = (*queryPosition)[subjectChar];

  // M must be the best
  *bestRow = match;

  // Only gap opens possible
  *insertQrow = insertS = match - parameters_openGap;

  // If this is the best-yet scoring cell
  if (match > bestScore) {
    // Record this cell as the best end point so far
    bestScore = match;
    bestQueryPosition = queryPosition;
    bestSubjectPosition = subjectPosition;
  }

  queryDistance = 0;
  queryPosition++;
  bestRow++;
  insertQrow++;

  // ----- REMAINING CELLS -----
  // For each remaining columns in the top row, scanning from left-to-right
  while (queryPosition < queryEnd) {
    // Set value for M: the substitution score less one gap open and
    // queryDistance gap extensions
    match = (*queryPosition)[subjectChar] - parameters_openGap -
            queryDistance * parameters_extendGap;

    // Determine the best of M and Iy
    if (match > insertS) {
      *bestRow = match;

      // Calculate new Iy
      insertS =
          maximum(match - parameters_openGap, insertS - parameters_extendGap);
    } else {
      *bestRow = insertS;

      // Since M <= Iy, new Iy must derive from Iy
      insertS -= parameters_extendGap;
    }

    // Set DUMMY Ix value, which should never be used
    *insertQrow = constants_gappedExtensionDummyValue;

    // If this is the best-yet scoring cell
    if (match > bestScore) {
      // Record this cell as the best end point so far
      bestScore = match;
      bestQueryPosition = queryPosition;
      bestSubjectPosition = subjectPosition;
    }

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      rowDropoff = queryPosition;
      // And stop processing row
      break;
    }

    queryPosition++;
    bestRow++;
    insertQrow++;
    queryDistance++;
  }

  // Clear insertS for next row
  insertS = constants_gappedExtensionDummyValue;

#ifdef VERBOSE
  if (parameters_verboseDloc == blast_dloc)
    nuGappedScoring_printAfterRow(nuGappedScoring_bestRow + 1, PSSMatrix.matrix,
                                  rowDropoff, columnDropoff);
#endif

  // Move to next subject character
  if (bytePosition == 3) {
    // Last base of this byte; continue with base 0 of the next byte
    bytePosition = 0;
    subjectPosition++;
  } else {
    bytePosition++;
  }

  // -----REMAINING ROWS-----
  // Keep going while the window between the two dropoffs is non-empty
  while (rowDropoff > columnDropoff) {
    // Stop at end of subject
    if (subjectPosition == subjectEnd && bytePosition == subjectEndBytePosition)
      break;

    subjectChar = encoding_extractBase(*subjectPosition, bytePosition);

    // Start this row at the column just after the column dropoff
    queryPosition = columnDropoff + 1;

    // Reset rows
    rowOffset = (queryPosition - PSSMatrix.matrix);
    bestRow = nuGappedScoring_bestRow + rowOffset;
    insertQrow = nuGappedScoring_insertQrow + rowOffset;

    // -----FAR LEFT CELL-----
    // Record the previous row's best here; it is the diagonal predecessor of
    // the next cell to the right
    previousOldBest = *bestRow;

    // Ix is the best
    *bestRow = *insertQrow;

    // Calculate new Ix value
    *insertQrow -= parameters_extendGap;

    // Set DUMMY value for Iy, which should never be used
    insertS = constants_gappedExtensionDummyValue;

    // If score at current cell is below dropoff
    if (bestScore > *bestRow + dropoff) {
      // Record dropoff position
      columnDropoff = queryPosition;
      leftOfDropoff = 1;
    } else {
      // This cell is within dropoff, so the column dropoff does not advance
      leftOfDropoff = 0;
    }

    queryPosition++;
    bestRow++;
    insertQrow++;

  // -----CELLS LEFT OF ROW DROPOFF-----
  start2:
    // Loop 1 when insertS has no value
    while (queryPosition <= rowDropoff) {
      // Calculate new M value from the diagonal predecessor, saving this
      // column's previous-row best before it is overwritten
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M and Ix
      if (match > *insertQrow) {
        // Match is largest
        *bestRow = match;

        // Calculate new Ix
        *insertQrow = maximum(match - parameters_openGap,
                              *insertQrow - parameters_extendGap);

        // Calculate new Iy
        insertS =
            maximum(match - parameters_openGap, insertS - parameters_extendGap);

        // If this is the best-yet scoring cell
        if (match > bestScore) {
          // Record this cell as the best end point so far
          bestScore = match;
          bestQueryPosition = queryPosition;
          bestSubjectPosition = subjectPosition;
        }

        // If the current cell (and all cells to its left in this row) are
        // below dropoff
        if (leftOfDropoff) {
          if (bestScore > *bestRow + dropoff) {
            // Record dropoff position
            columnDropoff = queryPosition;
          } else {
            // This cell is within dropoff; stop advancing the column dropoff
            leftOfDropoff = 0;
          }
        }
        queryPosition++;
        bestRow++;
        insertQrow++;

        // InsertS now has a value, so continue in loop 2
        break;
      } else {
        // insertQ is largest
        *bestRow = *insertQrow;

        // Calculate new Ix
        *insertQrow -= parameters_extendGap;
      }

      // If the current cell (and all cells to its left in this row) are below
      // dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;
    }

    // Loop2 whilst insertS does have a value
    while (queryPosition <= rowDropoff) {
      // Calculate new M value from the diagonal predecessor
      oldBest = *bestRow;
      match = (*queryPosition)[subjectChar] + previousOldBest;
      previousOldBest = oldBest;

      // Determine the best of M, Ix and Iy
      if (match > insertS) {
        if (match > *insertQrow) {
          // Match is largest
          *bestRow = match;

          // Calculate new Ix
          *insertQrow = maximum(match - parameters_openGap,
                                *insertQrow - parameters_extendGap);

          // Calculate new Iy
          insertS = maximum(match - parameters_openGap,
                            insertS - parameters_extendGap);

          // If this is the best-yet scoring cell
          if (match > bestScore) {
            // Record this cell as the best end point so far
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
          }
        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // Drop Iy and leave loop 2; the dropoff check and pointer advance
          // for this cell are done after the loop before re-entering loop 1
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      } else {
        if (insertS > *insertQrow) {
          // insertS is largest
          *bestRow = insertS;

          // Dummy Ix
          *insertQrow = constants_gappedExtensionDummyValue;

          // Calculate new Iy
          insertS -= parameters_extendGap;

        } else {
          // insertQ is largest
          *bestRow = *insertQrow;

          // Calculate new Ix
          *insertQrow -= parameters_extendGap;

          // Drop Iy and leave loop 2; the dropoff check and pointer advance
          // for this cell are done after the loop before re-entering loop 1
          insertS = constants_gappedExtensionDummyValue;
          break;
        }
      }

      // If the current cell (and all cells to its left in this row) are below
      // dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;
    }

    // Still inside the window: loop 2 was left via break, so finish the
    // bookkeeping for that cell and go back to loop 1
    if (queryPosition <= rowDropoff) {
      // If the current cell (and all cells to its left in this row) are below
      // dropoff
      if (leftOfDropoff) {
        if (bestScore > *bestRow + dropoff) {
          // Record dropoff position
          columnDropoff = queryPosition;
        } else {
          // This cell is within dropoff; stop advancing the column dropoff
          leftOfDropoff = 0;
        }
      }

      queryPosition++;
      bestRow++;
      insertQrow++;
      goto start2;
    }

    // -----CELLS RIGHT OF ROW DROPOFF -----
    // Only extend past the old row dropoff if the last processed cell is
    // itself within dropoff
    if (!(bestScore > *(bestRow - 1) + dropoff)) {
      while (queryPosition < queryEnd) {
        // Set value for Iy and best
        *bestRow = insertS;
        insertS = insertS - parameters_extendGap;

        // Set DUMMY value for Ix, which should never be used
        *insertQrow = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff) {
          // And stop processing row
          queryPosition++;
          break;
        }

        queryPosition++;
        bestRow++;
        insertQrow++;
      }
    }

    // Record dropoff position: the last column processed in this row
    rowDropoff = queryPosition - 1;

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

    // Move to next subject character
    if (bytePosition == 3) {
      // Last base of this byte; continue with base 0 of the next byte
      bytePosition = 0;
      subjectPosition++;
    } else {
      bytePosition++;
    }

#ifdef VERBOSE
    if (parameters_verboseDloc == blast_dloc)
      nuGappedScoring_printAfterRow(nuGappedScoring_bestRow + 1,
                                    PSSMatrix.matrix, rowDropoff,
                                    columnDropoff);
#endif
  }

  // Report the best cell as offsets from the start of the matrix and subject
  dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
  dpResults.best.subjectOffset = bestSubjectPosition - subject;
  dpResults.bestScore = bestScore;
  dpResults.traceback = NULL;
  return dpResults;
}
Ejemplo n.º 26
0
// Load a single subject into memory.
//
// Protein search: the whole encoded subject (starting one byte before
// alignment->subject) is copied into a fresh buffer and alignment->subject is
// repointed at the copy.
//
// Nucleotide search: only the regions reported by unpack_getRegions() are
// copied, one buffer per region. The regions are stored on the alignment,
// any edits are copied into their own buffer, and alignment->subject is set
// to NULL.
//
// In both cases alignment->inMemorySubject is set to 1 and blast_totalCopied
// is increased by the number of sequence bytes copied.
//
// Returns the number of region bytes copied for a nucleotide search. For a
// protein search the return value is 0 (only blast_totalCopied is updated).
int4 unpack_loadSubject(struct PSSMatrix PSSMatrix,
                        struct alignment *alignment) {
  uint4 totalCopied = 0;
  unsigned char *subject, *edits, *endEdits;
  struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion;
  int4 numRegions, regionStart, regionEnd, regionBytes;

  // If protein search
  if (encoding_alphabetType == encoding_protein) {
    // Make copy of sequence, including the byte that precedes
    // alignment->subject
    subject = (unsigned char *)global_malloc(sizeof(unsigned char) *
                                             alignment->encodedLength);
    memcpy(subject, alignment->subject - 1, alignment->encodedLength);

    // The subject proper starts one byte into the copy
    alignment->subject = subject + 1;

    blast_totalCopied += alignment->encodedLength;
  }
  // If a nucleotide search
  else {
    // Get a list of regions to copy; the numRegions entries ending at the last
    // entry of unpack_subjectRegions are the ones used for this subject
    numRegions =
        unpack_getRegions(PSSMatrix, alignment, 1, unpack_subjectRegions);
    lastRegion = memBlocks_getLastEntry(unpack_subjectRegions);
    lastRegion++;
    firstRegion = lastRegion - numRegions;

#ifdef VERBOSE
    if (parameters_verboseDloc == alignment->descriptionLocation) {
      // Pointer difference is ptrdiff_t; convert explicitly for %d
      printf("%d regions for subject\n", (int)(lastRegion - firstRegion));
      fflush(stdout);
    }
#endif

    // Copy each region into memory
    for (currentRegion = firstRegion; currentRegion < lastRegion;
         currentRegion++) {
#ifdef VERBOSE
      if (parameters_verboseDloc == alignment->descriptionLocation) {
        printf("Load region %d to %d into memory\n", currentRegion->startOffset,
               currentRegion->endOffset);
        fflush(stdout);
      }
#endif

      // Convert offsets to whole bytes (four offsets per byte): round the
      // start down and the end up
      regionStart = currentRegion->startOffset / 4;
      regionEnd = (currentRegion->endOffset + 3) / 4;
      regionBytes = regionEnd - regionStart;

      currentRegion->unpackedSubject = NULL;
      currentRegion->subject =
          (unsigned char *)global_malloc(sizeof(unsigned char) * regionBytes);

      totalCopied += regionBytes;
      memcpy(currentRegion->subject, alignment->subject + regionStart,
             regionBytes);

      // Bias the pointer back by regionStart, so that indexing it with a byte
      // offset measured from the start of the subject lands inside the copy
      currentRegion->subject -= regionStart;
      currentRegion->subjectLength = alignment->subjectLength;

      blast_totalCopied += regionBytes;
    }

    // Store new alignment regions
    alignment->unpackRegions = firstRegion;
    alignment->numUnpackRegions = lastRegion - firstRegion;

    // If there are edits for this subject
    if (alignment->edits != NULL) {
      // Edits run from alignment->edits to the end of the encoded subject
      edits = alignment->edits;
      endEdits = alignment->subject + alignment->encodedLength;

      // Make an in-memory copy of them. Allocate through global_malloc like
      // every other buffer in this function, instead of an unchecked raw
      // malloc whose result was passed straight to memcpy.
      alignment->edits =
          (unsigned char *)global_malloc(sizeof(char) * (endEdits - edits));
      memcpy(alignment->edits, edits, endEdits - edits);
    }

    // The original subject buffer is no longer referenced by the alignment
    alignment->subject = NULL;
  }

  alignment->inMemorySubject = 1;

  return totalCopied;
}
Ejemplo n.º 27
0
// Perform dynamic programming to explore possible start points of alignments
// that end at the given seed, and find the best score.
//
// The scan runs backwards from the cell just before the seed. Each DP row
// corresponds to one query position and spans subject positions, so the row
// buffers (oldGappedScoring_bestRow / oldGappedScoring_insertQrow) are indexed
// by subject offset. Iy is carried along the row in the scalar insertS.
//
// subject:   subject sequence; each byte is used directly as an index into a
//            query matrix column
// PSSMatrix: query scoring matrix; (*column)[code] is the match score
// seed:      query/subject offsets of the seed; the scan starts one position
//            before it in both sequences
// dropoff:   a cell is considered dead once bestScore exceeds its score by
//            more than this amount
//
// Returns a dpResults holding the best score found and the query/subject
// offsets of the cell that achieved it; traceback is always NULL.
struct dpResults oldGappedScoring_dpBeforeSeed(unsigned char* subject, struct PSSMatrix PSSMatrix,
                                            struct coordinate seed, int4 dropoff)
{
    int2 **queryPosition, **bestQueryPosition;
    int2* matrixColumn;
    // The live window of subject positions for the current row
    unsigned char *rowDropoff, *columnDropoff;
    unsigned char* subjectPosition, *bestSubjectPosition;
    // Starts at zero, so only strictly positive cells can become the best
    int4 bestScore = 0;
    int4 *bestRow, *insertQrow, insertS, rowOffset;
    int4 subjectDistance;
    int4 oldBest, match, previousOldBest;
    // Set while every cell processed so far in the current row is below dropoff
    unsigned char rightOfDropoff;
    struct dpResults dpResults;

    // Declare processing rows for storing match, insert-subject and insert-query values
    // If current malloced rows aren't big enough
    if (seed.subjectOffset >= oldGappedScoring_rowSizes)
    {
        // Free existing rows (their contents are not needed across calls)
        free(oldGappedScoring_bestRow);
        free(oldGappedScoring_insertQrow);
        // Set size to double current needed length
        oldGappedScoring_rowSizes = (seed.subjectOffset) * 2;
        // Malloc new rows
        oldGappedScoring_bestRow = (int4*)global_malloc(sizeof(int4) * oldGappedScoring_rowSizes);
        oldGappedScoring_insertQrow = (int4*)global_malloc(sizeof(int4) * oldGappedScoring_rowSizes);
    }

    // Start at the cell immediately before the seed in both sequences
    bestSubjectPosition = subjectPosition = subject + seed.subjectOffset - 1;
    bestQueryPosition = queryPosition = PSSMatrix.matrix + seed.queryOffset - 1;

    // Initialize row pointers; row index equals the subject offset
    rowOffset = (subjectPosition - subject);
    bestRow = oldGappedScoring_bestRow + rowOffset;
    insertQrow = oldGappedScoring_insertQrow + rowOffset;

    // Set initial row dropoff and column dropoff
    rowDropoff = subject;
    columnDropoff = subject + seed.subjectOffset;

    // Score column for the query position just before the seed
    matrixColumn = *queryPosition;

    // -----FIRST ROW-----

    // -----FIRST CELL-----
    // Set M value for bottom-right cell
    match = matrixColumn[*subjectPosition];

    // M must be the best
    *bestRow = match;

    // Only gap opens possible
    *insertQrow = insertS = match - parameters_openGap;

    // If this is the best-yet scoring cell
    if (match > bestScore)
    {
        // Update best start cell data
        bestScore = match;
        bestQueryPosition = queryPosition;
        bestSubjectPosition = subjectPosition;
    }

    subjectDistance = 0;
    subjectPosition--; bestRow--; insertQrow--;

    // ----- REMAINING CELLS -----
    // For each remaining column in the bottom row, scanning from right-to-left
    while (subjectPosition >= subject)
    {
        // Set value for M: the substitution score less one gap open and
        // subjectDistance gap extensions
        match = matrixColumn[*subjectPosition]
              - parameters_openGap - subjectDistance * parameters_extendGap;

        // Determine the best of M and Iy
        if (match > insertS)
        {
            *bestRow = match;

            // Calculate new Iy
            insertS = maximum(match - parameters_openGap,
                              insertS - parameters_extendGap);
        }
        else
        {
            *bestRow = insertS;

            // Since M <= Iy, new Iy must derive from Iy
            insertS -= parameters_extendGap;
        }

        // Set DUMMY Ix value, which should never be used
        *insertQrow = constants_gappedExtensionDummyValue;

        // If this is the best-yet scoring cell
        if (match > bestScore)
        {
            // Update best start cell data
            bestScore = match;
            bestQueryPosition = queryPosition;
            bestSubjectPosition = subjectPosition;
        }

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff)
        {
            // Record dropoff position
            rowDropoff = subjectPosition;
            // And stop processing row
            break;
        }

        subjectPosition--; bestRow--; insertQrow--;
        subjectDistance++;
    }

    // Clear insertS for next row
    insertS = constants_gappedExtensionDummyValue;

    // Disabled debugging aid:
//    if (dloc == 19063576)
//    print(oldGappedScoring_bestRow, subject, rowDropoff, columnDropoff);

    // -----REMAINING ROWS-----
    // Continue while query positions remain and the window between the two
    // dropoffs is non-empty
    while (queryPosition > PSSMatrix.matrix && rowDropoff < columnDropoff)
    {
        queryPosition--;
        // Start this row at the position just before the column dropoff
        subjectPosition = columnDropoff - 1;

        // Reset row pointers to start of rows
        rowOffset = (subjectPosition - subject);
        bestRow = oldGappedScoring_bestRow + rowOffset;
        insertQrow = oldGappedScoring_insertQrow + rowOffset;

        // Using next column of query matrix
        matrixColumn = *queryPosition;

        // -----FAR RIGHT CELL-----
        // Record the previous row's best here; it is the diagonal predecessor
        // of the next cell to the left
        previousOldBest = *bestRow;

        // Ix is the best
        *bestRow = *insertQrow;

        // Calculate new Ix value
        *insertQrow -= parameters_extendGap;

        // Set DUMMY value for Iy, which should never be used
        insertS = constants_gappedExtensionDummyValue;

        // If score at current cell is below dropoff
        if (bestScore > *bestRow + dropoff)
        {
            // Record dropoff position
            columnDropoff = subjectPosition;
            rightOfDropoff = 1;
        }
        else
        {
            // We are left of the column dropoff for this row
            rightOfDropoff = 0;
        }

        subjectPosition--; bestRow--; insertQrow--;

        // -----CELLS RIGHT OF ROW DROPOFF-----
        while (subjectPosition >= rowDropoff)
        {
            // Remember old M value (for cell below this one)
            oldBest = *bestRow;

            // Calculate new M value from the diagonal predecessor
            match = matrixColumn[*subjectPosition] + previousOldBest;
            previousOldBest = oldBest;

            // Determine the best of M, Ix and Iy
            if (match > insertS)
            {
            	if (match > *insertQrow)
                {
                    // Match is largest
                    *bestRow = match;

                    // Calculate new Ix
                    *insertQrow = maximum(match - parameters_openGap,
                                          *insertQrow - parameters_extendGap);

                    // Calculate new Iy
                    insertS = maximum(match - parameters_openGap,
                                      insertS - parameters_extendGap);

                    // If this is the best-yet scoring cell
                    if (match > bestScore)
                    {
                        // Update best start cell data
                        bestScore = match;
                        bestQueryPosition = queryPosition;
                        bestSubjectPosition = subjectPosition;
                    }
                }
                else
                {
                    // insertQ is largest
                    *bestRow = *insertQrow;

                    // Calculate new Ix
                    *insertQrow -= parameters_extendGap;

                    // Calculate new Iy
                    insertS = maximum(match - parameters_openGap,
                                      insertS - parameters_extendGap);
                }
            }
            else
            {
            	if (insertS > *insertQrow)
                {
                    // insertS is largest
                    *bestRow = insertS;

                    // Calculate new Ix
                    *insertQrow = maximum(match - parameters_openGap,
                                          *insertQrow - parameters_extendGap);

                    // Calculate new Iy
	                insertS -= parameters_extendGap;

                }
                else
                {
                    // insertQ is largest
                    *bestRow = *insertQrow;

                    // Calculate new Ix
                    *insertQrow -= parameters_extendGap;

                    // Calculate new Iy
                    insertS = maximum(match - parameters_openGap,
                                      insertS - parameters_extendGap);
                }
            }

            // If score at current cell (and cells to its right) are below dropoff
            if (rightOfDropoff)
            {
                if (bestScore > *bestRow + dropoff)
                {
                    // Record dropoff position
                    columnDropoff = subjectPosition;
                }
                else
                {
                    // We are left of the column dropoff for this row
                    rightOfDropoff = 0;
                }
            }

            subjectPosition--; bestRow--; insertQrow--;
        }

        // -----CELLS LEFT OF ROW DROPOFF -----
        // Only extend past the old row dropoff if the last processed cell is
        // itself within dropoff
        if (!(bestScore > *(bestRow + 1) + dropoff))
        {
            while (subjectPosition >= subject)
            {
                // Set value for Iy and best
                *bestRow = insertS;
                insertS = insertS - parameters_extendGap;

                // Set DUMMY values for Ix
                *insertQrow = constants_gappedExtensionDummyValue;

                // If score at current cell is below dropoff
                if (bestScore > *bestRow + dropoff)
                {
                    // Stop processing row
                    subjectPosition--;
                    break;
                }

                subjectPosition--; bestRow--; insertQrow--;
                // NOTE(review): subjectDistance is only read in the first-row
                // loop above, so this increment appears to have no effect
                subjectDistance++;
            }
        }

        // Record dropoff position: the last subject position processed in this row
        rowDropoff = subjectPosition + 1;

        // Clear insertS for next row
        insertS = constants_gappedExtensionDummyValue;

        // Disabled debugging aid:
//    if (dloc == 20877970)
//        print(oldGappedScoring_bestRow, subject, rowDropoff, columnDropoff);
    }

    // Report the best cell as offsets from the start of the matrix and subject
    dpResults.best.queryOffset = bestQueryPosition - PSSMatrix.matrix;
    dpResults.best.subjectOffset = bestSubjectPosition - subject;
    dpResults.bestScore = bestScore;
    dpResults.traceback = NULL;
    return dpResults;
}
Ejemplo n.º 28
0
// Extend an ungapped hit into a gapped alignment. Dynamic programming is run
// backwards and then forwards from the ungapped extension's seed, the two
// tracebacks are joined, and the combined trace and nominal score are stored
// in a newly allocated gapped extension.
//
//   ungappedExtension  supplies the seed; its start/end offsets and nominal
//                      score are overwritten with the gapped result
//   PSSMatrix          query PSSM, passed by value and narrowed to one strand
//   subjectSize        length of the subject
//   subject            subject pointer; only used to derive a seed-relative
//                      pointer that is not passed on to any helper
//   unpackRegion       handed to both DP passes; its unpackedSubject is read
//                      for the residue at the seed
//   dropoff            score dropoff handed to both DP passes
//
// Returns a gappedExtension obtained from global_malloc, with next == NULL.
struct gappedExtension *
gappedExtension_build(struct ungappedExtension *ungappedExtension,
                      struct PSSMatrix PSSMatrix, int4 subjectSize,
                      unsigned char *subject, struct unpackRegion *unpackRegion,
                      int4 dropoff) {
  struct coordinate anchor;
  struct PSSMatrix strandMatrix, tailMatrix;
  struct dpResults headDp, tailDp;
  struct trace headTrace, tailTrace, fullTrace;
  struct gappedExtension *result;
  unsigned char *subjectAtSeed;
  int4 tailSubjectSize;
  int4 queryShift;

  anchor = ungappedExtension->seed;

  // Restrict the PSSM to the strand that contains the seed
  if (anchor.queryOffset <= PSSMatrix.strandLength) {
    // Seed lies in the first strand: simply cut the matrix off after it
    queryShift = 0;
    strandMatrix = PSSMatrix;
    strandMatrix.length = PSSMatrix.strandLength;
  } else {
    // Seed lies in the second strand: drop the first strand and make the
    // seed's query offset relative to the start of the second
    queryShift = PSSMatrix.strandLength;
    anchor.queryOffset -= PSSMatrix.strandLength;
    strandMatrix = PSSMatrix_chop(PSSMatrix, PSSMatrix.strandLength);
  }

  // First half of the alignment: everything before the seed
  headDp = gappedExtension_dpBeforeSeed(strandMatrix, dropoff, anchor,
                                        unpackRegion);
  headTrace = gappedExtension_traceBeforeSeed(headDp, anchor);

  // Second half: query and subject both re-based so they begin at the seed
  tailMatrix = PSSMatrix_chop(strandMatrix, anchor.queryOffset);
  subjectAtSeed = subject + anchor.subjectOffset;
  (void)subjectAtSeed; // NOTE(review): computed but never consumed here
  tailSubjectSize = subjectSize - anchor.subjectOffset;

  tailDp = gappedExtension_dpAfterSeed(tailMatrix, dropoff, unpackRegion,
                                       tailSubjectSize, anchor.subjectOffset);
  tailTrace = gappedExtension_traceAfterSeed(tailDp, tailMatrix.length);

  // Append the after-seed trace to the before-seed trace; the after-seed
  // trace codes are released once joined
  fullTrace = gappedExtension_joinTraces(headTrace, tailTrace);
  free(tailTrace.traceCodes);

  // Translate query coordinates back if we worked in the second strand.
  // NOTE(review): the two best.queryOffset values are not read again below.
  tailDp.best.queryOffset += queryShift;
  headDp.best.queryOffset += queryShift;
  fullTrace.queryStart += queryShift;

  // Assemble the gapped extension
  result =
      (struct gappedExtension *)global_malloc(sizeof(struct gappedExtension));
  result->next = NULL;
  result->trace = fullTrace;

  // The after-seed trace starts where the extension ends; it is relative to
  // the seed, so add the seed position (and strand shift) back on
  result->subjectEnd = anchor.subjectOffset + tailTrace.subjectStart;
  result->queryEnd = anchor.queryOffset + tailTrace.queryStart + queryShift;

  // Score = before-seed score + after-seed score + match score at the seed
  result->nominalScore =
      headDp.bestScore + tailDp.bestScore +
      tailMatrix.matrix[0][unpackRegion->unpackedSubject[anchor.subjectOffset]];

  // Mirror the gapped result into the originating ungapped extension
  ungappedExtension->nominalScore = result->nominalScore;
  ungappedExtension->start.queryOffset = fullTrace.queryStart;
  ungappedExtension->start.subjectOffset = fullTrace.subjectStart;
  ungappedExtension->end.queryOffset = result->queryEnd;
  ungappedExtension->end.subjectOffset = result->subjectEnd;

#ifdef VERBOSE
  if (parameters_verboseDloc == blast_dloc) {
    printf("Gapped Extension from %d,%d to %d,%d score %d\n",
           fullTrace.queryStart, fullTrace.subjectStart, result->queryEnd,
           result->subjectEnd, result->nominalScore);
  }
#endif

  return result;
}
Ejemplo n.º 29
0
// Process a given query position list: record that `codeword` refers to the
// query positions in `queryPositions`, reusing an existing list where possible.
//
//   queryPositions     positions to register; the subset scan relies on them
//                      being in ascending order (it gives up on an entry as
//                      soon as it meets a larger value)
//   numQueryPositions  number of entries in queryPositions
//   codeword           codeword to attach
//
// Existing lists are visited in array order and one of three things happens:
//   1. While scanning a list, its cursor (list->currentEntry) equals
//      numQueryPositions and every entry visited so far was found in the new
//      positions: the codeword is pushed onto the codeword chain of the entry
//      just visited and the function returns.
//   2. Every entry of a list was found in the new positions: the positions the
//      list lacks are appended, the codeword is attached to the list's last
//      entry, and the lists are re-sorted with qPosList_compareList.
//   3. No list qualifies: a new list is started in the next slot of
//      qPosList_qPosLists, filled with the positions in reverse order, given
//      the codeword on its last entry, and the lists are re-sorted.
//
// NOTE(review): with numQueryPositions == 0 the copy loop in case 3 never
// assigns queryPosition, so the final store goes through whatever pointer the
// scan left behind (possibly NULL). Callers are assumed to pass at least one
// position -- confirm.
// NOTE(review): no capacity check on qPosList_qPosLists is visible here.
void qPosList_processList(int2* queryPositions, int2 numQueryPositions, int4 codeword)
{
    int4 listCount = 0, queryPositionCount, subset, present;
    struct memSingleBlock* list;
    struct queryPosition* queryPosition = NULL;
    struct codeword* newCodeword;

    // Iterate through existing query position lists (ordered from longest to shortest)
    while (listCount < qPosList_numQPosLists)
    {
        // Check for one whose entries are a subset of the to-be-added query positions
        list = qPosList_qPosLists + listCount;

        // Start by assuming it is
        subset = 1;

        // Iterate through each query position in the current existing list.
        // The cursor is advanced before `subset` is tested, so one extra entry
        // is fetched after a mismatch; that is harmless because the list is
        // abandoned straight afterwards.
        memSingleBlock_resetCurrent(list);
        while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset)
        {
            // Iterate through each query position in the new list (which is sorted)
            queryPositionCount = 0;
            while (queryPositionCount < numQueryPositions)
            {
                // Found a match, break out and proceed to next position in current list
                if (queryPosition->queryPosition == queryPositions[queryPositionCount])
                {
                    break;
                }
                // The new list has already moved past this value, so the entry is
                // absent from it and the existing list is not a subset of the new one
                else if (queryPosition->queryPosition < queryPositions[queryPositionCount])
                {
                    subset = 0;
                    break;
                }
                // Otherwise keep going
                queryPositionCount++;
            }

            // If we got to the end of the new list without finding a match, not a subset
            if (queryPositionCount == numQueryPositions)
                subset = 0;

            // If the entries of the existing list processed so far account for all
            // of the positions in the new list
            if (list->currentEntry == numQueryPositions && subset)
            {
                // We have a match, starting here: push the codeword onto the front
                // of this entry's codeword chain
                newCodeword = global_malloc(sizeof(struct codeword));
                newCodeword->codeword = codeword;
                newCodeword->next = queryPosition->codewords;
                queryPosition->codewords = newCodeword;

                return;
            }
        }

        if (subset)
        {
            // This existing list is a subset of the new list, so add the
            // new/additional query positions to the end of it.
            // The value fetched here is overwritten by the scan below; the call
            // is kept in case the helper has side effects.
            queryPosition = memSingleBlock_getLastEntry(list);

            // Iterate through each query position in the new list, last to first
            while (numQueryPositions > 0)
            {
                numQueryPositions--;
                present = 0;

                // Check if present in the existing list
                memSingleBlock_resetCurrent(list);
                while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL)
                {
                    // Found it
                    if (queryPosition->queryPosition == queryPositions[numQueryPositions])
                    {
                        present = 1;
                        break;
                    }
                }

                // Not present - add to the existing list with a null reference codeword
                if (!present)
                {
                    queryPosition = memSingleBlock_newEntry(list);
                    queryPosition->queryPosition = queryPositions[numQueryPositions];
                    // No referring codeword for any of the positions except the last
                    queryPosition->codewords = NULL;
                }
            }

            // Get the last, new query position
            queryPosition = memSingleBlock_getLastEntry(list);

            // Add reference codeword to the last query position (will become first)
            newCodeword = global_malloc(sizeof(struct codeword));
            newCodeword->next = NULL;
            newCodeword->codeword = codeword;
            queryPosition->codewords = newCodeword;

            // Re-sort the lists of query positions from longest to shortest
            qsort(qPosList_qPosLists, qPosList_numQPosLists,
                  sizeof(struct memSingleBlock), qPosList_compareList);

            return;
        }

        listCount++;
    }

    // No existing list could be reused; instead use a new list of query positions
    list = qPosList_qPosLists + qPosList_numQPosLists;
    list->numEntries = 0;
    qPosList_numQPosLists++;

    // And copy values into it, last to first
    while (numQueryPositions > 0)
    {
        numQueryPositions--;
        queryPosition = memSingleBlock_newEntry(list);
        queryPosition->queryPosition = queryPositions[numQueryPositions];
        // No referring codeword for any of the positions except the last
        queryPosition->codewords = NULL;
    }

    // Reference at the last query position (will become the first) to the
    // new query position list's codeword
    newCodeword = global_malloc(sizeof(struct codeword));
    newCodeword->next = NULL;
    newCodeword->codeword = codeword;
    queryPosition->codewords = newCodeword;

    // Sort the lists from longest to shortest
    qsort(qPosList_qPosLists, qPosList_numQPosLists,
          sizeof(struct memSingleBlock), qPosList_compareList);
}
Ejemplo n.º 30
0
// chooseWilds entry point. Reads a database, counts how often each wildcard
// set occurs in protein sequence clusters, greedily builds a list of wildcard
// candidates, refines it by hill climbing, and writes the result out.
//
//   argv[1]  database name, passed to readdb_open
//   argv[2]  wildcard score constant, parsed with atof
//   argv[3]  wildcards output file, passed to wildcards_outputWildcards
//
// Returns 0 on completion; calls exit(-1) after printing a usage message if
// fewer than three arguments are supplied.
int4 main(int4 argc, char *argv[]) {
    unsigned char *filename, *sequence, code, *wildcardsFilename;
    uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
    uint4 encodedLength, numChildren, count;
    struct child *children;
    uint4 candidateNum, change, childNum;
    uint4 numWilds = 0;
    struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
    struct wild *wildSubset, *newCandidates, *bestNewCandidates;
    uint4 sizeWildSubset, numOccurences, numCandidates;
    float defaultWildscore, candidatesScore, bestScore;
    size_t candidateBytes;

    // User must provide FASTA format file at command line
    if (argc < 4) {
        fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                "<Wildcards output file>\n");
        exit(-1);
    }
    filename = (unsigned char *)argv[1];
    wildcards_scoringConstant = atof(argv[2]);
    wildcardsFilename = (unsigned char *)argv[3];

    readdb_open(filename);

    printf("Number of clusters = %u\n", readdb_numberOfClusters);
    printf("Number of sequences = %u\n", readdb_numberOfSequences);
    printf("Number of volumes = %u\n", readdb_numberOfVolumes);
    printf("Total number of letters = %llu\n", readdb_numberOfLetters);
    printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
    printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

    // Initialize codes array
    encoding_initialize(readdb_dbAlphabetType);

    // Load score matrix
    parameters_findScoringMatrix();
    wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

    // Count occurences of each wildcard set
    wildcards_initializeCountOccurences(readdb_longestSequenceLength);
    do {
        // Read each sequence in the collection
        while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                                   &descriptionLength, &encodedLength)) {
            // If a protein sequence cluster (encoded length differs from the
            // plain sequence length plus two)
            if (encoding_alphabetType == encoding_protein &&
                    sequenceLength + 2 != encodedLength) {
                // Get the children
                children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                              descriptionStart, &numChildren);

                // Add to list of occurences
                wildcards_countOccurences(children, numChildren, sequenceLength);

                // Release each child. The sequence is freed from one byte before
                // its start -- presumably readdb_getChildren allocates it with a
                // leading byte; confirm against that function.
                childNum = 0;
                while (childNum < numChildren) {
                    free(children[childNum].edits);
                    free(children[childNum].sequence - 1);
                    childNum++;
                }

                free(children);
            }
        }
    } while (readdb_nextVolume());

    // Get final list of number of occurences of each wild
    wilds = wildcards_getOccurences(&numWilds);

    chooseWilds_printOccurenceMatrix(wilds, numWilds);

    // Build default wildcard: one bit set for every letter code
    defaultWild.code = 0;
    defaultWild.count = 0;
    code = 0;
    while (code < encoding_numLetters) {
        setbit(defaultWild.code, code);
        code++;
    }

    // Get average score for default wildcard
    wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds,
                                     &sizeWildSubset, &numOccurences);
    defaultWildscore = wildcards_averageResidueWildMatch(defaultWild, wildSubset,
                       sizeWildSubset);
    printf("defaultWildScore=%f occurences=%u\n", defaultWildscore,
           numOccurences);

    // Build up list of wildcard candidates; the final slot is reserved for the
    // default wildcard, which is added at the very end
    candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                 wildcards_numClusterWildcards);

    // Defined fallback in case no wildcard scores above zero in the first
    // round; previously bestNewCandidate would have been read uninitialized
    bestNewCandidate = defaultWild;

    numCandidates = 0;
    while (numCandidates < wildcards_numClusterWildcards - 1) {
        // Explore each possible option to add to list of candidates
        count = 0;
        bestScore = 0;
        while (count < numWilds) {
            // Tentatively place this wildcard in the next free position
            candidates[numCandidates] = wilds[count];

            // Score a set of candidates
            candidatesScore = wildcards_scoreCandidates(
                                  candidates, numCandidates + 1, wilds, numWilds, defaultWildscore);
            if (candidatesScore > bestScore) {
                bestScore = candidatesScore;
                bestNewCandidate = wilds[count];
            }

            count++;
        }

        printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
        wildcards_printWildcard(bestNewCandidate.code);
        candidates[numCandidates] = bestNewCandidate;

        numCandidates++;
    }

    newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                    wildcards_numClusterWildcards);
    bestNewCandidates = (struct wild *)global_malloc(
                            sizeof(struct wild) * wildcards_numClusterWildcards);

    // Bytes occupied by the candidates chosen so far. The original size
    // expression, sizeof(struct wild) * wildcards_numClusterWildcards - 1,
    // subtracted one byte rather than one entry because of operator precedence.
    candidateBytes = sizeof(struct wild) * numCandidates;

    // Perform hill climbing; consider changing each position
    change = 1;
    while (change) {
        change = 0;
        candidateNum = 0;
        bestScore = 0;
        while (candidateNum < numCandidates) {
            // Start with current candidates
            memcpy(newCandidates, candidates, candidateBytes);

            // Change current position to every possible candidate
            count = 0;
            while (count < numWilds) {
                newCandidates[candidateNum] = wilds[count];

                // Score a possible new set of candidates
                candidatesScore = wildcards_scoreCandidates(
                                      newCandidates, numCandidates, wilds, numWilds, defaultWildscore);

                // Check if best new candidates
                if (candidatesScore > bestScore) {
                    bestScore = candidatesScore;
                    memcpy(bestNewCandidates, newCandidates, candidateBytes);
                }

                count++;
            }

            candidateNum++;
        }

        // Update candidates only if the best single-position change beats the
        // current set; otherwise the loop terminates
        if (bestScore > wildcards_scoreCandidates(candidates, numCandidates, wilds,
                numWilds, defaultWildscore)) {
            printf("New bestScore=%f\n", bestScore);
            memcpy(candidates, bestNewCandidates, candidateBytes);
            change = 1;
        }

        candidateNum = 0;
        while (candidateNum < numCandidates) {
            wildcards_printWildcard(candidates[candidateNum].code);
            candidateNum++;
        }
    }

    // Print out final set of clusters with default wild added. The score is
    // recomputed for the full set and its return value discarded.
    candidates[numCandidates] = defaultWild;
    numCandidates++;
    wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                              defaultWildscore);
    wildcards_outputWildcards(wildcardsFilename);

    printf("%u sequences read.\n", readdb_numberOfSequences);
    fflush(stdout);

    free(candidates);
    free(newCandidates);
    free(bestNewCandidates);

    return 0;
}