Beispiel #1
0
// Concatenate the trace recorded before the seed with the one recorded after.
//
// The joined trace begins at beforeTrace's query/subject start and its length
// is the sum of the two input lengths. The buffer behind
// beforeTrace.traceCodes is resized with global_realloc and becomes the joined
// buffer, so callers should use the returned traceCodes pointer from then on.
// The codes of afterTrace are only read; they are appended back-to-front.
struct trace gappedExtension_joinTraces(struct trace beforeTrace,
                                        struct trace afterTrace) {
  struct trace joinedTrace;
  unsigned char *joinedCodes;
  int4 src, dst;

  // Start coordinates are inherited from the leading trace
  joinedTrace.queryStart = beforeTrace.queryStart;
  joinedTrace.subjectStart = beforeTrace.subjectStart;
  joinedTrace.length = beforeTrace.length + afterTrace.length;

  // Grow the leading trace's buffer so it can hold both sets of codes
  joinedCodes = (unsigned char *)global_realloc(
      beforeTrace.traceCodes, sizeof(unsigned char) * joinedTrace.length);

  // Read afterTrace from its last code to its first while writing forwards
  // directly behind the codes of beforeTrace
  for (src = afterTrace.length - 1, dst = beforeTrace.length; src >= 0;
       src--, dst++) {
    joinedCodes[dst] = afterTrace.traceCodes[src];
  }

  joinedTrace.traceCodes = joinedCodes;
  return joinedTrace;
}
Beispiel #2
0
// Record the position of every word of one subject sequence in the lookup
// table slice that belongs to a block.
//
//   sequenceOffset  value stored as seqId with every recorded position
//   sequence        subject codes; positions 0 .. subjectLength - 1 are read
//   subjectLength   number of codes in sequence
//   wordLength      number of codes per word
//   blockNum        selects the slice starting at
//                   proteinLookup_db + blockNum * proteinLookup_numWords
//
// For each window start the codeword is built in base wordLookupDFA_numCodes;
// a code that is >= wordLookupDFA_numCodes adds nothing to the codeword. The
// entry for that codeword then receives a (subOff, seqId) pair, with its
// position array grown to 10 entries first and doubled afterwards.
void proteinLookup_db_sub(uint4 sequenceOffset, unsigned char *sequence,
        int4 subjectLength, int4 wordLength, int blockNum) {

    uint4 codeword, codeCount, queryPosition, numWindows;
    struct initialWord_protein_db *initialLookup, *initialWord;

    // A subject shorter than one word contains no words. Without this guard
    // the signed window count goes negative and, once compared against the
    // unsigned loop counter, turns into a huge bound that walks far past the
    // end of the sequence.
    if (subjectLength < wordLength)
        return;

    numWindows = (uint4)(subjectLength - wordLength + 1);

    initialLookup = proteinLookup_db + blockNum * proteinLookup_numWords;

    // Slide a word-sized window across the subject
    for (queryPosition = 0; queryPosition < numWindows; queryPosition++)
    {
        // Build the codeword for the window starting at queryPosition
        codeword = 0;
        for (codeCount = 0; codeCount < wordLength; codeCount++) {
            codeword *= wordLookupDFA_numCodes;
            if (sequence[queryPosition + codeCount] < wordLookupDFA_numCodes)
                codeword += sequence[queryPosition + codeCount];
        }

        initialWord = initialLookup + codeword;

        // Make room for one more position, growing the array when it is full
        initialWord->numSubPositions++;
        if (initialWord->numSubPositions > initialWord->allocSubPositions) {
            initialWord->allocSubPositions = initialWord->allocSubPositions == 0
                ? 10
                : 2 * initialWord->allocSubPositions;
            initialWord->subSequencePositions = (subPos_t *)global_realloc(
                    initialWord->subSequencePositions,
                    sizeof(subPos_t) * initialWord->allocSubPositions);
        }

        // Store where in which sequence this word occurred
        initialWord->subSequencePositions[initialWord->numSubPositions - 1].subOff = queryPosition;
        initialWord->subSequencePositions[initialWord->numSubPositions - 1].seqId = sequenceOffset;
    }
}
Beispiel #3
0
// Reserve numNewEntries consecutive entries and return the address of the
// first one.
//
// If the most recent block cannot take the whole run, a fresh block with room
// for blockSizes entries is allocated and registered (the blocks and
// numEntries arrays double in capacity when they are full) and the run is
// placed at the start of that block. The entry count of the block in use and
// numTotalEntries are both advanced by numNewEntries.
//
// NOTE(review): a fresh block always holds blockSizes entries, so this looks
// like it relies on numNewEntries <= blockSizes - confirm against callers.
void *memBlocks_newEntries(struct memBlocks *memBlocks, uint4 numNewEntries) {
  int4 *usedInBlock;
  void *firstEntry;

  // Does the run still fit behind the entries already in the latest block?
  if (memBlocks->numEntries[memBlocks->numBlocks - 1] + numNewEntries >
      memBlocks->blockSizes) {
    // No: start a brand new block
    memBlocks->lastBlock =
        (void *)global_malloc(memBlocks->entrySize * memBlocks->blockSizes);

    // Double the bookkeeping arrays once every slot is taken
    if (memBlocks->numBlocks >= memBlocks->maxNumBlocks) {
      memBlocks->maxNumBlocks *= 2;
      memBlocks->blocks = (void **)global_realloc(
          memBlocks->blocks, sizeof(void *) * memBlocks->maxNumBlocks);
      memBlocks->numEntries = (int4 *)global_realloc(
          memBlocks->numEntries, sizeof(int4) * memBlocks->maxNumBlocks);
    }

    // Register the new, still empty block as the latest one
    memBlocks->blocks[memBlocks->numBlocks] = memBlocks->lastBlock;
    memBlocks->numEntries[memBlocks->numBlocks] = 0;
    memBlocks->numBlocks++;
  }

  // Hand out the slots right after the entries already used in the latest
  // block, then account for them
  usedInBlock = memBlocks->numEntries + (memBlocks->numBlocks - 1);
  firstEntry =
      ((char *)(memBlocks->lastBlock)) + *usedInBlock * memBlocks->entrySize;

  *usedInBlock += numNewEntries;
  memBlocks->numTotalEntries += numNewEntries;

  return firstEntry;
}
Beispiel #4
0
// Append one occurrence of a word to that word's encoded position list.
//
// Two values are written through vbyte_putVbyte into the scratch buffer
// index_vbyteEncoded: first the gap between subjectNumber and the sequence
// number of the word's previous occurrence, then either the gap to the
// previous offset (same sequence) or the absolute offset (different
// sequence). The encoded bytes are copied to the end of the word's list,
// which is enlarged first when it is too small.
void index_addWord(uint4 codeword, uint4 subjectNumber, uint4 offset)
{
    struct wordList* entry = index_words + codeword;
    // Write cursor into the scratch buffer; the size computation below relies
    // on vbyte_putVbyte advancing it past the bytes it writes
    unsigned char* cursor = index_vbyteEncoded;
    uint4 seqDelta, offDelta, encodedBytes;

    // Distance from the sequence of the previous occurrence
    seqDelta = subjectNumber - entry->lastSequenceNumber;
    vbyte_putVbyte(cursor, seqDelta);

    // Same sequence as last time: store the offset difference.
    // Different sequence: store the offset itself.
    offDelta = (seqDelta == 0) ? offset - entry->lastOffset : offset;
    vbyte_putVbyte(cursor, offDelta);

    encodedBytes = cursor - index_vbyteEncoded;

    // Enlarge the list when the new bytes would reach its allocated size
    if (entry->length + encodedBytes >= entry->allocated)
    {
        entry->allocated += encodedBytes + 10;
        entry->offsets = (unsigned char*)global_realloc(entry->offsets,
                         sizeof(char) * entry->allocated);
    }

    // Append the encoded pair and remember this occurrence for the next call
    memcpy(entry->offsets + entry->length, index_vbyteEncoded, encodedBytes);
    entry->length += encodedBytes;
    entry->lastSequenceNumber = subjectNumber;
    entry->lastOffset = offset;
}
Beispiel #5
0
// Grow the unpacked region towards the end of the subject when position lies
// beyond the region's current end.
//
// The end moves forward by constants_unpackRegionExtend, capped at
// subjectLength. unpackedSubject is kept rebased so that it is indexed by
// subject offset: startOffset is added back before the buffer is handed to
// global_realloc and subtracted again from the result. Unpacking resumes at
// the old end rounded down to a multiple of 4.
void unpack_extendRegionEnd(int4 position, struct unpackRegion *unpackRegion) {
  unsigned char *grownSubject;
  int4 regionStart, regionEnd;

  // Position is already covered by the region: nothing to extend
  if (position <= unpackRegion->endOffset)
    return;

  // Work out the extended bounds, never running past the subject itself
  regionStart = unpackRegion->startOffset;
  regionEnd = unpackRegion->endOffset + constants_unpackRegionExtend;
  if (regionEnd > unpackRegion->subjectLength)
    regionEnd = unpackRegion->subjectLength;

  // Undo the rebasing so the allocator sees the real buffer address, resize
  // it, then rebase the result again
  unpackRegion->unpackedSubject += unpackRegion->startOffset;
  grownSubject = (unsigned char *)global_realloc(
      unpackRegion->unpackedSubject, sizeof(char) * (regionEnd - regionStart));
  grownSubject -= regionStart;

  // Round the old end down to a multiple of 4 so that unpacking restarts at
  // index endOffset / 4 of the packed subject
  unpackRegion->endOffset = (unpackRegion->endOffset / 4) * 4;

  // Fill in everything from the rounded old end up to the new end
  encoding_byteUnpackRegion(grownSubject + unpackRegion->endOffset,
                            unpackRegion->subject +
                                (unpackRegion->endOffset / 4),
                            regionEnd - unpackRegion->endOffset);

  // Publish the resized buffer and the new end of the region
  unpackRegion->unpackedSubject = grownSubject;
  unpackRegion->endOffset = regionEnd;
}
Beispiel #6
0
// Entry point of formatdb: converts a FASTA file into the formatted database.
//
// argv[1] names the FASTA file. The alphabet type is detected from the file,
// every sequence is read and encoded, and for nucleotide input the wildcards
// are replaced and recorded as (code byte, vbyte position) pairs that are
// handed to writedb_addSequence together with the sequence. A summary is
// printed at the end. Exits with -1 when no file name is given; returns 0
// otherwise.
int4 main(int argc, char* argv[])
{
    // Worst-case encoded size of one wildcard edit: 1 code byte plus the
    // vbyte-encoded position. Assuming vbyte_putVbyte emits 7 payload bits
    // per byte and positions are at most 32 bits wide, a position can take 5
    // bytes, so 6 bytes per edit are reserved (5 was too small for
    // positions >= 2^28).
    enum { maxBytesPerWildcardEdit = 6 };

    char *sequence, *filename;
    uint4 sequenceLength;
    int4 totalWilds = 0, alphabetType;
    struct memSingleBlock* wildcardEdits;
    struct wildcardEdit* wildcardEdit;
    char *wildcardData = NULL, *startWildcardData = NULL;

    // User must provide FASTA format file at command line
    if (argc < 2)
    {
        fprintf(stderr, "Useage: formatdb <FASTA file>\n");
        exit(-1);
    }
    filename = argv[1];

    // Initialize array to store wildcard edits
    wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10);

    // Determine if database is protein or nucleotide
    alphabetType = determineDbAlphabetType(filename);

    if (alphabetType == encoding_protein)
    {
        printf("PROTEIN database detected.\n");
    }
    else if (alphabetType == encoding_nucleotide)
    {
        printf("NUCLEOTIDE database detected.\n");
    }

    // Initialize codes array
    encoding_initialize(alphabetType);

    // Initialize writing to formatted database
    writedb_initialize(filename, alphabetType);

    // Open FASTA file for reading
    readFasta_open(filename);

    printf("Formatting database...");
    fflush(stdout);

    // Move through the FASTA file reading descriptions and sequences
    while (readFasta_readSequence())
    {
        // Get sequence just read
        sequence = readFasta_sequenceBuffer;
        sequenceLength = readFasta_sequenceLength;

        // Encode the sequence
        encoding_encodeSequence(sequence, sequenceLength, alphabetType);

        if (alphabetType == encoding_nucleotide)
        {
            // Replace any wilds and collect them as edits in wildcardEdits
            totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength);

            // Reuse one buffer across sequences, resized to the worst-case
            // encoded size of the current list of edits
            startWildcardData = global_realloc(startWildcardData,
                                               sizeof(char) * wildcardEdits->numEntries
                                               * maxBytesPerWildcardEdit);
            wildcardData = startWildcardData;

            // For each wildcard edit, encode details using chars and vbytes
            memSingleBlock_resetCurrent(wildcardEdits);
            while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL)
            {
                // Record wild character
                *wildcardData = wildcardEdit->code;
                wildcardData++;

                // Append the position as a vbyte; the length passed to
                // writedb_addSequence below relies on this call advancing
                // wildcardData past the bytes it writes
                vbyte_putVbyte(wildcardData, wildcardEdit->position);
            }
        }
        else
        {
            // No wildcard data is produced for other alphabets
            startWildcardData = wildcardData = NULL;
        }

        // Add sequence to the formatted collection; the wildcard data length
        // is the distance the write pointer travelled from the buffer start
        writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer,
                            readFasta_descriptionLength, startWildcardData,
                            wildcardData - startWildcardData, NULL, 0);

        // Print status dots
        if (writedb_sequenceCount % 10000 == 0)
        {
            printf(".");
            fflush(stdout);
        }
    }

    // Close fasta reader
    readFasta_close();

    // Finalize writing to the formatted collection
    writedb_close();

    printf("done.\n");
    printf("%d sequences processed.\n", writedb_sequenceCount);
    printf("%llu letters processed.\n", writedb_numberOfLetters);
    printf("%d wildcards encoded.\n", totalWilds);
    printf("%d volume(s) created.\n", writedb_volume + 1);
    printf("Longest/shortest sequence was %d/%d letters\n",
           writedb_maximumSequenceLength, writedb_minimumSequenceLength);
    fflush(stdout);

    return 0;
}