Ejemplo n.º 1
0
// Print the contents of the index to stdout.
//
// For every codeword in [0, index_numWords) whose offset list is non-empty,
// prints "Codeword=<n>:" followed by the decoded offsets. The list stores
// gaps between successive offsets, so a running sum rebuilds each absolute
// offset. Finally prints the total size in bytes of all offset lists.
void index_print()
{
    uint4 codeword;
    uint4 totalSize = 0;

    for (codeword = 0; codeword < index_numWords; codeword++)
    {
        struct wordList* wordList = index_words + codeword;
        unsigned char* offsets = wordList->offsets;
        unsigned char* endOffsets = offsets + wordList->length;
        uint4 offsetGap;
        uint4 offset = 0;

        totalSize += wordList->length;

        // Only emit a header for codewords that actually have entries
        if (offsets < endOffsets)
            printf("\nCodeword=%u:", codeword);

        // NOTE(review): vbyte_getVbyte presumably advances `offsets` past the
        // value it decodes (the loop relies on this to terminate) - confirm
        // against its definition.
        while (offsets < endOffsets)
        {
            vbyte_getVbyte(offsets, (&offsetGap));
            offset += offsetGap;
            printf(" %u", offset);
        }
    }

    // totalSize is unsigned: print it with %u so large totals are not shown
    // as negative numbers
    printf("\nTotal table size=%u bytes\n", totalSize);
}
Ejemplo n.º 2
0
// Unpack entire or sections of a subject sequence before gapped alignment.
//
// On return alignment->unpackRegions points to the region(s) describing the
// unpacked data and alignment->numUnpackRegions holds their count. Each
// region's unpackedSubject pointer is meant to be indexed by absolute subject
// position (see the pointer bias applied below).
void unpack_unpackSubject(struct PSSMatrix PSSMatrix,
                          struct alignment *alignment) {
  unsigned char *subject, *unpackedSubject, wildcard, *edits, *endEdits;
  uint4 wildcardPosition;
  struct unpackRegion *firstRegion = NULL, *lastRegion, *currentRegion,
                      *unpackRegion;
  int4 regionStart, regionEnd, numRegions;

  // No need to unpack a protein subject, or already unpacked nucleotide subject
  if (parameters_ssearch || encoding_alphabetType == encoding_protein) {
    // Just create a single region covering the entire sequence; packed and
    // unpacked pointers both refer to the alignment's own subject buffer
    firstRegion = memBlocks_newEntry(unpack_unpackRegions);
    firstRegion->startOffset = 0;
    firstRegion->endOffset = alignment->subjectLength;
    firstRegion->subject = alignment->subject;
    firstRegion->unpackedSubject = alignment->subject;
    firstRegion->subjectLength = alignment->subjectLength;
    alignment->unpackRegions = firstRegion;
    alignment->numUnpackRegions = 1;
    return;
  }

  // Get the subject regions for this alignment.
  // NOTE(review): the pointer arithmetic below assumes unpack_getRegions
  // appended numRegions contiguous entries ending at the last entry of
  // unpack_unpackRegions - confirm against memBlocks/unpack_getRegions.
  numRegions = unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions);
  lastRegion = memBlocks_getLastEntry(unpack_unpackRegions);
  // Make lastRegion a one-past-the-end pointer
  lastRegion++;
  firstRegion = lastRegion - numRegions;

  // Sort the regions in order of start position
  qsort(firstRegion, lastRegion - firstRegion, sizeof(struct unpackRegion),
        unpack_compareUnpackRegions);

  // Unpack each region
  currentRegion = firstRegion;
  while (currentRegion < lastRegion) {
    regionEnd = currentRegion->endOffset;
    regionStart = currentRegion->startOffset;

#ifdef VERBOSE
    if (parameters_verboseDloc == alignment->descriptionLocation) {
      printf("Unpack subject region %d to %d (length=%d)\n", regionStart,
             regionEnd, alignment->subjectLength);
      fflush(stdout);
    }
#endif

    // Get the subject region to be unpacked: the alignment's own packed
    // subject if nothing has been unpacked before, otherwise the packed
    // subject recorded in the existing region selected for this start offset
    if (alignment->unpackRegions == NULL) {
      subject = alignment->subject;
    } else {
      unpackRegion = unpack_selectRegion(
          alignment->unpackRegions, alignment->numUnpackRegions, regionStart);
      subject = unpackRegion->subject;
    }

    // Declare memory for the region (one byte per unpacked position)
    unpackedSubject = (unsigned char *)global_malloc(sizeof(char) *
                                                     (regionEnd - regionStart));

    // Unpack the region of interest.
    // NOTE(review): regionStart / 4 implies four positions per packed byte
    // and presumably a regionStart that is a multiple of 4 - confirm.
    encoding_byteUnpackRegion(unpackedSubject, subject + (regionStart / 4),
                              regionEnd - regionStart);
    // Bias the pointer so it can be indexed by absolute subject position
    // (unpackedSubject[regionStart] is the first allocated byte). The biased
    // pointer is not the address returned by global_malloc.
    unpackedSubject -= regionStart;
    currentRegion->unpackedSubject = unpackedSubject;

    currentRegion->subject = subject;
    currentRegion->subjectLength = alignment->subjectLength;

    blast_totalUnpacked += (regionEnd - regionStart);

    currentRegion++;
  }

  currentRegion = firstRegion;

  // Get wildcard edits for the sequence. The span runs from alignment->edits
  // to alignment->edits + encodedLength - ceil(subjectLength / 4).
  // NOTE(review): presumably the edits follow the packed sequence bytes
  // inside the encoded record - confirm against the encoder.
  edits = alignment->edits;
  endEdits = alignment->edits + alignment->encodedLength -
             ((alignment->subjectLength + 3) / 4);

  // If there are edits
  if (edits < endEdits) {
    // Read first wildcard
    wildcard = *edits;
    edits++;

    // Read its position (vbyte_getVbyte presumably advances edits - confirm)
    vbyte_getVbyte(edits, &wildcardPosition);

    // For each region in order of position in the subject.
    // NOTE(review): wildcard/wildcardPosition always hold an edit that has
    // been read but not yet applied, while both inner loops test
    // edits < endEdits. Once the final edit has been read (edits == endEdits)
    // neither loop body runs again, so that final edit appears never to be
    // written - confirm whether the encoding accounts for this.
    while (currentRegion < lastRegion) {
      // Skip past edits that are before current region
      while (edits < endEdits &&
             wildcardPosition < currentRegion->startOffset) {
        // Read wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Process edits that are in the current region
      while (edits < endEdits && wildcardPosition < currentRegion->endOffset) {
        // Insert wildcard into sequence (absolute index works because of the
        // pointer bias applied when the region was unpacked)
        currentRegion->unpackedSubject[wildcardPosition] = wildcard;

        // Read next wildcard
        wildcard = *edits;
        edits++;

        // Read its position
        vbyte_getVbyte(edits, &wildcardPosition);
      }

      // Advance to the next region
      currentRegion++;
    }
  }

  // Publish the freshly unpacked regions on the alignment
  alignment->unpackRegions = firstRegion;
  alignment->numUnpackRegions = lastRegion - firstRegion;
}
Ejemplo n.º 3
0
// Given a query sequence uses an inverted index of the collection to identify the
// sequence number and offset of all hits between the query and the collection.
//
// startIndex   - start of the in-memory index. Two vbyte values (word size,
//                interval size) are read from it; the data that follows is
//                addressed as: uint4 sequence positions [numSequences], uint4
//                description locations [numSequences], uint4 word start
//                offsets [index_numWords + 1], then the offset-list bytes.
// PSSMatrix    - the query; its length, strandLength, queryCodes and
//                bestMatchCodes fields are read.
// numSequences - number of sequences, used to size the per-sequence tables.
//
// Results are stored in globals: index_coordinates holds index_numCoordinates
// hits grouped by subject number, and index_sequenceCoordinates[s] points at
// the first hit of subject s. Entries of index_sequenceCoordinates for
// subjects without hits are left unset. blast_numHits is incremented once per
// hit, and two timing lines are printed to stdout.
//
// NOTE(review): if PSSMatrix.length < index_wordSize the query-position count
// wraps around (unsigned arithmetic); callers presumably guarantee the query
// is at least one word long - confirm.
void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix,
                        uint4 numSequences)
{
    uint4 queryPosition, codeword = 0, queryPosition4;
    unsigned char *offsets, *endOffsets;
    uint4 offsetGap, offset, sequenceGap, sequenceNumber;
    struct indexCoordinate* coordinate;
    struct memBlocks* unsortedCoordinates;
    uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets;
    uint4 wordPosition, containsWildcard;
    struct queryWord* queryWords;
    // clock() returns clock_t; keeping it in a 32-bit integer would truncate
    // the value on platforms where clock_t is wider
    clock_t startTime;

    // Read word and interval size from start of index
    vbyte_getVbyte(startIndex, &index_wordSize);
    vbyte_getVbyte(startIndex, &index_intervalSize);

    // Locate the tables that follow the header
    index_numWords = pow(4, index_wordSize);
    index_sequencePositions = (uint4*)startIndex;
    index_descriptionLocations = index_sequencePositions + numSequences;
    index_loadedWords = index_descriptionLocations + numSequences;
    index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1);

    startTime = clock();
    unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences);

    // Declare and initialize array for count number of hits for each sequence
    numSubjectHits = (uint4*)global_malloc(sizeof(uint4) * numSequences);
    for (sequenceNumber = 0; sequenceNumber < numSequences; sequenceNumber++)
    {
        numSubjectHits[sequenceNumber] = 0;
    }

    // Memory to hold offsets string for each query word
    numQueryPositions = PSSMatrix.length - index_wordSize + 1;
    queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions);

    // For each word in the query
    for (queryPosition = 0; queryPosition < numQueryPositions; queryPosition++)
    {
        // Check if the word contains a wildcard
        containsWildcard = 0;
        for (wordPosition = 0; wordPosition < index_wordSize; wordPosition++)
        {
            if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters)
                containsWildcard = 1;
        }

        // Don't include words that cross the strand boundary or contain wildcards
        if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength &&
              queryPosition >= PSSMatrix.strandLength - index_wordSize + 1))
        {
            // Get the codeword
            codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize);

            // Get wordlist for that codeword: it spans from this word's start
            // offset up to the start offset of the next word
            offsets = index_offsets + index_loadedWords[codeword];
            endOffsets = index_offsets + index_loadedWords[codeword + 1];

            queryWords[queryPosition].offsets = offsets;
            queryWords[queryPosition].endOffsets = endOffsets;
        }
        else
        {
            // Skipped word: mark with NULL lists. The codeword field keeps the
            // value of the most recently generated codeword (0 if none yet).
            queryWords[queryPosition].offsets = NULL;
            queryWords[queryPosition].endOffsets = NULL;
        }

        queryWords[queryPosition].queryPosition = queryPosition;
        queryWords[queryPosition].codeword = codeword;
    }

    // Sort the query words by codeword, so the lists are copied out of the
    // index in codeword order
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword);

    // For each query word
    for (queryWordCount = 0; queryWordCount < numQueryPositions; queryWordCount++)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryWordCount].offsets != NULL)
        {
            // Make in-memory copy of list of offsets
            numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets;
            offsets = (unsigned char*)global_malloc(sizeof(char) * numOffsets);

            memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets);
            queryWords[queryWordCount].offsets = offsets;
            queryWords[queryWordCount].endOffsets = offsets + numOffsets;
        }
    }

    // Sort the query words by query position
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition);

    for (queryPosition = 0; queryPosition < numQueryPositions; queryPosition++)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryPosition].offsets != NULL)
        {
            offsets = queryWords[queryPosition].offsets;
            endOffsets = queryWords[queryPosition].endOffsets;
            offset = 0;
            sequenceNumber = 0;
            queryPosition4 = queryPosition + (index_wordSize - 4);

            // Traverse the offsets: each entry is a (sequence gap, offset gap) pair
            while (offsets < endOffsets)
            {
                vbyte_getVbyte(offsets, (&sequenceGap));
                vbyte_getVbyte(offsets, (&offsetGap));

                if (sequenceGap > 0)
                {
                    // First entry for a new sequence: the offset restarts
                    offset = offsetGap;
                    sequenceNumber += sequenceGap;
                }
                else
                {
                    // Same sequence: the offset is relative to the previous one
                    offset += offsetGap;
                }

                // Add query/database coordinate of match to relevant bucket
                coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates);
                coordinate->queryOffset = queryPosition4;
                coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4);
                coordinate->subjectNumber = sequenceNumber;

                numSubjectHits[sequenceNumber]++;

                blast_numHits++;
            }

            // Release the private copy of this word's offset list
            free(queryWords[queryPosition].offsets);
        }
    }

    printf("Time to process query=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);

    // The query word table is local to this function and no longer needed;
    // it was previously leaked on every call
    free(queryWords);

    startTime = clock();

    // Make memory for sorted list
    index_numCoordinates = unsortedCoordinates->numTotalEntries;
    index_coordinates = (struct indexCoordinate*)global_malloc(
                         sizeof(struct indexCoordinate) * index_numCoordinates);
    index_sequenceCoordinates = (struct indexCoordinate**)global_malloc(
                                 sizeof(struct indexCoordinate*) * numSequences);

    // For each sequence
    coordinate = index_coordinates;
    for (sequenceNumber = 0; sequenceNumber < numSequences; sequenceNumber++)
    {
        // If it has hits
        if (numSubjectHits[sequenceNumber] != 0)
        {
            // Point to location in sorted list of coordinates
            index_sequenceCoordinates[sequenceNumber] = coordinate;
            coordinate += numSubjectHits[sequenceNumber];

            // Reset the count; it is reused below as the fill position
            // within this sequence's bucket
            numSubjectHits[sequenceNumber] = 0;
        }
    }

    // Move through list of unsorted coordinates
    memBlocks_resetCurrent(unsortedCoordinates);
    while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL)
    {
        sequenceNumber = coordinate->subjectNumber;

        // Place into sorted list
        index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate;
        numSubjectHits[sequenceNumber]++;
    }

    memBlocks_free(unsortedCoordinates);

    // The per-sequence counters are local scratch space; previously leaked
    free(numSubjectHits);

    printf("Time to sort buckets=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);
}