Ejemplo n.º 1
0
// Ungapped extension of a hit along its diagonal, first to the left and then to the right.
//
// Starting from the hit (query offset queryoffset, subject position subjectHit) the
// alignment is grown one position at a time. In each direction the walk stops once the
// score accumulated since the best score so far has fallen to the dropoff limit.
//
// Parameters:
//   queryoffset    - query offset of the hit; selects the starting row of PSSMatrixFP.matrix
//   subjectHit     - pointer to the hit position inside the subject
//   lastHitFP      - offset (relative to subject) of the previous hit that helped trigger
//                    this extension
//   PSSMatrix      - only forwarded to ungappedExtension_findProteinSeed
//   PSSMatrixFP    - its matrix field is read as a flat score table with
//                    encoding_numCodes entries per query position
//   subject        - start of the subject; every reported offset is relative to it
//   startAddressFP - not used by this function
//
// Returns a new entry taken from ungappedExtension_extensions when the best score reaches
// blast_ungappedNominalTrigger, otherwise NULL. NULL is also returned early when the best
// start point lies to the right of lastHitFP.
//
// Side effects: overwrites ungappedExtension_bestScore and
// ungappedExtension_subjectEndReachedFP.
//
// NOTE(review): neither loop checks sequence bounds; presumably the sequences are flanked
// by codes whose scores force the dropoff - confirm against the caller.
struct ungappedExtension* ungappedExtension_extend(int4 queryoffset, unsigned char* subjectHit,
	uint4 lastHitFP, struct PSSMatrix PSSMatrix, struct PSSMatrixFP PSSMatrixFP, 
	unsigned char* subject, unsigned char *startAddressFP)
{
	int2 *scoreRow;
	unsigned char *cursor, *bestStart, *bestEnd;
	int4 sinceBest;
	int4 cutoff, baseCutoff;
	int4 diagonal;
	struct ungappedExtension *extension;

	baseCutoff = -statistics_ungappedNominalDropoff;
	cutoff = baseCutoff;
	ungappedExtension_bestScore = 0;

	// ---- Leftward pass: begin on the hit itself and walk towards the sequence start ----
	bestStart = subjectHit;
	sinceBest = 0;
	for (scoreRow = PSSMatrixFP.matrix + queryoffset * encoding_numCodes, cursor = subjectHit;
	     sinceBest > cutoff;
	     scoreRow -= encoding_numCodes, cursor--)
	{
		sinceBest += scoreRow[*cursor];

		if (sinceBest > 0)
		{
			// Bank the gain, then keep banking for as long as each further
			// position scores positively on its own
			while (sinceBest > 0)
			{
				ungappedExtension_bestScore += sinceBest;
				scoreRow -= encoding_numCodes;
				cursor--;
				sinceBest = scoreRow[*cursor];
			}

			// cursor now sits one position left of the best start found so far
			bestStart = cursor;
		}
	}

	// The recorded start is one step too far left; undo that
	bestStart++;

	// Give up when the best start lies to the right of the previous hit
	if (bestStart - subject > lastHitFP)
	{
		ungappedExtension_subjectEndReachedFP = subjectHit - subject;
		return NULL;
	}

	// ---- Rightward pass: begin one position right of the hit ----
	bestEnd = subjectHit;
	sinceBest = 0;

	// Tighten the limit so the walk also stops once the total would drop below zero
	if (-ungappedExtension_bestScore > baseCutoff)
	{
		cutoff = -ungappedExtension_bestScore;
	}

	for (scoreRow = PSSMatrixFP.matrix + (queryoffset + 1) * encoding_numCodes, cursor = subjectHit + 1;
	     sinceBest > cutoff;
	     scoreRow += encoding_numCodes, cursor++)
	{
		sinceBest += scoreRow[*cursor];

		if (sinceBest > 0)
		{
			// Bank the gain and any immediately following positive positions
			while (sinceBest > 0)
			{
				ungappedExtension_bestScore += sinceBest;
				scoreRow += encoding_numCodes;
				cursor++;
				sinceBest = scoreRow[*cursor];
			}

			// cursor now sits one position right of the best end found so far
			bestEnd = cursor;

			// Re-derive the limit from the new best score, never looser than the base
			cutoff = -ungappedExtension_bestScore;
			if (cutoff < baseCutoff)
			{
				cutoff = baseCutoff;
			}
		}
	}

	// The recorded end is one step too far right; undo that
	bestEnd--;
	ungappedExtension_subjectEndReachedFP = bestEnd - subject;

	// Extensions scoring below the gapping trigger are discarded
	if (ungappedExtension_bestScore < blast_ungappedNominalTrigger)
	{
		return NULL;
	}

	extension = memBlocks_newEntry(ungappedExtension_extensions);

	// Diagonal of the hit: subject offset minus query offset
	diagonal = (subjectHit - subject) - queryoffset;

	// Subject offsets come from the pointers, query offsets follow from the diagonal
	extension->start.subjectOffset = bestStart - subject;
	extension->end.subjectOffset = bestEnd - subject;
	extension->start.queryOffset = extension->start.subjectOffset - diagonal;
	extension->end.queryOffset = extension->end.subjectOffset - diagonal;

	// The seed lookup needs the offsets above to be filled in first
	extension->seed = ungappedExtension_findProteinSeed(extension, PSSMatrix, PSSMatrixFP, subject);
	extension->next = NULL;
	extension->nominalScore = ungappedExtension_bestScore;
	extension->status = ungappedExtension_UNGAPPED;

	return extension;
}
Ejemplo n.º 2
0
// Perform one-hit seeded ungapped extension for nucleotide, 1 packed-byte at a time.
//
// Parameters:
//   queryHitOffset   - offset of the hit in the query; used directly as an index into
//                      PSSMatrix.bytePackedCodes
//   subjectHitOffset - offset of the hit in the subject; used directly as an index into
//                      subject and multiplied by 4 whenever it is compared with query offsets
//                      (presumably a packed-byte offset, 4 letters per byte - confirm)
//   PSSMatrix        - the query; bytePackedCodes, strandLength and length are read
//   subject          - start of the subject sequence
//   subjectLength    - subject length, compared against subjectHitOffset * 4
//
// Returns a new entry taken from ungappedExtension_extensions when the best score reaches
// blast_ungappedNominalTrigger, otherwise NULL.
//
// Side effects: overwrites ungappedExtension_bestScore and sets
// ungappedExtension_subjectEndReached to the subject position reached while extending
// forwards.
struct ungappedExtension* ungappedExtension_nucleotideExtend(int4 queryHitOffset,
	int4 subjectHitOffset, struct PSSMatrix PSSMatrix, unsigned char* subject,
    uint4 subjectLength)
{
	unsigned char* queryPosition, *minQueryPosition, *maxQueryPosition;
	unsigned char* subjectPosition, *subjectStart, *subjectEnd;
	int4 dropoff, originalDropoff;
    int4 changeSinceBest = 0;
    int4 matchLettersScore;

	originalDropoff = dropoff = -statistics_ungappedNominalDropoff;

    // Start with score for lookup-table nucleotide match that is not aligned
    ungappedExtension_bestScore = ungappedExtension_tableMatchesReward;

    // Determine minimum query position; either start of the query or start of the second strand
    // (or, when the subject starts later on this diagonal, the query position that lines up
    // with the start of the subject)
    if (queryHitOffset <= PSSMatrix.strandLength)
    {
        if (queryHitOffset < subjectHitOffset * 4)
            minQueryPosition = PSSMatrix.bytePackedCodes;
        else
            minQueryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - subjectHitOffset * 4;
	}
    else
    {
        if (queryHitOffset - PSSMatrix.strandLength < subjectHitOffset * 4)
            minQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.strandLength;
        else
            minQueryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - subjectHitOffset * 4;
    }

	// Start left of hit location
	// (the query pointer is moved back by the word length plus 4 entries, the subject pointer
	// by the word's bytes plus 1)
	queryPosition = PSSMatrix.bytePackedCodes + queryHitOffset - parameters_wordTableLetters - 4;
	subjectPosition = subjectStart = subject + subjectHitOffset - parameters_wordTableBytes - 1;

    // Consider partial match of first byte before hit
    // The XOR of the query and subject bytes is the index into every score table used here.
    // NOTE(review): this read happens before queryPosition is compared with
    // minQueryPosition - confirm the buffers are valid at this position.
	matchLettersScore = PSSMatrix_packedLeftMatchScores[*queryPosition ^ *subjectPosition];
    ungappedExtension_bestScore += matchLettersScore;
    // The partial score was just credited to the best score; starting the running change at
    // its negative presumably keeps the whole-byte score added next from counting those
    // letters twice.
	changeSinceBest = -matchLettersScore;

    // Move back through alignment until start of query or subject, or until dropoff
    // Each step moves the query pointer by 4 and the subject pointer by 1.
    while (queryPosition > minQueryPosition)
    {
    	// Add score of matching entire bytes
		changeSinceBest += PSSMatrix_packedScore[*queryPosition ^ *subjectPosition];

        #ifdef VERBOSE
        if (parameters_verboseDloc == blast_dloc)
        {
        	printf("<%d< ", PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]);
            printf("["); encoding_printLetters(*queryPosition, 4);
            printf(","); encoding_printLetters(*subjectPosition, 4); printf("]\n");
		}
        #endif

        // If we possibly have a new best score
        if (changeSinceBest > ungappedExtension_minus3reward)
        {
            // Get score for matching individual letters in next byte
        	queryPosition-=4; subjectPosition--;
	        matchLettersScore = PSSMatrix_packedLeftMatchScores[*queryPosition ^ *subjectPosition];

            // If best score
            if (changeSinceBest + matchLettersScore > 0)
            {
                // Mark new best position
                subjectStart = subjectPosition;

                // Update best score and change since best
                ungappedExtension_bestScore += changeSinceBest + matchLettersScore;
                changeSinceBest = -matchLettersScore;

                #ifdef VERBOSE
                if (parameters_verboseDloc == blast_dloc)
                    printf("(Best=%d)\n", ungappedExtension_bestScore);
                #endif
            }
        }
        else
        {
        	// Decrease in score, check dropoff
			if (changeSinceBest < dropoff)
            	break;

            queryPosition-=4; subjectPosition--;
        }
    }

    // Determine maximum query position; either end of the query or end of the first strand
    // (or, when the subject ends earlier on this diagonal, the query position that lines up
    // with the end of the subject). Every candidate is pulled back by 4 entries.
    if (queryHitOffset <= PSSMatrix.strandLength)
    {
        if (PSSMatrix.strandLength - queryHitOffset < subjectLength - subjectHitOffset * 4)
            maxQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.strandLength - 4;
        else
            maxQueryPosition = PSSMatrix.bytePackedCodes + (subjectLength - subjectHitOffset * 4)
                             + queryHitOffset - 4;
	}
    else
    {
        if (PSSMatrix.length - queryHitOffset < subjectLength - subjectHitOffset * 4)
            maxQueryPosition = PSSMatrix.bytePackedCodes + PSSMatrix.length - 4;
        else
            maxQueryPosition = PSSMatrix.bytePackedCodes + (subjectLength - subjectHitOffset * 4)
                             + queryHitOffset - 4;
    }

    // Starting right of hit position
	queryPosition = PSSMatrix.bytePackedCodes + queryHitOffset;
	subjectPosition = subjectEnd = subject + subjectHitOffset;
	changeSinceBest = 0;

    // May need to alter dropoff so we also dropoff if below zero
    // (dropoff values are negative; this picks whichever limit is closer to zero)
    if (-ungappedExtension_bestScore > originalDropoff)
    {
    	dropoff = -ungappedExtension_bestScore;
    }

    // Consider partial match of first byte after hit
	matchLettersScore = PSSMatrix_packedRightMatchScores[*queryPosition ^ *subjectPosition];
    ungappedExtension_bestScore += matchLettersScore;
	changeSinceBest = -matchLettersScore;

    // Move forward through alignment until end of query or subject, or until dropoff
    // Mirror image of the backward loop above, using the right-match score table.
    // Unlike the protein extension, dropoff is not recomputed inside this loop.
    while (queryPosition < maxQueryPosition)
    {
		// Score of matching entire bytes
		changeSinceBest += PSSMatrix_packedScore[*queryPosition ^ *subjectPosition];

        #ifdef VERBOSE
        if (parameters_verboseDloc == blast_dloc)
        {
        	printf(">%d> ", PSSMatrix_packedScore[*queryPosition ^ *subjectPosition]);
            printf("["); encoding_printLetters(*queryPosition, 4);
            printf(","); encoding_printLetters(*subjectPosition, 4); printf("]\n");
            printf("changeSinceBest=%d\n", changeSinceBest);
		}
        #endif

        // If we possibly have a new best score
        if (changeSinceBest > ungappedExtension_minus3reward)
        {
            // Get score for matching individual letters in next byte
        	queryPosition+=4; subjectPosition++;
	        matchLettersScore = PSSMatrix_packedRightMatchScores[*queryPosition ^ *subjectPosition];

            // If best score
            if (changeSinceBest + matchLettersScore > 0)
            {
                // Mark new best position
                subjectEnd = subjectPosition;

                // Update best score and change since best
                ungappedExtension_bestScore += changeSinceBest + matchLettersScore;
                changeSinceBest = -matchLettersScore;

                #ifdef VERBOSE
                if (parameters_verboseDloc == blast_dloc)
                    printf("(Best=%d)\n", ungappedExtension_bestScore);
                #endif
            }
        }
        else
        {
        	// Decrease in score, check dropoff
			if (changeSinceBest < dropoff)
            	break;

            queryPosition+=4; subjectPosition++;
        }
    }

    // Record the point we got to extending forwards
    // (this is the last position visited, not the best end position)
    ungappedExtension_subjectEndReached = subjectPosition;

    // If extension scored above trigger for gapping, create object and return it
    if (ungappedExtension_bestScore >= blast_ungappedNominalTrigger)
    {
    	int4 diagonal;
        struct ungappedExtension* newUngappedExtension;
        newUngappedExtension = memBlocks_newEntry(ungappedExtension_extensions);

        // Correct for extra decrement
        subjectStart++;
        // Correct for extra increment
        subjectEnd--;

        // Calculate diagonal
        // (subject offset is scaled by 4 so both terms are in the same units)
        diagonal = subjectHitOffset * 4 - queryHitOffset;

        // Determine offsets from pointers
        // Subject pointer differences are scaled by 4; query offsets follow from the diagonal.
        newUngappedExtension->start.subjectOffset = (subjectStart - subject) * 4;
        newUngappedExtension->end.subjectOffset = (subjectEnd - subject) * 4;
        newUngappedExtension->start.queryOffset = newUngappedExtension->start.subjectOffset - diagonal;
        newUngappedExtension->end.queryOffset = newUngappedExtension->end.subjectOffset - diagonal;

		// No seed is computed here; both seed offsets are set to -1
		newUngappedExtension->seed.queryOffset = -1;
		newUngappedExtension->seed.subjectOffset = -1;

        // Initialize next to null
        newUngappedExtension->next = NULL;
        newUngappedExtension->nominalScore = ungappedExtension_bestScore;
        newUngappedExtension->status = ungappedExtension_UNGAPPED;

        #ifdef VERBOSE
        if (parameters_verboseDloc == blast_dloc)
        {
            printf("Hit=%d,%d\n", queryHitOffset, subjectHitOffset);
            printf("%d,%d - %d,%d\n", newUngappedExtension->start.queryOffset, newUngappedExtension->start.subjectOffset,
                                      newUngappedExtension->end.queryOffset, newUngappedExtension->end.subjectOffset);
                                      fflush(stdout);
            printf("seed=%d,%d\n", newUngappedExtension->seed.queryOffset, newUngappedExtension->seed.subjectOffset);
		}
		#endif

        return newUngappedExtension;
    }
    else
    {
    	return NULL;
    }
}
Ejemplo n.º 3
0
// Given a query sequence, uses an inverted index of the collection to identify the
// sequence number and offset of all hits between the query and the collection.
//
// Parameters:
//   startIndex   - start of the loaded index: word size and interval size come first,
//                  followed by the tables this function sets up pointers into
//   PSSMatrix    - the query; length, strandLength, queryCodes and bestMatchCodes are read
//   numSequences - number of sequences in the collection; sizes the per-sequence tables
//
// Results are published through globals:
//   index_coordinates / index_numCoordinates - every hit, grouped by subject sequence
//   index_sequenceCoordinates                - per sequence, pointer to its first hit
//   blast_numHits                            - incremented once per hit
// The index layout globals (index_wordSize, index_intervalSize, index_numWords,
// index_sequencePositions, index_descriptionLocations, index_loadedWords, index_offsets)
// are also (re)initialised. Two timing lines are printed to stdout.
//
// All scratch memory allocated here is released before returning; index_coordinates and
// index_sequenceCoordinates are left allocated for the caller.
void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix,
                        uint4 numSequences)
{
    uint4 queryPosition, codeword = 0, queryPosition4;
    unsigned char* offsets, *endOffsets;
    uint4 offsetGap, offset, sequenceGap, sequenceNumber;
    struct indexCoordinate* coordinate;
    struct memBlocks* unsortedCoordinates;
    uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets;
    uint4 time, wordPosition, containsWildcard;
    struct queryWord* queryWords;

    // Read word and interval size from start of index
    // NOTE(review): the table pointers below are taken from startIndex right after these
    // calls, so vbyte_getVbyte is expected to advance startIndex past each value - confirm.
    vbyte_getVbyte(startIndex, &index_wordSize);
    vbyte_getVbyte(startIndex, &index_intervalSize);

    // Lay out the tables that follow the header
    index_numWords = pow(4, index_wordSize);
    index_sequencePositions = (uint4*)startIndex;
    index_descriptionLocations = index_sequencePositions + numSequences;
    index_loadedWords = index_descriptionLocations + numSequences;
    index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1);

    time = clock();
    unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences);

    // Declare and initialize array for counting the number of hits for each sequence
    numSubjectHits = (uint4*)global_malloc(sizeof(uint4) * numSequences);
    sequenceNumber = 0;
    while (sequenceNumber < numSequences)
    {
        numSubjectHits[sequenceNumber] = 0;
        sequenceNumber++;
    }

    // Memory to hold offsets string for each query word
    // NOTE(review): this wraps around if PSSMatrix.length < index_wordSize - 1; callers
    // presumably guarantee the query is at least one word long - confirm.
    numQueryPositions = PSSMatrix.length - index_wordSize + 1;
    queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions);

    // For each word in the query
    queryPosition = 0;
    while (queryPosition < numQueryPositions)
    {
        // Check if the word contains a wildcard
        containsWildcard = 0; wordPosition = 0;
        while (wordPosition < index_wordSize)
        {
            if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters)
                containsWildcard = 1;

            wordPosition++;
        }

        // Don't include words that cross the strand boundary or contain wildcards
        if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength &&
              queryPosition >= PSSMatrix.strandLength - index_wordSize + 1))
        {
            // Get the codeword
            codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize);

            // Get wordlist for that codeword
            offsets = index_offsets + index_loadedWords[codeword];
            endOffsets = index_offsets + index_loadedWords[codeword + 1];

            queryWords[queryPosition].offsets = offsets;
            queryWords[queryPosition].endOffsets = endOffsets;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }
        else
        {
            // Skipped word: a NULL offsets pointer marks it; it carries the most recent
            // valid codeword (0 if there has been none yet)
            queryWords[queryPosition].offsets = NULL;
            queryWords[queryPosition].endOffsets = NULL;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }

        queryPosition++;
    }

    // Sort the query words by codeword, so the offset lists are copied in codeword order
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword);

    // For each query word
    queryWordCount = 0;
    while (queryWordCount < numQueryPositions)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryWordCount].offsets != NULL)
        {
            // Make in-memory copy of list of offsets
            numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets;
            offsets = (unsigned char*)global_malloc(sizeof(unsigned char) * numOffsets);

            memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets);
            queryWords[queryWordCount].offsets = offsets;
            queryWords[queryWordCount].endOffsets = offsets + numOffsets;
        }

        queryWordCount++;
    }

    // Sort the query words back into query position order; after this the array index
    // equals the query position again
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition);

    queryPosition = 0;
    while (queryPosition < numQueryPositions)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryPosition].offsets != NULL)
        {
            offsets = queryWords[queryPosition].offsets;
            endOffsets = queryWords[queryPosition].endOffsets;
            offset = 0;
            sequenceNumber = 0;
            queryPosition4 = queryPosition + (index_wordSize - 4);

            // Traverse the offsets; each entry is a (sequence gap, offset gap) pair
            while (offsets < endOffsets)
            {
                vbyte_getVbyte(offsets, (&sequenceGap));
                vbyte_getVbyte(offsets, (&offsetGap));

                if (sequenceGap > 0)
                {
                    // First entry for a new sequence: the offset is absolute
                    offset = offsetGap;
                    sequenceNumber += sequenceGap;
                }
                else
                {
                    // Same sequence: the offset is relative to the previous one
                    offset += offsetGap;
                }

                // Add query/database coordinate of match to the unsorted list
                coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates);
                coordinate->queryOffset = queryPosition4;
                coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4);
                coordinate->subjectNumber = sequenceNumber;

                numSubjectHits[sequenceNumber]++;

                blast_numHits++;
            }

            // Release the in-memory copy of this word's offset list
            free(queryWords[queryPosition].offsets);
        }

        queryPosition++;
    }

    // Every per-word offset copy has been released above and nothing else refers to the
    // word array, so release it too (it was previously leaked on every call)
    free(queryWords);

    printf("Time to process query=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC);
    time = clock();

    // Make memory for sorted list
    index_numCoordinates = unsortedCoordinates->numTotalEntries;
    index_coordinates = (struct indexCoordinate*)global_malloc(
                         sizeof(struct indexCoordinate) * index_numCoordinates);
    index_sequenceCoordinates = (struct indexCoordinate**)global_malloc(
                                 sizeof(struct indexCoordinate*) * numSequences);

    // For each sequence
    // NOTE: entries of index_sequenceCoordinates for sequences without hits are left unset.
    coordinate = index_coordinates;
    sequenceNumber = 0;
    while (sequenceNumber < numSequences)
    {
        // If it has hits
        if (numSubjectHits[sequenceNumber] != 0)
        {
            // Point to location in sorted list of coordinates
            index_sequenceCoordinates[sequenceNumber] = coordinate;
            coordinate += numSubjectHits[sequenceNumber];

            // Reuse the counter as the fill position for the pass below
            numSubjectHits[sequenceNumber] = 0;
        }
        sequenceNumber++;
    }

    // Move through list of unsorted coordinates
    memBlocks_resetCurrent(unsortedCoordinates);
    while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL)
    {
        sequenceNumber = coordinate->subjectNumber;

        // Place into sorted list
        index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate;
        numSubjectHits[sequenceNumber]++;
    }

    memBlocks_free(unsortedCoordinates);

    // The per-sequence counters are only needed while bucketing (previously leaked)
    free(numSubjectHits);

    printf("Time to sort buckets=%f\n", (float)(clock() - time) / CLOCKS_PER_SEC);
}
Ejemplo n.º 4
0
// Unpack the whole subject, or just the regions of it that are needed, ahead of gapped
// alignment.
//
// Parameters:
//   PSSMatrix - only forwarded to unpack_getRegions
//   alignment - the alignment whose subject is unpacked; on return its unpackRegions and
//               numUnpackRegions fields describe the regions produced here
//
// When parameters_ssearch is set or the alphabet is protein, nothing is unpacked: a single
// region covering the entire subject and pointing at alignment->subject is recorded.
//
// Otherwise every region obtained from unpack_getRegions gets a freshly allocated buffer,
// and the wildcard edits stored at alignment->edits are written into the unpacked regions.
// blast_totalUnpacked is increased by the size of each unpacked region.
void unpack_unpackSubject(struct PSSMatrix PSSMatrix,
                          struct alignment *alignment) {
  struct unpackRegion *regionsBegin, *regionsEnd, *region, *sourceRegion;
  unsigned char *packed, *unpacked, *editCursor, *editsEnd, code;
  uint4 position;
  int4 from, to, span, regionCount;

  // Protein subjects and already-unpacked nucleotide subjects need no work
  if (parameters_ssearch || encoding_alphabetType == encoding_protein) {
    region = memBlocks_newEntry(unpack_unpackRegions);
    region->subject = alignment->subject;
    region->unpackedSubject = alignment->subject;
    region->subjectLength = alignment->subjectLength;
    region->startOffset = 0;
    region->endOffset = alignment->subjectLength;
    alignment->numUnpackRegions = 1;
    alignment->unpackRegions = region;
    return;
  }

  // The regions for this alignment are the last regionCount entries of the block store
  regionCount =
      unpack_getRegions(PSSMatrix, alignment, 0, unpack_unpackRegions);
  regionsEnd = memBlocks_getLastEntry(unpack_unpackRegions);
  regionsEnd++;
  regionsBegin = regionsEnd - regionCount;

  // Order them by start position
  qsort(regionsBegin, regionsEnd - regionsBegin, sizeof(struct unpackRegion),
        unpack_compareUnpackRegions);

  // Unpack the regions one by one
  for (region = regionsBegin; region < regionsEnd; region++) {
    to = region->endOffset;
    from = region->startOffset;
    span = to - from;

#ifdef VERBOSE
    if (parameters_verboseDloc == alignment->descriptionLocation) {
      printf("Unpack subject region %d to %d (length=%d)\n", from, to,
             alignment->subjectLength);
      fflush(stdout);
    }
#endif

    // Pick the packed data to read from: an existing region's subject when the
    // alignment already has regions, otherwise the alignment's own subject
    if (alignment->unpackRegions != NULL) {
      sourceRegion = unpack_selectRegion(alignment->unpackRegions,
                                         alignment->numUnpackRegions, from);
      packed = sourceRegion->subject;
    } else {
      packed = alignment->subject;
    }

    // Buffer for just this region
    unpacked = (unsigned char *)global_malloc(sizeof(char) * span);

    // Expand the packed bytes (the packed data is indexed by from / 4)
    encoding_byteUnpackRegion(unpacked, packed + (from / 4), span);

    // Shift the pointer so the buffer can be indexed by absolute subject offset
    unpacked -= from;

    region->unpackedSubject = unpacked;
    region->subject = packed;
    region->subjectLength = alignment->subjectLength;

    blast_totalUnpacked += span;
  }

  // Wildcard edits occupy the encoded bytes beyond the first (subjectLength + 3) / 4
  editCursor = alignment->edits;
  editsEnd = alignment->edits + alignment->encodedLength -
             ((alignment->subjectLength + 3) / 4);

  // Only when there is at least one edit
  if (editCursor < editsEnd) {
    // Each edit is a wildcard code followed by its position
    code = *editCursor++;
    vbyte_getVbyte(editCursor, &position);

    // NOTE(review): an edit is only applied while further edit bytes remain after it, so
    // the last edit read from the list is never written - confirm whether the encoder
    // relies on this (e.g. a trailing terminator entry).
    for (region = regionsBegin; region < regionsEnd; region++) {
      // Discard edits positioned before this region
      while (editCursor < editsEnd && position < region->startOffset) {
        code = *editCursor++;
        vbyte_getVbyte(editCursor, &position);
      }

      // Apply edits that fall inside this region
      while (editCursor < editsEnd && position < region->endOffset) {
        region->unpackedSubject[position] = code;

        code = *editCursor++;
        vbyte_getVbyte(editCursor, &position);
      }
    }
  }

  // Publish the regions on the alignment
  alignment->numUnpackRegions = regionsEnd - regionsBegin;
  alignment->unpackRegions = regionsBegin;
}
Ejemplo n.º 5
0
// Add a sequence (and its children, if any) to the formatted collection.
//
// Parameters:
//   sequence          - sequence codes; for nucleotide collections it is passed to
//                       encoding_bytePackSequence and then written as packed bytes
//                       (presumably packed in place - confirm)
//   sequenceLength    - number of letters in the sequence
//   description       - description text, may be NULL (then nothing is written for it)
//   descriptionLength - length of description in bytes
//   wildcards         - wildcard data written after the sequence
//   wildcardsLength   - length of wildcards in bytes
//   children          - child records whose descriptions and edits are stored with the
//                       sequence
//   numChildren       - number of entries in children
//   oid               - identifier recorded in the sequence's sequenceData entry
//
// Bytes appended to the sequence file, in order: the sequence (packed nucleotides, or
// sentinel + codes + sentinel for protein), the wildcard data, then the children's edit
// data. The recorded encodedLength is the total of those three parts.
//
// Any I/O failure prints a message to stderr and terminates the program with exit(-1).
// Updates the writedb_* volume, letter, sequence, cluster and min/max length counters.
void writedb_addSequence_oid(unsigned char *sequence, uint4 sequenceLength,
                             unsigned char *description, uint4 descriptionLength,
                             unsigned char *wildcards, uint4 wildcardsLength,
                             struct child *children, uint4 numChildren, uint4 oid) {
    uint4 encodedLength, packedLength = 0, editBytes;
    uint4 childNum, sizeEdits = 0, editNum;
    unsigned char *editData, *startEditData;
    struct child child;
    struct sequenceData *sequenceData;

    sequenceData = memBlocks_newEntry(writedb_sequenceData);

    // Write the description to file
    if (description != NULL) {
        if (fwrite(description, sizeof(unsigned char), descriptionLength,
                   writedb_descriptionsFile) < descriptionLength) {
            // The write went to the descriptions file, so report that file's name
            fprintf(stderr, "Error writing header to sequence file %s\n",
                    writedb_descriptionsFilename);
            exit(-1);
        }
    }

    // Calculate length of encoded sequence
    if (writedb_alphabetType == encoding_nucleotide) {
        // Remember the packed size on its own: it is the number of bytes of `sequence`
        // to write later, whereas encodedLength goes on to include the edit data
        packedLength = encoding_bytePackSequence(sequence, sequenceLength);
        encodedLength = packedLength;
    } else {
        // Protein codes plus a sentinel byte on either side
        encodedLength = sequenceLength + 2;
    }

    // Calculate maximum space required to record sequence's edits
    childNum = 0;
    while (childNum < numChildren) {
        child = children[childNum];
        sizeEdits += 16 + 5 * child.numEdits;
        childNum++;
    }

    // Initialize array to record edits
    editData = startEditData = global_malloc(sizeEdits);

    // Record children details as vbytes, followed by their edit codes
    childNum = 0;
    while (childNum < numChildren) {
        child = children[childNum];

        // Write children descriptions to disk
        if (fwrite(child.description, sizeof(unsigned char),
                   child.descriptionLength,
                   writedb_descriptionsFile) < child.descriptionLength) {
            fprintf(stderr, "Error writing description to sequence file %s\n",
                    writedb_descriptionsFilename);
            exit(-1);
        }
        descriptionLength += child.descriptionLength;

        // Convert child details to vbytes
        vbyte_safePutVbyte(editData, child.descriptionLength);
        vbyte_safePutVbyte(editData, child.regionStart);
        vbyte_safePutVbyte(editData, child.length);
        vbyte_safePutVbyte(editData, child.numEdits);

        // Append edits (only the edit code is stored for each edit)
        editNum = 0;
        while (editNum < child.numEdits) {
            // Record edit character
            *editData = child.edits[editNum].code;
            editData++;

            editNum++;
        }

        // Add sequence size to total tally of letters
        writedb_numberOfLetters += child.length;
        writedb_sequenceCount++;

        childNum++;
    }

    // Update volume size, encoded length
    editBytes = editData - startEditData;
    encodedLength += editBytes;
    writedb_volumeSize += encodedLength + wildcardsLength;

    sequenceData->descriptionLength = descriptionLength;
    sequenceData->sequenceLength = sequenceLength;
    sequenceData->encodedLength = encodedLength + wildcardsLength;
    sequenceData->oid = oid;

    // If the entry will exceed volume max size
    if (writedb_volumeSize > constants_volumeMaxSize) {
        // Close current volume; a failed close can mean buffered data was lost
        if (fclose(writedb_sequenceFile) != 0) {
            fprintf(stderr, "Error closing file %s\n", writedb_sequenceFilename);
            exit(-1);
        }

        // Open next volume for writing
        writedb_volume++;

        sprintf(writedb_sequenceFilename, "%s.sequences%d", writedb_filename,
                writedb_volume);
        if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) {
            fprintf(stderr, "Error opening file %s for writing\n",
                    writedb_sequenceFilename);
            exit(-1);
        }

        // Reset volume size counter
        writedb_volumeSize = encodedLength + wildcardsLength;
    }

    // Nucleotide
    if (writedb_alphabetType == encoding_nucleotide) {
        // Write packed nucleotide sequences to disk. Only the packed bytes belong to
        // `sequence`; the edit data is written separately below, so using encodedLength
        // here would read past the sequence and write the edit-sized tail twice over.
        if (fwrite(sequence, sizeof(unsigned char), packedLength,
                   writedb_sequenceFile) < packedLength) {
            fprintf(stderr, "Error writing to sequence file %s\n",
                    writedb_sequenceFilename);
            exit(-1);
        }
    }
    // Protein
    else {
        // Write sentinel byte before protein sequences
        if (fputc(encoding_sentinalCode, writedb_sequenceFile) == EOF) {
            fprintf(stderr, "Error writing to sequence file %s\n",
                    writedb_sequenceFilename);
            exit(-1);
        }

        // Write sequence codes to disk
        if (fwrite(sequence, sizeof(unsigned char), sequenceLength,
                   writedb_sequenceFile) < sequenceLength) {
            fprintf(stderr, "Error writing to sequence file %s\n",
                    writedb_sequenceFilename);
            exit(-1);
        }

        // Write sentinel byte after protein sequences
        if (fputc(encoding_sentinalCode, writedb_sequenceFile) == EOF) {
            fprintf(stderr, "Error writing to sequence file %s\n",
                    writedb_sequenceFilename);
            exit(-1);
        }
    }

    // Write wildcard data to disk
    if (fwrite(wildcards, sizeof(unsigned char), wildcardsLength,
               writedb_sequenceFile) < wildcardsLength) {
        fprintf(stderr, "Error writing to sequence file %s\n",
                writedb_sequenceFilename);
        exit(-1);
    }

    // Write edit information to disk
    if (fwrite(startEditData, sizeof(unsigned char), editBytes,
               writedb_sequenceFile) < editBytes) {
        fprintf(stderr, "Error writing to sequence file %s\n",
                writedb_sequenceFilename);
        exit(-1);
    }
    free(startEditData);

    // A sequence without children counts as one sequence itself; with children, the
    // children were already tallied in the loop above
    if (numChildren == 0) {
        // Add sequence size to total tally of letters
        writedb_numberOfLetters += sequenceLength;
        writedb_sequenceCount++;
    }

    writedb_numberOfClusters++;

    // Check for new longest/shortest sequence (a minimum of 0 means "not set yet")
    if (sequenceLength > writedb_maximumSequenceLength)
        writedb_maximumSequenceLength = sequenceLength;
    if (writedb_minimumSequenceLength == 0 ||
            sequenceLength < writedb_minimumSequenceLength)
        writedb_minimumSequenceLength = sequenceLength;
}