Example #1
0
// Release every buffer tracked by the unpack module: the per-region unpacked
// sequences (only when parameters_ssearch is unset and the alphabet type is
// nucleotide), the per-region copied subjects, and finally the two memBlocks
// containers that hold the region records themselves.
void unpack_free() {
  struct unpackRegion *current;
  int releaseUnpacked;

  releaseUnpacked =
      !parameters_ssearch && encoding_alphabetType == encoding_nucleotide;

  // Walk the unpack regions and release each unpacked sequence
  if (releaseUnpacked) {
    memBlocks_resetCurrent(unpack_unpackRegions);
    for (;;) {
      current = memBlocks_getCurrent(unpack_unpackRegions);
      if (current == NULL)
        break;

      // NOTE(review): the stored pointer is advanced by startOffset before
      // being freed - presumably it was shifted back by the same amount when
      // the region was created; confirm against the allocation site.
      free(current->unpackedSubject + current->startOffset);
    }
  }

  // Walk the copied subject regions and release each subject buffer
  memBlocks_resetCurrent(unpack_subjectRegions);
  for (;;) {
    current = memBlocks_getCurrent(unpack_subjectRegions);
    if (current == NULL)
      break;

    // Same adjustment as above, but startOffset is divided by 4 here
    // (presumably because the subject is stored packed - TODO confirm)
    free(current->subject + current->startOffset / 4);
  }

  // Region records are no longer needed once their buffers are gone
  memBlocks_free(unpack_unpackRegions);
  memBlocks_free(unpack_subjectRegions);
}
Example #2
0
// Given a query sequence, uses an inverted index of the collection to identify
// the sequence number and offset of all hits between the query and the
// collection.
//
// Parameters:
//   startIndex   - start of the in-memory index. It begins with two vbyte
//                  values (word size, interval size) followed by, in order:
//                  numSequences sequence positions, numSequences description
//                  locations, index_numWords + 1 word-list start offsets and
//                  finally the word-list (offsets) data itself.
//   PSSMatrix    - the query; queryCodes, bestMatchCodes, length and
//                  strandLength are read.
//   numSequences - number of sequences in the collection.
//
// Side effects:
//   Sets index_wordSize, index_intervalSize, index_numWords,
//   index_sequencePositions, index_descriptionLocations, index_loadedWords and
//   index_offsets from the index header. Allocates and fills
//   index_coordinates (index_numCoordinates entries, grouped by subject) and
//   index_sequenceCoordinates (per-sequence pointer into index_coordinates;
//   only assigned for sequences that have at least one hit). Increments
//   blast_numHits once per hit and prints two timing lines to stdout.
//
// NOTE(review): numQueryPositions is computed as
// PSSMatrix.length - index_wordSize + 1 with no check that the query is at
// least one word long - confirm callers guarantee this.
void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix,
                        uint4 numSequences)
{
    uint4 queryPosition, codeword = 0, queryPosition4;
    unsigned char *offsets, *endOffsets;
    uint4 offsetGap, offset, sequenceGap, sequenceNumber;
    struct indexCoordinate *coordinate;
    struct memBlocks *unsortedCoordinates;
    uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets;
    uint4 wordPosition, containsWildcard;
    struct queryWord *queryWords;
    clock_t startTime;

    // Read word and interval size from start of index. vbyte_getVbyte appears
    // to advance startIndex past each value it decodes, so that afterwards
    // startIndex addresses the first table that follows the header.
    vbyte_getVbyte(startIndex, &index_wordSize);
    vbyte_getVbyte(startIndex, &index_intervalSize);

    // Lay out the tables that follow the header
    index_numWords = pow(4, index_wordSize);
    index_sequencePositions = (uint4*)startIndex;
    index_descriptionLocations = index_sequencePositions + numSequences;
    index_loadedWords = index_descriptionLocations + numSequences;
    index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1);

    // Keep the full clock_t value; truncating it to 32 bits gives a wrong
    // elapsed time once the process has accumulated enough clock ticks
    startTime = clock();
    unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences);

    // Declare and initialize array for counting number of hits for each sequence
    numSubjectHits = (uint4*)global_malloc(sizeof(uint4) * numSequences);
    sequenceNumber = 0;
    while (sequenceNumber < numSequences)
    {
        numSubjectHits[sequenceNumber] = 0;
        sequenceNumber++;
    }

    // Memory to hold offsets string for each query word
    numQueryPositions = PSSMatrix.length - index_wordSize + 1;
    queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions);

    // For each word in the query
    queryPosition = 0;
    while (queryPosition < numQueryPositions)
    {
        // Check if the word contains a wildcard
        containsWildcard = 0;
        wordPosition = 0;
        while (wordPosition < index_wordSize)
        {
            if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters)
                containsWildcard = 1;

            wordPosition++;
        }

        // Don't include words that cross the strand boundary or contain wildcards
        if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength &&
              queryPosition >= PSSMatrix.strandLength - index_wordSize + 1))
        {
            // Get the codeword
            codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize);

            // Get wordlist for that codeword; entry codeword + 1 marks its end
            offsets = index_offsets + index_loadedWords[codeword];
            endOffsets = index_offsets + index_loadedWords[codeword + 1];

            queryWords[queryPosition].offsets = offsets;
            queryWords[queryPosition].endOffsets = endOffsets;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }
        else
        {
            // Skipped word: a NULL offsets pointer marks it as "no word list".
            // It keeps the codeword of the most recent valid word (0 if none).
            queryWords[queryPosition].offsets = NULL;
            queryWords[queryPosition].endOffsets = NULL;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }

        queryPosition++;
    }

    // Sort the query words by codeword so the word lists are copied out of the
    // index in codeword order
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword);

    // For each query word
    queryWordCount = 0;
    while (queryWordCount < numQueryPositions)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryWordCount].offsets != NULL)
        {
            // Make in-memory copy of list of offsets
            numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets;
            offsets = (unsigned char*)global_malloc(sizeof(unsigned char) * numOffsets);

            memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets);
            queryWords[queryWordCount].offsets = offsets;
            queryWords[queryWordCount].endOffsets = offsets + numOffsets;
        }

        queryWordCount++;
    }

    // Sort the query words by query position; afterwards entry i is the word
    // that starts at query position i
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition);

    queryPosition = 0;
    while (queryPosition < numQueryPositions)
    {
        // Ignoring those that were skipped above
        if (queryWords[queryPosition].offsets != NULL)
        {
            offsets = queryWords[queryPosition].offsets;
            endOffsets = queryWords[queryPosition].endOffsets;
            offset = 0;
            sequenceNumber = 0;
            queryPosition4 = queryPosition + (index_wordSize - 4);

            // Traverse the offsets. Each entry is a (sequence gap, offset gap)
            // vbyte pair: a non-zero sequence gap moves on to a later sequence
            // and restarts the offset, a zero gap adds to the running offset.
            while (offsets < endOffsets)
            {
                vbyte_getVbyte(offsets, (&sequenceGap));
                vbyte_getVbyte(offsets, (&offsetGap));

                if (sequenceGap > 0)
                {
                    offset = offsetGap;
                    sequenceNumber += sequenceGap;
                }
                else
                {
                    offset += offsetGap;
                }

                // Add query/database coordinate of match to relevant bucket
                coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates);
                coordinate->queryOffset = queryPosition4;
                coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4);
                coordinate->subjectNumber = sequenceNumber;

                numSubjectHits[sequenceNumber]++;

                blast_numHits++;
            }

            // Release the in-memory copy of this word list
            free(queryWords[queryPosition].offsets);
        }

        queryPosition++;
    }

    // Every copied word list has been released above; release the array of
    // query words itself (it was previously leaked)
    free(queryWords);

    printf("Time to process query=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);
    startTime = clock();

    // Make memory for sorted list
    index_numCoordinates = unsortedCoordinates->numTotalEntries;
    index_coordinates = (struct indexCoordinate*)global_malloc(
                         sizeof(struct indexCoordinate) * index_numCoordinates);
    index_sequenceCoordinates = (struct indexCoordinate**)global_malloc(
                                 sizeof(struct indexCoordinate*) * numSequences);

    // For each sequence
    coordinate = index_coordinates;
    sequenceNumber = 0;
    while (sequenceNumber < numSequences)
    {
        // If it has hits
        if (numSubjectHits[sequenceNumber] != 0)
        {
            // Point to location in sorted list of coordinates
            index_sequenceCoordinates[sequenceNumber] = coordinate;
            coordinate += numSubjectHits[sequenceNumber];

            // Reset the count; it is reused below as the fill position
            // within this sequence's bucket
            numSubjectHits[sequenceNumber] = 0;
        }
        sequenceNumber++;
    }

    // Move through list of unsorted coordinates
    memBlocks_resetCurrent(unsortedCoordinates);
    while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL)
    {
        sequenceNumber = coordinate->subjectNumber;

        // Place into sorted list
        index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate;
        numSubjectHits[sequenceNumber]++;
    }

    memBlocks_free(unsortedCoordinates);

    // The per-sequence counters are only needed while bucketing (they were
    // previously leaked)
    free(numSubjectHits);

    printf("Time to sort buckets=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);
}
Example #3
0
// Finalize writing to the formatted collection.
//
// Appends a final sentinel/padding byte to the sequence file, closes the
// sequence and description files, then writes the data file
// (writedb_dataFilename): a vbyte-encoded collection header followed by one
// vbyte-encoded record per sequence held in writedb_sequenceData. Finally the
// writedb_sequenceData container is released.
//
// Any failure to open, write or close a file is reported on stderr and
// terminates the process with exit(-1).
void writedb_close() {
    unsigned char headerData[100], *headerDataPointer;
    uint4 headerLength;
    struct sequenceData *sequenceData;

    // Write sentinal/padding byte at end
    if (writedb_alphabetType == encoding_protein)
        fputc(encoding_sentinalCode, writedb_sequenceFile);
    else
        fputc(0, writedb_sequenceFile);

    // Close writing to sequence and description files. fclose flushes any
    // buffered output, so a failure here means data did not reach the file.
    if (fclose(writedb_sequenceFile) != 0) {
        fprintf(stderr, "Error closing sequence file\n");
        exit(-1);
    }
    if (fclose(writedb_descriptionsFile) != 0) {
        fprintf(stderr, "Error closing descriptions file\n");
        exit(-1);
    }

    // Open data file for writing. The content is raw vbyte data, so binary
    // mode is used to stop platforms with text-mode translation altering it.
    if ((writedb_dataFile = fopen(writedb_dataFilename, "wb")) == NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                writedb_dataFilename);
        exit(-1);
    }

    // Convert 7 header values to vbytes. vbyte_safePutVbyte advances
    // headerDataPointer past each value it encodes.
    headerDataPointer = headerData;
    vbyte_safePutVbyte(headerDataPointer, constants_databaseVersion);
    vbyte_safePutVbyte(headerDataPointer, writedb_sequenceCount);
    vbyte_safePutVbyte(headerDataPointer, writedb_numberOfLetters);
    vbyte_safePutVbyte(headerDataPointer, writedb_maximumSequenceLength);
    vbyte_safePutVbyte(headerDataPointer, writedb_alphabetType);
    vbyte_safePutVbyte(headerDataPointer, writedb_numberOfClusters);
    vbyte_safePutVbyte(headerDataPointer, writedb_volume + 1);

    // Write the header data at the start of the file
    headerLength = headerDataPointer - headerData;
    if (fwrite(headerData, sizeof(unsigned char), headerLength,
               writedb_dataFile) < headerLength) {
        fprintf(stderr, "Error writing header to sequence file %s\n",
                writedb_dataFilename);
        exit(-1);
    }

    // For each sequence
    memBlocks_resetCurrent(writedb_sequenceData);
    while ((sequenceData = memBlocks_getCurrent(writedb_sequenceData)) != NULL) {
        // Prepare to write sequence description length, subject length,
        // encoded length and oid using vbytes (the buffer is reused)
        headerDataPointer = headerData;
        vbyte_safePutVbyte(headerDataPointer, sequenceData->descriptionLength);
        vbyte_safePutVbyte(headerDataPointer, sequenceData->sequenceLength);
        vbyte_safePutVbyte(headerDataPointer, sequenceData->encodedLength);
        vbyte_safePutVbyte(headerDataPointer, sequenceData->oid);

        // Write sequence header information
        headerLength = headerDataPointer - headerData;
        if (fwrite(headerData, sizeof(unsigned char), headerLength,
                   writedb_dataFile) < headerLength) {
            fprintf(stderr, "Error writing to sequence file %s\n",
                    writedb_dataFilename);
            exit(-1);
        }
    }

    // Close writing to data file; a failed flush is a write error
    if (fclose(writedb_dataFile) != 0) {
        fprintf(stderr, "Error writing to sequence file %s\n",
                writedb_dataFilename);
        exit(-1);
    }

    memBlocks_free(writedb_sequenceData);
}