Esempio n. 1
0
// Prepare the two memBlocks pools used for region copying/unpacking.
// Both pools are created with entries of sizeof(struct unpackRegion) and an
// initial allocation of constants_initialAllocUnpackRegions.
void unpack_initialize() {
  // Pool referenced through unpack_unpackRegions
  unpack_unpackRegions =
      memBlocks_initialize(sizeof(struct unpackRegion),
                           constants_initialAllocUnpackRegions);

  // Pool referenced through unpack_subjectRegions
  unpack_subjectRegions =
      memBlocks_initialize(sizeof(struct unpackRegion),
                           constants_initialAllocUnpackRegions);
}
Esempio n. 2
0
// Set up the state needed to create ungapped extensions: the memBlocks pool
// that holds struct ungappedExtension entries, plus two reward values derived
// from parameters_matchScore.
void ungappedExtension_initialize()
{
    // Pool of ungapped extension records
    ungappedExtension_extensions = memBlocks_initialize(
        sizeof(struct ungappedExtension),
        constants_initialAllocUngappedExtensions);

    // Match score multiplied by -3
    ungappedExtension_minus3reward = -3 * parameters_matchScore;

    // Match score multiplied by the number of word table letters
    ungappedExtension_tableMatchesReward =
        parameters_wordTableLetters * parameters_matchScore;
}
Esempio n. 3
0
// Initialize writing to formatted database.
//
// filename     - base name of the database. The pointer itself is stored in
//                writedb_filename (not copied), and the suffixes ".sequences",
//                ".descriptions", ".data" and ".wildcards" are appended to it
//                to build the individual file names.
// alphabetType - stored in writedb_alphabetType; when equal to
//                encoding_protein the first byte written to the sequence file
//                is encoding_sentinalCode, otherwise it is 0.
//
// Resets the writedb_* counters, removes any existing wildcards file, opens
// the sequence and description files for writing and creates the memBlocks
// pool for sequence data. If either file cannot be opened an error is printed
// to stderr and the process exits with status -1.
void writedb_initialize(char *filename, uint4 alphabetType) {
    char *wildcardsFilename;

    writedb_filename = filename;
    writedb_alphabetType = alphabetType;
    writedb_maximumSequenceLength = 0;
    writedb_minimumSequenceLength = 0;
    writedb_numberOfLetters = 0;
    writedb_volume = 0;
    writedb_sequenceCount = 0;
    writedb_numberOfClusters = 0;

    // Construct sequence, description, data and wildcards filenames
    // (each buffer has room for the suffix and the terminating NUL)
    writedb_sequenceFilename = (char *)global_malloc(strlen(filename) + 13);
    sprintf(writedb_sequenceFilename, "%s.sequences", filename);
    writedb_descriptionsFilename = (char *)global_malloc(strlen(filename) + 15);
    sprintf(writedb_descriptionsFilename, "%s.descriptions", filename);
    writedb_dataFilename = (char *)global_malloc(strlen(filename) + 8);
    sprintf(writedb_dataFilename, "%s.data", filename);
    wildcardsFilename = (char *)global_malloc(strlen(filename) + 12);
    sprintf(wildcardsFilename, "%s.wildcards", filename);

    // Delete the wildcards file if one exists. The result is deliberately
    // ignored: a missing file is not an error here.
    (void)remove(wildcardsFilename);

    // The wildcards filename is only needed for the removal above
    free(wildcardsFilename);

    // Open sequence file for writing
    if ((writedb_sequenceFile = fopen(writedb_sequenceFilename, "w")) == NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                writedb_sequenceFilename);
        exit(-1);
    }

    // Write sentinal/padding byte at start
    if (alphabetType == encoding_protein)
        fputc(encoding_sentinalCode, writedb_sequenceFile);
    else
        fputc(0, writedb_sequenceFile);

    // Open descriptions file for writing
    if ((writedb_descriptionsFile = fopen(writedb_descriptionsFilename, "w")) ==
            NULL) {
        fprintf(stderr, "Error opening file %s for writing\n",
                writedb_descriptionsFilename);
        exit(-1);
    }

    // Volume size starts at 1, matching the single byte already written to
    // the sequence file above
    writedb_volumeSize = 1;
    writedb_sequenceData = memBlocks_initialize(sizeof(struct sequenceData),
                           constants_initialSequenceData);
}
Esempio n. 4
0
// Given a query sequence uses an inverted index of the collection to identify the
// sequence number and offset of all hits between the query and the collection.
//
// startIndex   - start of the in-memory index. It begins with two vbyte values
//                (word size, interval size) followed by uint4 arrays of
//                sequence positions (numSequences entries), description
//                locations (numSequences entries) and word start positions
//                (index_numWords + 1 entries), then the encoded offset lists.
// PSSMatrix    - the query; length, strandLength, queryCodes and
//                bestMatchCodes are read.
// numSequences - number of sequences covered by the index.
//
// Results are left in globals: index_coordinates holds index_numCoordinates
// hits grouped by subject number, and index_sequenceCoordinates[s] points at
// the first hit of every subject s that has at least one hit (entries for
// subjects without hits are not written). blast_numHits is incremented once
// per hit. Two timing lines are printed to stdout.
void index_processQuery(unsigned char* startIndex, struct PSSMatrix PSSMatrix,
                        uint4 numSequences)
{
    uint4 queryPosition, codeword = 0, queryPosition4;
    unsigned char* offsets, *endOffsets;
    uint4 offsetGap, offset, sequenceGap, sequenceNumber;
    struct indexCoordinate* coordinate;
    struct memBlocks* unsortedCoordinates;
    uint4 *numSubjectHits, numQueryPositions, queryWordCount, numOffsets;
    uint4 wordPosition, containsWildcard;
    struct queryWord* queryWords;
    clock_t startTime;

    // Read word and interval size from start of index
    vbyte_getVbyte(startIndex, &index_wordSize);
    vbyte_getVbyte(startIndex, &index_intervalSize);

    // Locate the tables that follow the two header values
    index_numWords = pow(4, index_wordSize);
    index_sequencePositions = (uint4*)startIndex;
    index_descriptionLocations = index_sequencePositions + numSequences;
    index_loadedWords = index_descriptionLocations + numSequences;
    index_offsets = (unsigned char*)(index_loadedWords + index_numWords + 1);

    startTime = clock();
    unsortedCoordinates = memBlocks_initialize(sizeof(struct indexCoordinate), numSequences);

    // Declare and initialize array for count number of hits for each sequence
    numSubjectHits = (uint4*)global_malloc(sizeof(uint4) * numSequences);
    for (sequenceNumber = 0; sequenceNumber < numSequences; sequenceNumber++)
    {
        numSubjectHits[sequenceNumber] = 0;
    }

    // Memory to hold offsets string for each query word
    // NOTE(review): assumes PSSMatrix.length >= index_wordSize; a shorter
    // query would make this unsigned count wrap around - confirm callers
    numQueryPositions = PSSMatrix.length - index_wordSize + 1;
    queryWords = (struct queryWord*)global_malloc(sizeof(struct queryWord) * numQueryPositions);

    // For each word in the query
    for (queryPosition = 0; queryPosition < numQueryPositions; queryPosition++)
    {
        // Check if the word contains a wildcard
        containsWildcard = 0;
        for (wordPosition = 0; wordPosition < index_wordSize; wordPosition++)
        {
            if (PSSMatrix.queryCodes[queryPosition + wordPosition] >= encoding_numRegularLetters)
                containsWildcard = 1;
        }

        // Don't include words that cross the strand boundry or contain wildcards
        if (!containsWildcard && !(queryPosition < PSSMatrix.strandLength &&
              queryPosition >= PSSMatrix.strandLength - index_wordSize + 1))
        {
            // Get the codeword
            codeword = index_generateCodeword(PSSMatrix.bestMatchCodes + queryPosition, index_wordSize);

            // Get wordlist for that codeword
            offsets = index_offsets + index_loadedWords[codeword];
            endOffsets = index_offsets + index_loadedWords[codeword + 1];

            queryWords[queryPosition].offsets = offsets;
            queryWords[queryPosition].endOffsets = endOffsets;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }
        else
        {
            // Excluded word: NULL offsets mark it as skipped. It keeps the
            // codeword of the most recent included word (0 if there was none).
            queryWords[queryPosition].offsets = NULL;
            queryWords[queryPosition].endOffsets = NULL;
            queryWords[queryPosition].queryPosition = queryPosition;
            queryWords[queryPosition].codeword = codeword;
        }
    }

    // Sort the query words by codeword
    // (presumably so the offset lists are copied out of the index in order -
    // TODO confirm against alignments_compareCodeword)
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareCodeword);

    // For each query word
    for (queryWordCount = 0; queryWordCount < numQueryPositions; queryWordCount++)
    {
        // Ignoring those that were excluded above
        if (queryWords[queryWordCount].offsets != NULL)
        {
            // Make in-memory copy of list of offsets
            numOffsets = queryWords[queryWordCount].endOffsets - queryWords[queryWordCount].offsets;
            offsets = (unsigned char*)global_malloc(sizeof(unsigned char) * numOffsets);

            memcpy(offsets, queryWords[queryWordCount].offsets, numOffsets);
            queryWords[queryWordCount].offsets = offsets;
            queryWords[queryWordCount].endOffsets = offsets + numOffsets;
        }
    }

    // Sort the query words by query position. Every position has exactly one
    // entry, so the loop below indexes the array directly by query position
    // (this relies on alignments_compareQueryPosition ordering ascending).
    qsort(queryWords, numQueryPositions, sizeof(struct queryWord), alignments_compareQueryPosition);

    for (queryPosition = 0; queryPosition < numQueryPositions; queryPosition++)
    {
        // Ignoring those that were excluded above
        if (queryWords[queryPosition].offsets != NULL)
        {
            offsets = queryWords[queryPosition].offsets;
            endOffsets = queryWords[queryPosition].endOffsets;
            offset = 0;
            sequenceNumber = 0;
            queryPosition4 = queryPosition + (index_wordSize - 4);

            // Traverse the offsets; each entry is a (sequence gap, offset gap)
            // pair of vbytes
            while (offsets < endOffsets)
            {
                vbyte_getVbyte(offsets, (&sequenceGap));
                vbyte_getVbyte(offsets, (&offsetGap));

                if (sequenceGap > 0)
                {
                    // First hit in a new sequence: the offset is absolute
                    offset = offsetGap;
                    sequenceNumber += sequenceGap;
                }
                else
                {
                    // Same sequence: the offset is relative to the previous hit
                    offset += offsetGap;
                }

                // Add query/database coordinate of match to relevant bucket
                coordinate = (struct indexCoordinate*)memBlocks_newEntry(unsortedCoordinates);
                coordinate->queryOffset = queryPosition4;
                coordinate->subjectOffset = offset * index_intervalSize + (index_wordSize - 4);
                coordinate->subjectNumber = sequenceNumber;

                numSubjectHits[sequenceNumber]++;

                blast_numHits++;
            }

            // Release the private copy of this word's offset list
            free(queryWords[queryPosition].offsets);
        }
    }

    // The query word table is no longer needed once all lists are traversed
    free(queryWords);

    printf("Time to process query=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);
    startTime = clock();

    // Make memory for sorted list
    index_numCoordinates = unsortedCoordinates->numTotalEntries;
    index_coordinates = (struct indexCoordinate*)global_malloc(
                         sizeof(struct indexCoordinate) * index_numCoordinates);
    index_sequenceCoordinates = (struct indexCoordinate**)global_malloc(
                                 sizeof(struct indexCoordinate*) * numSequences);

    // For each sequence
    coordinate = index_coordinates;
    for (sequenceNumber = 0; sequenceNumber < numSequences; sequenceNumber++)
    {
        // If it has hits
        if (numSubjectHits[sequenceNumber] != 0)
        {
            // Point to location in sorted list of coordinates
            index_sequenceCoordinates[sequenceNumber] = coordinate;
            coordinate += numSubjectHits[sequenceNumber];

            // Reset the count; it is reused below as the fill position
            numSubjectHits[sequenceNumber] = 0;
        }
    }

    // Move through list of unsorted coordinates
    memBlocks_resetCurrent(unsortedCoordinates);
    while ((coordinate = memBlocks_getCurrent(unsortedCoordinates)) != NULL)
    {
        sequenceNumber = coordinate->subjectNumber;

        // Place into sorted list
        index_sequenceCoordinates[sequenceNumber][numSubjectHits[sequenceNumber]] = *coordinate;
        numSubjectHits[sequenceNumber]++;
    }

    memBlocks_free(unsortedCoordinates);

    // Per-sequence counters were only needed to build the sorted list
    free(numSubjectHits);

    printf("Time to sort buckets=%f\n", (float)(clock() - startTime) / CLOCKS_PER_SEC);
}