コード例 #1
0
ファイル: qPosList.c プロジェクト: aswarren/grc
// Process a given query position list.
//
// Associates `codeword` with the set of query positions held in
// `queryPositions` (numQueryPositions entries). The subset test below compares
// with `<` and stops early, so it relies on `queryPositions` being sorted in
// ascending order.
//
// One of three things happens:
//   1. An existing list is found where every entry read so far is present in
//      the new set and list->currentEntry has reached numQueryPositions: the
//      codeword is prepended to the codeword chain of the entry just read.
//   2. An existing list is found whose entries are all present in the new set
//      (a subset): the positions it lacks are appended to it, the codeword is
//      attached to its last entry, and the lists are re-sorted.
//   3. Otherwise a new list is started at the end of qPosList_qPosLists, filled
//      with the positions (walking the input array from its last element to its
//      first), the codeword is attached to the last entry created, and the
//      lists are re-sorted.
//
// An empty set (numQueryPositions <= 0) is ignored: there is no position to
// hang the codeword on.
//
// NOTE(review): nothing here checks that qPosList_qPosLists has room for one
// more list; presumably the caller/initializer guarantees capacity - confirm.
void qPosList_processList(int2* queryPositions, int2 numQueryPositions, int4 codeword)
{
    int4 listCount = 0, queryPositionCount, subset, present;
    struct memSingleBlock* list;
    struct queryPosition* queryPosition = NULL;
    struct codeword* newCodeword;

    // Without at least one position no entry would be created below, and the
    // final "attach codeword" step would write through a stale or NULL pointer.
    if (numQueryPositions <= 0)
    {
        return;
    }

    // Iterate through existing query position lists (ordered from longest to shortest)
    while (listCount < qPosList_numQPosLists)
    {
        // Check for one that contains a subset of the to-be-added query positions
        list = qPosList_qPosLists + listCount;

        // Start by assuming it is
        subset = 1;

        // Iterate through each query position in the current existing list
        memSingleBlock_resetCurrent(list);
        while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset)
        {
            // Iterate through each query position in the new list (which is sorted)
            queryPositionCount = 0;
            while (queryPositionCount < numQueryPositions)
            {
                // Found a match, break out and proceed to next position in current list
                if (queryPosition->queryPosition == queryPositions[queryPositionCount])
                {
                    break;
                }
                // We have passed the point where the existing position would appear
                // in the sorted new list, so the existing list is not a subset
                else if (queryPosition->queryPosition < queryPositions[queryPositionCount])
                {
                    subset = 0;
                    break;
                }
                // Otherwise keep going
                queryPositionCount++;
            }

            // If we got to the end of the new list without finding a match, not a subset
            if (queryPositionCount == numQueryPositions)
            {
                subset = 0;
            }

            // If the query positions in the existing list processed so far match all of
            // the positions in the new list
            if (list->currentEntry == numQueryPositions && subset)
            {
                // We have a match ending here; prepend the codeword to this
                // entry's chain of referring codewords
                newCodeword = global_malloc(sizeof(struct codeword));
                newCodeword->codeword = codeword;
                newCodeword->next = queryPosition->codewords;
                queryPosition->codewords = newCodeword;

                return;
            }
        }

        if (subset)
        {
            // This existing list is a subset of the new list, so add the
            // new/additional query positions to the end of it.
            // Iterate through the new list from its last element to its first.
            while (numQueryPositions > 0)
            {
                numQueryPositions--;
                present = 0;

                // Check if present in the existing list
                memSingleBlock_resetCurrent(list);
                while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL)
                {
                    // Found it
                    if (queryPosition->queryPosition == queryPositions[numQueryPositions])
                    {
                        present = 1;
                        break;
                    }
                }

                // Not present - add to the existing list with a null reference codeword
                if (!present)
                {
                    queryPosition = memSingleBlock_newEntry(list);
                    queryPosition->queryPosition = queryPositions[numQueryPositions];
                    // No referring codeword for any of the positions except the last
                    queryPosition->codewords = NULL;
                }
            }

            // Get the last, new query position
            queryPosition = memSingleBlock_getLastEntry(list);

            // Add reference codeword to the last query position
            newCodeword = global_malloc(sizeof(struct codeword));
            newCodeword->next = NULL;
            newCodeword->codeword = codeword;
            queryPosition->codewords = newCodeword;

            // Re-sort the lists of query positions from longest to shortest
            qsort(qPosList_qPosLists, qPosList_numQPosLists,
                  sizeof(struct memSingleBlock), qPosList_compareList);

            return;
        }

        listCount++;
    }

    // No existing list could be reused; start a new list of query positions
    list = qPosList_qPosLists + qPosList_numQPosLists;
    list->numEntries = 0;
    qPosList_numQPosLists++;

    // And copy values into it, from the last input position to the first
    while (numQueryPositions > 0)
    {
        numQueryPositions--;
        queryPosition = memSingleBlock_newEntry(list);
        queryPosition->queryPosition = queryPositions[numQueryPositions];
        // No referring codeword for any of the positions except the last
        queryPosition->codewords = NULL;
    }

    // queryPosition now refers to the entry created last; reference the
    // new query position list's codeword from it
    newCodeword = global_malloc(sizeof(struct codeword));
    newCodeword->next = NULL;
    newCodeword->codeword = codeword;
    queryPosition->codewords = newCodeword;

    // Sort the lists from longest to shortest
    qsort(qPosList_qPosLists, qPosList_numQPosLists,
          sizeof(struct memSingleBlock), qPosList_compareList);
}
コード例 #2
0
ファイル: formatdb.c プロジェクト: pombredanne/grc
// Entry point of the formatdb tool.
//
// Expects the path of a FASTA file as argv[1]; prints a usage message to
// stderr and exits with -1 when it is missing. The file's alphabet type is
// detected, every sequence is read and encoded, and each one is handed to
// writedb_addSequence together with its description. For nucleotide input the
// wildcards are replaced first and each replacement is serialized as one code
// byte followed by the position written by vbyte_putVbyte. A summary of what
// was written is printed at the end. Returns 0 on completion.
int4 main(int argc, char* argv[])
{
    // Bytes reserved per serialized wildcard edit: 1 code byte + the vbyte.
    // NOTE(review): assuming vbyte_putVbyte emits 7 payload bits per byte, a
    // 32-bit position can need 5 bytes, i.e. 6 bytes per edit in total; the
    // previous value of 5 would then be one byte short for positions >= 2^28.
    // Confirm against the vbyte implementation.
    enum { maxBytesPerWildcardEdit = 6 };

    char *sequence, *filename;
    uint4 sequenceLength;
    int4 totalWilds = 0, alphabetType;
    struct memSingleBlock* wildcardEdits;
    struct wildcardEdit* wildcardEdit;
    char *wildcardData = NULL, *startWildcardData = NULL;

    // User must provide FASTA format file at command line
    if (argc < 2)
    {
        fprintf(stderr, "Useage: formatdb <FASTA file>\n");
        exit(-1);
    }
    filename = argv[1];

    // Initialize array to store wildcard edits
    wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10);

    // Determine if database is protein or nucleotide
    alphabetType = determineDbAlphabetType(filename);

    if (alphabetType == encoding_protein)
    {
        printf("PROTEIN database detected.\n");
    }
    else if (alphabetType == encoding_nucleotide)
    {
        printf("NUCLEOTIDE database detected.\n");
    }

    // Initialize codes array
    encoding_initialize(alphabetType);

    // Initialize writing to formatted database
    writedb_initialize(filename, alphabetType);

    // Open FASTA file for reading
    readFasta_open(filename);

    printf("Formatting database...");
    fflush(stdout);

    // Move through the FASTA file reading descriptions and sequences
    while (readFasta_readSequence())
    {
        // Get sequence just read
        sequence = readFasta_sequenceBuffer;
        sequenceLength = readFasta_sequenceLength;

        // Encode the sequence
        encoding_encodeSequence(sequence, sequenceLength, alphabetType);

        // Nucleotide sequences additionally carry a record of their wildcards
        if (alphabetType == encoding_nucleotide)
        {
            // Replace any wilds with a random character
            totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength);

            // Declare memory to hold wildcard data (buffer is reused across sequences)
            startWildcardData = global_realloc(startWildcardData,
                                               sizeof(char) * wildcardEdits->numEntries
                                               * maxBytesPerWildcardEdit);
            wildcardData = startWildcardData;

            // For each wildcard edit, encode details using chars and vbytes
            memSingleBlock_resetCurrent(wildcardEdits);
            while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL)
            {
                // Record wild character
                *wildcardData = wildcardEdit->code;
                wildcardData++;

                // Convert the position to a vbyte
                // NOTE(review): wildcardData is not advanced explicitly afterwards;
                // presumably vbyte_putVbyte is a macro that advances it - confirm.
                vbyte_putVbyte(wildcardData, wildcardEdit->position);
            }
        }
        else
        {
            startWildcardData = wildcardData = NULL;
        }

        // Add sequence to the formatted collection. The wildcard data length is
        // only computed when a buffer exists: subtracting two null pointers is
        // undefined behavior in C.
        writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer,
                            readFasta_descriptionLength, startWildcardData,
                            startWildcardData != NULL ? wildcardData - startWildcardData : 0,
                            NULL, 0);

        // Print status dots
        if (writedb_sequenceCount % 10000 == 0)
        {
            printf(".");
            fflush(stdout);
        }
    }

    // Close fasta reader
    readFasta_close();

    // Finalize writing to the formatted collection
    writedb_close();

    printf("done.\n");
    printf("%d sequences processed.\n", writedb_sequenceCount);
    printf("%llu letters processed.\n", writedb_numberOfLetters);
    printf("%d wildcards encoded.\n", totalWilds);
    printf("%d volume(s) created.\n", writedb_volume + 1);
    printf("Longest/shortest sequence was %d/%d letters\n",
           writedb_maximumSequenceLength, writedb_minimumSequenceLength);
    fflush(stdout);

    return 0;
}