Exemple #1
0
// chooseWilds entry point.
//
// Usage: chooseWilds <database> <Wildcard score constant> <Wildcards output file>
//
// Steps performed:
//   1. Open the database named by argv[1] and print its summary statistics.
//   2. Walk every sequence of every volume; for each protein cluster (detected
//      by sequenceLength + 2 != encodedLength) count the wildcard occurences
//      among its children.
//   3. Greedily pick wildcards_numClusterWildcards - 1 candidate wildcards, one
//      at a time, each time keeping the wildcard that gives the best score
//      from wildcards_scoreCandidates.
//   4. Hill climb: repeatedly try replacing every position with every known
//      wildcard and keep the best-scoring replacement until nothing improves.
//   5. Append the default (all letters) wildcard, score the final set and write
//      it out with wildcards_outputWildcards.
//
// Returns 0 on success; exits with -1 if fewer than three arguments are given.
int4 main(int4 argc, char *argv[]) {
    unsigned char *filename, *sequence, code, *wildcardsFilename;
    uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
    uint4 encodedLength, numChildren, count;
    struct child *children;
    uint4 candidateNum, change, childNum, improved;
    uint4 numWilds = 0;
    struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
    struct wild *wildSubset, *newCandidates, *bestNewCandidates;
    uint4 sizeWildSubset, numOccurences, numCandidates;
    float defaultWildscore, candidatesScore, bestScore, currentScore;

    // User must provide database, scoring constant and output file at command line
    if (argc < 4) {
        fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                "<Wildcards output file>\n");
        exit(-1);
    }
    filename = argv[1];
    wildcards_scoringConstant = atof(argv[2]);
    wildcardsFilename = argv[3];

    readdb_open(filename);

    printf("Number of clusters = %u\n", readdb_numberOfClusters);
    printf("Number of sequences = %u\n", readdb_numberOfSequences);
    printf("Number of volumes = %u\n", readdb_numberOfVolumes);
    printf("Total number of letters = %llu\n", readdb_numberOfLetters);
    printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
    printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

    // Initialize codes array
    encoding_initialize(readdb_dbAlphabetType);

    // Load score matrix
    parameters_findScoringMatrix();
    wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

    // Count occurences of each wildcard set
    wildcards_initializeCountOccurences(readdb_longestSequenceLength);
    do {
        // Read each sequence in the collection
        while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                                   &descriptionLength, &encodedLength)) {
            // If a protein sequence cluster
            if (encoding_alphabetType == encoding_protein &&
                    sequenceLength + 2 != encodedLength) {
                // Get the children
                children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                              descriptionStart, &numChildren);

                // Add to list of occurences
                wildcards_countOccurences(children, numChildren, sequenceLength);

                // Release each child; the sequence pointer is freed from one
                // byte before its start, exactly as the original code did
                // (presumably matching how readdb_getChildren allocates it)
                childNum = 0;
                while (childNum < numChildren) {
                    free(children[childNum].edits);
                    free(children[childNum].sequence - 1);
                    childNum++;
                }

                free(children);
            }
        }
    } while (readdb_nextVolume());

    // Get final list of number of occurences of each wild
    wilds = wildcards_getOccurences(&numWilds);

    chooseWilds_printOccurenceMatrix(wilds, numWilds);

    // Build default wildcard: one bit set for every letter of the alphabet
    defaultWild.code = 0;
    defaultWild.count = 0;
    code = 0;
    while (code < encoding_numLetters) {
        setbit(defaultWild.code, code);
        code++;
    }

    // Get average score for default wildcard
    wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds,
                                     &sizeWildSubset, &numOccurences);
    defaultWildscore = wildcards_averageResidueWildMatch(defaultWild, wildSubset,
                       sizeWildSubset);
    // numOccurences is unsigned, so print it with %u
    printf("defaultWildScore=%f occurences=%u\n", defaultWildscore,
           numOccurences);

    // Build up list of wildcard candidates
    candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                 wildcards_numClusterWildcards);

    // Give the "best so far" slot a defined value so that it is never read
    // uninitialized when no wildcard achieves a score above zero
    bestNewCandidate = defaultWild;

    numCandidates = 0;
    while (numCandidates < wildcards_numClusterWildcards - 1) {
        // Explore each possible option to add to list of candidates
        count = 0;
        bestScore = 0;
        while (count < numWilds) {
            // Tentatively place this wildcard in the next free position
            candidates[numCandidates] = wilds[count];

            // Score a set of candidates
            candidatesScore = wildcards_scoreCandidates(
                                  candidates, numCandidates + 1, wilds, numWilds, defaultWildscore);
            if (candidatesScore > bestScore) {
                bestScore = candidatesScore;
                bestNewCandidate = wilds[count];
            }

            count++;
        }

        printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
        wildcards_printWildcard(bestNewCandidate.code);
        candidates[numCandidates] = bestNewCandidate;

        numCandidates++;
    }

    newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                    wildcards_numClusterWildcards);
    bestNewCandidates = (struct wild *)global_malloc(
                            sizeof(struct wild) * wildcards_numClusterWildcards);

    // Perform hill climbing; consider changing each position
    change = 1;
    while (change) {
        change = 0;
        improved = 0;
        candidateNum = 0;
        bestScore = 0;
        while (candidateNum < numCandidates) {
            // Start with current candidates. Only the numCandidates slots in
            // use are copied; the final slot is reserved for the default
            // wildcard and has not been written yet.
            memcpy(newCandidates, candidates,
                   sizeof(struct wild) * numCandidates);

            // Change current position to every possible candidate
            count = 0;
            while (count < numWilds) {
                newCandidates[candidateNum] = wilds[count];

                // Score a possible new set of candidates
                candidatesScore = wildcards_scoreCandidates(
                                      newCandidates, numCandidates, wilds, numWilds, defaultWildscore);

                // Check if best new candidates
                if (candidatesScore > bestScore) {
                    bestScore = candidatesScore;
                    memcpy(bestNewCandidates, newCandidates,
                           sizeof(struct wild) * numCandidates);
                    improved = 1;
                }

                count++;
            }

            candidateNum++;
        }

        // Update candidates, but only from a set that was actually recorded
        // during this pass; otherwise bestNewCandidates holds no valid data
        currentScore = wildcards_scoreCandidates(candidates, numCandidates, wilds,
                       numWilds, defaultWildscore);
        if (improved && bestScore > currentScore) {
            printf("New bestScore=%f\n", bestScore);
            memcpy(candidates, bestNewCandidates,
                   sizeof(struct wild) * numCandidates);
            change = 1;
        }

        candidateNum = 0;
        while (candidateNum < numCandidates) {
            wildcards_printWildcard(candidates[candidateNum].code);
            candidateNum++;
        }
    }

    // Print out final set of clusters with default wild added. The score
    // itself is discarded; the call is made immediately before the wildcards
    // are written out, as in the original code.
    candidates[numCandidates] = defaultWild;
    numCandidates++;
    wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                              defaultWildscore);
    wildcards_outputWildcards(wildcardsFilename);

    printf("%u sequences read.\n", readdb_numberOfSequences);
    fflush(stdout);

    free(candidates);
    free(newCandidates);
    free(bestNewCandidates);

    return 0;
}
// Given the alphabet type of the query, use the relevant parameter defaults.
//
// alphabetType is compared against encoding_nucleotide; every other value is
// treated as a protein alphabet.
//
// The function works in these stages:
//   1. Decode parameters_scoringSystem (0-5) into the bytepacked / semi-gapped /
//      restricted-insertion / table scoring flags. Values outside 0-5 leave
//      the flags as they were.
//   2. Fill in alphabet specific defaults (gap penalties, word size, trigger
//      and dropoff values, and for nucleotides the matrix name and word table
//      layout). Exits with -1 if a nucleotide word size below 11 is requested.
//   3. Choose the semi-gapped R1/R2 defaults according to the scoring flags.
//   4. Derive the open-gap values from the start and extend gap penalties.
//   5. Unless compiled with CAFEMODE, locate the scoring matrix for protein
//      searches.
//
// NOTE(review): parameters_setDefault is defined elsewhere; going by the
// comments in this function it only assigns when no value was supplied on the
// command line - confirm against its definition.
void parameters_loadDefaults(unsigned char alphabetType)
{
    struct defaultPenalties defaultPenalties;

    // Capacity of the generated nucleotide matrix name. "blastn matrix:" plus
    // two 32-bit signed integers, a space and the terminator needs at most 38
    // bytes, so 64 leaves ample room.
    enum { nucleotideMatrixNameSize = 64 };

    // Decode scoring system parameter
    if (alphabetType == encoding_nucleotide)
    {
        // Nucleotide case
        if (parameters_scoringSystem == 0)
        {
            parameters_bytepackedScoring = 1;
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 1)
        {
            parameters_bytepackedScoring = 0;
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 2)
        {
            parameters_bytepackedScoring = 0;
            parameters_semiGappedScoring = 1;
            parameters_restrictedInsertionScoring = 0;
        }
        else if (parameters_scoringSystem == 3)
        {
            parameters_bytepackedScoring = 0;
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 0;
        }
        else if (parameters_scoringSystem == 4)
        {
            parameters_bytepackedScoring = 0;
            parameters_semiGappedScoring = 1;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 5)
        {
            // Only this system switches on table driven scoring
            parameters_tableScoring = 1;
            parameters_bytepackedScoring = 0;
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 1;
        }
    }
    else
    {
        // Protein case: bytepacked scoring is never used
        parameters_bytepackedScoring = 0;
        if (parameters_scoringSystem == 0)
        {
            parameters_semiGappedScoring = 1;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 1)
        {
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 2)
        {
            parameters_semiGappedScoring = 1;
            parameters_restrictedInsertionScoring = 0;
        }
        else if (parameters_scoringSystem == 3)
        {
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 0;
        }
        else if (parameters_scoringSystem == 4)
        {
            parameters_semiGappedScoring = 1;
            parameters_restrictedInsertionScoring = 1;
        }
        else if (parameters_scoringSystem == 5)
        {
            parameters_semiGappedScoring = 0;
            parameters_restrictedInsertionScoring = 1;
        }
    }

    // If a nucleotide alphabet
    if (alphabetType == encoding_nucleotide)
    {
        // Use default nucleotide gap penalties
        defaultPenalties.startGap = 5;
        defaultPenalties.extendGap = 2;
        defaultPenalties.semiGappedDifference = 2;

        // Bytepacked alignment parameters
        parameters_setDefault(parameters_bytepackStartGap, 0);
        parameters_setDefault(parameters_bytepackExtendGap, defaultPenalties.extendGap);
        parameters_bytepackOpenGap = parameters_bytepackStartGap + parameters_bytepackExtendGap;

        // Cost of extending / opening a gap spanning four letters
        parameters_bytepackExtend4Gap = 4 * parameters_bytepackExtendGap;
        parameters_bytepackOpen4Gap = parameters_bytepackStartGap + parameters_bytepackExtend4Gap;

        parameters_setDefault(parameters_bytepackDropoffDecrease, 0);

        // Set default word size
        parameters_setDefault(parameters_wordSize, 11);

        // Set trigger and dropoffs
        parameters_setDefault(parameters_ungappedNormalizedTrigger, 25.0);
        parameters_setDefault(parameters_ungappedNormalizedDropoff, 20.0);
        parameters_setDefault(parameters_gappedNormalizedDropoff, 30.0);
        parameters_setDefault(parameters_gappedFinalNormalizedDropoff, 50.0);

        // Set matrix name for nucleotide search. snprintf bounds the write so
        // that large match / mismatch scores cannot overrun the buffer.
        parameters_scoringMatrix = (char*)global_malloc(sizeof(char) * nucleotideMatrixNameSize);
        snprintf(parameters_scoringMatrix, nucleotideMatrixNameSize, "blastn matrix:%d %d",
                 parameters_matchScore, parameters_mismatchScore);

        // Set values for performing hit detection
        if (parameters_wordSize >= 11)
        {
            parameters_wordTableBytes = 2;
            parameters_wordExtraBytes = (parameters_wordSize - 11) / 4;
            parameters_wordExtraLetters = ((parameters_wordSize - 11) % 4) + 3;
        }
        else
        {
            // Word sizes below 11 are currently disabled
            fprintf(stderr, "Error: Word size W=%d is too small\n", parameters_wordSize);
            fflush(stderr);
            exit(-1);
        }

        parameters_wordTableLetters = parameters_wordTableBytes * 4;

    }
    // If a protein alphabet
    else
    {
        // Get default penalties for selected protein scoring matrix
        defaultPenalties = parameters_getDefaultPenalties(parameters_scoringMatrix);

        // Set default word size
        parameters_setDefault(parameters_wordSize, 3);

        // Set trigger and dropoffs
        parameters_setDefault(parameters_ungappedNormalizedTrigger, 22.0);
        parameters_setDefault(parameters_ungappedNormalizedDropoff, 7.0);
        parameters_setDefault(parameters_gappedNormalizedDropoff, 15.0);
        parameters_setDefault(parameters_gappedFinalNormalizedDropoff, 25.0);
    }

    // If using byte-packed scoring instead of semi-gapped
    if (parameters_bytepackedScoring)
    {
        // Set byte-packed default R value
        parameters_setDefault(parameters_semiGappedR1, 0.85);
        parameters_setDefault(parameters_semiGappedR2, 1.2);
    }
    else if (parameters_tableScoring)
    {
        // Otherwise if using table driven scoring
        parameters_setDefault(parameters_semiGappedR1, 1.0);
        parameters_setDefault(parameters_semiGappedR2, 1.5);
    }
    else if (parameters_semiGappedScoring)
    {
        // Otherwise set semi-gapped default R value
        parameters_setDefault(parameters_semiGappedR1, 0.68);
        parameters_setDefault(parameters_semiGappedR2, 1.2);
    }
    else
    {
        // Regular gapped scoring
        parameters_setDefault(parameters_semiGappedR1, 1.0);
        parameters_setDefault(parameters_semiGappedR2, 1.0);
    }

    // If no open gap value given at the command line, use default
    parameters_setDefault(parameters_startGap, defaultPenalties.startGap);

    // If no extend gap value given at the command line, use default
    parameters_setDefault(parameters_extendGap, defaultPenalties.extendGap);

    // Invert sign of gap existence penalty and extend gap penalty if either is negative
    if (parameters_startGap < 0)
        parameters_startGap = -parameters_startGap;
    if (parameters_extendGap < 0)
        parameters_extendGap = -parameters_extendGap;

    // Calculate openGap from startGap
    parameters_openGap = parameters_startGap + parameters_extendGap;

    // If no semi-gapped open gap value given at the command line, use default difference
    parameters_setDefault(parameters_semiGappedStartGap,
        parameters_startGap - defaultPenalties.semiGappedDifference);

    // Use the same gap extend value for semi-gapped alignment
    parameters_semiGappedExtendGap = parameters_extendGap;

    // Invert sign of semi-gapped gap existence penalty if negative
    if (parameters_semiGappedStartGap < 0)
        parameters_semiGappedStartGap = -parameters_semiGappedStartGap;

    // Calculate semiGappedOpenGap from semiGappedStartGap
    parameters_semiGappedOpenGap = parameters_semiGappedStartGap + parameters_semiGappedExtendGap;

    // If not CAFE nor nucleotide search
    #ifndef CAFEMODE
    if (alphabetType != encoding_nucleotide)
    {
        parameters_findScoringMatrix();
    }
    #endif
}