/*
 * chooseWilds entry point.
 *
 * Usage: chooseWilds <database> <Wildcard score constant> <Wildcards output file>
 *
 * Steps performed:
 *   1. Open the database and print its summary statistics.
 *   2. Walk every sequence in every volume; for each protein cluster, fetch
 *      its children and count wildcard occurrences.
 *   3. Greedily pick (wildcards_numClusterWildcards - 1) candidate wildcards,
 *      each time keeping the wildcard that gives the highest candidate score.
 *   4. Hill-climb: repeatedly try replacing each chosen position with every
 *      known wildcard and keep the best strictly-improving replacement.
 *   5. Append the default (all-letters) wildcard, score the final set and
 *      write it to the output file.
 *
 * Returns 0 on success; exits with -1 when too few arguments are given.
 */
int4 main(int4 argc, char *argv[])
{
    unsigned char *filename, *sequence, code, *wildcardsFilename;
    uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
    uint4 encodedLength, numChildren, count;
    struct child *children;
    uint4 candidateNum, change, childNum;
    uint4 numWilds = 0;
    struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
    struct wild *wildSubset, *newCandidates, *bestNewCandidates;
    uint4 sizeWildSubset, numOccurences, numCandidates;
    float defaultWildscore, candidatesScore, bestScore, currentScore;
    size_t candidateBytes;
    int foundBetter;

    // User must provide database, scoring constant and output file
    if (argc < 4) {
        fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                        "<Wildcards output file>\n");
        exit(-1);
    }
    filename = argv[1];
    wildcards_scoringConstant = atof(argv[2]);
    wildcardsFilename = argv[3];

    readdb_open(filename);

    printf("Number of clusters = %u\n", readdb_numberOfClusters);
    printf("Number of sequences = %u\n", readdb_numberOfSequences);
    printf("Number of volumes = %u\n", readdb_numberOfVolumes);
    printf("Total number of letters = %llu\n", readdb_numberOfLetters);
    printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
    printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

    // Initialize codes array
    encoding_initialize(readdb_dbAlphabetType);

    // Load score matrix
    parameters_findScoringMatrix();
    wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

    // Count occurrences of each wildcard set
    wildcards_initializeCountOccurences(readdb_longestSequenceLength);

    do {
        // Read each sequence in the current volume
        while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                                   &descriptionLength, &encodedLength)) {
            // Only protein entries whose encoded length differs from
            // sequenceLength + 2 are treated as clusters
            if (encoding_alphabetType == encoding_protein &&
                sequenceLength + 2 != encodedLength) {
                // Get the children
                children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                              descriptionStart, &numChildren);

                // Add to list of occurrences
                wildcards_countOccurences(children, numChildren, sequenceLength);

                // Release the children.
                // NOTE(review): the sequence pointer is freed at (sequence - 1);
                // presumably the allocation starts one byte earlier - confirm
                // against readdb_getChildren.
                for (childNum = 0; childNum < numChildren; childNum++) {
                    free(children[childNum].edits);
                    free(children[childNum].sequence - 1);
                }
                free(children);
            }
        }
    } while (readdb_nextVolume());

    // Get final list of number of occurrences of each wild
    wilds = wildcards_getOccurences(&numWilds);
    chooseWilds_printOccurenceMatrix(wilds, numWilds);

    // Build default wildcard: one bit set for every letter of the alphabet
    defaultWild.code = 0;
    defaultWild.count = 0;
    for (code = 0; code < encoding_numLetters; code++) {
        setbit(defaultWild.code, code);
    }

    // Get average score for default wildcard
    // NOTE(review): wilds and wildSubset are never released here; ownership
    // is not visible from this function - confirm whether they should be freed.
    wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds, &sizeWildSubset,
                                     &numOccurences);
    defaultWildscore = wildcards_averageResidueWildMatch(defaultWild, wildSubset,
                                                         sizeWildSubset);
    printf("defaultWildScore=%f occurences=%u\n", defaultWildscore, numOccurences);

    // Build up list of wildcard candidates
    candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                              wildcards_numClusterWildcards);

    // Fallback so that a defined value is stored when no wildcard ever scores
    // above zero (previously an uninitialised struct was used in that case)
    bestNewCandidate = defaultWild;

    numCandidates = 0;
    while (numCandidates < wildcards_numClusterWildcards - 1) {
        // Explore each possible option to add to list of candidates
        bestScore = 0;
        for (count = 0; count < numWilds; count++) {
            candidates[numCandidates] = wilds[count];

            // Score a set of candidates
            candidatesScore = wildcards_scoreCandidates(candidates, numCandidates + 1,
                                                        wilds, numWilds,
                                                        defaultWildscore);

            if (candidatesScore > bestScore) {
                bestScore = candidatesScore;
                bestNewCandidate = wilds[count];
            }
        }

        printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
        wildcards_printWildcard(bestNewCandidate.code);

        candidates[numCandidates] = bestNewCandidate;
        numCandidates++;
    }

    newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                                 wildcards_numClusterWildcards);
    bestNewCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                                     wildcards_numClusterWildcards);

    // Number of bytes occupied by the candidates chosen so far. The original
    // expression "sizeof(struct wild) * wildcards_numClusterWildcards - 1"
    // subtracted one BYTE instead of one element.
    candidateBytes = sizeof(struct wild) * numCandidates;

    // Perform hill climbing; consider changing each position
    change = 1;
    while (change) {
        change = 0;
        bestScore = 0;
        foundBetter = 0;

        for (candidateNum = 0; candidateNum < numCandidates; candidateNum++) {
            // Start with current candidates
            memcpy(newCandidates, candidates, candidateBytes);

            // Change current position to every possible candidate
            for (count = 0; count < numWilds; count++) {
                newCandidates[candidateNum] = wilds[count];

                // Score a possible new set of candidates
                candidatesScore = wildcards_scoreCandidates(newCandidates, numCandidates,
                                                            wilds, numWilds,
                                                            defaultWildscore);

                // Check if best new candidates
                if (candidatesScore > bestScore) {
                    bestScore = candidatesScore;
                    memcpy(bestNewCandidates, newCandidates, candidateBytes);
                    foundBetter = 1;
                }
            }
        }

        // Update candidates only when this pass actually recorded a set in
        // bestNewCandidates and that set beats the current one
        currentScore = wildcards_scoreCandidates(candidates, numCandidates, wilds,
                                                 numWilds, defaultWildscore);
        if (foundBetter && bestScore > currentScore) {
            printf("New bestScore=%f\n", bestScore);
            memcpy(candidates, bestNewCandidates, candidateBytes);
            change = 1;
        }

        for (candidateNum = 0; candidateNum < numCandidates; candidateNum++) {
            wildcards_printWildcard(candidates[candidateNum].code);
        }
    }

    // Print out final set of clusters with default wild added
    candidates[numCandidates] = defaultWild;
    numCandidates++;

    // Score the final set before writing it out
    // NOTE(review): the return value is ignored, so this call presumably
    // records state that wildcards_outputWildcards relies on - confirm.
    wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                              defaultWildscore);

    wildcards_outputWildcards(wildcardsFilename);

    printf("%u sequences read.\n", readdb_numberOfSequences);
    fflush(stdout);

    free(candidates);
    free(newCandidates);
    free(bestNewCandidates);

    return 0;
}
// Given the alphabet type of the query, use the relevant parameter defaults void parameters_loadDefaults(unsigned char alphabetType) { struct defaultPenalties defaultPenalties; // Decode scoring system parameter if (alphabetType == encoding_nucleotide) { // Nulceotide case if (parameters_scoringSystem == 0) { parameters_bytepackedScoring = 1; parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 1) { parameters_bytepackedScoring = 0; parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 2) { parameters_bytepackedScoring = 0; parameters_semiGappedScoring = 1; parameters_restrictedInsertionScoring = 0; } else if (parameters_scoringSystem == 3) { parameters_bytepackedScoring = 0; parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 0; } else if (parameters_scoringSystem == 4) { parameters_bytepackedScoring = 0; parameters_semiGappedScoring = 1; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 5) { parameters_tableScoring = 1; parameters_bytepackedScoring = 0; parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 1; } } else { // Protein case parameters_bytepackedScoring = 0; if (parameters_scoringSystem == 0) { parameters_semiGappedScoring = 1; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 1) { parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 2) { parameters_semiGappedScoring = 1; parameters_restrictedInsertionScoring = 0; } else if (parameters_scoringSystem == 3) { parameters_semiGappedScoring = 0; parameters_restrictedInsertionScoring = 0; } else if (parameters_scoringSystem == 4) { parameters_semiGappedScoring = 1; parameters_restrictedInsertionScoring = 1; } else if (parameters_scoringSystem == 5) { parameters_semiGappedScoring = 0; 
parameters_restrictedInsertionScoring = 1; } } // If a nucleotide alphabet if (alphabetType == encoding_nucleotide) { // Use default nucleotide gap penalties defaultPenalties.startGap = 5; defaultPenalties.extendGap = 2; defaultPenalties.semiGappedDifference = 2; // Bytepacked alignment parameters parameters_setDefault(parameters_bytepackStartGap, 0); parameters_setDefault(parameters_bytepackExtendGap, defaultPenalties.extendGap); parameters_bytepackOpenGap = parameters_bytepackStartGap + parameters_bytepackExtendGap; parameters_bytepackExtend4Gap = 4 * parameters_bytepackExtendGap; parameters_bytepackOpen4Gap = parameters_bytepackStartGap + parameters_bytepackExtend4Gap; parameters_setDefault(parameters_bytepackDropoffDecrease, 0); // Set default word size parameters_setDefault(parameters_wordSize, 11); // Set trigger and dropoffs parameters_setDefault(parameters_ungappedNormalizedTrigger, 25.0); parameters_setDefault(parameters_ungappedNormalizedDropoff, 20.0); parameters_setDefault(parameters_gappedNormalizedDropoff, 30.0); parameters_setDefault(parameters_gappedFinalNormalizedDropoff, 50.0); // Set matrix name for nucleotide search parameters_scoringMatrix = (char*)global_malloc(sizeof(char) * 30); sprintf(parameters_scoringMatrix, "blastn matrix:%d %d", parameters_matchScore, parameters_mismatchScore); // Set values for performing hit detection if (parameters_wordSize >= 11) { parameters_wordTableBytes = 2; parameters_wordExtraBytes = (parameters_wordSize - 11) / 4; parameters_wordExtraLetters = ((parameters_wordSize - 11) % 4) + 3; } // Current disabled word size < 11 /* else if (parameters_wordSize >= 7) { parameters_wordTableBytes = 1; parameters_wordExtraBytes = 0; parameters_wordExtraLetters = (parameters_wordSize - 4); }*/ else { fprintf(stderr, "Error: Word size W=%d is too small\n", parameters_wordSize); fflush(stderr); exit(-1); } parameters_wordTableLetters = parameters_wordTableBytes * 4; } // If a protein alphabet else { // Get default penalties 
for selected protein scoring matrix defaultPenalties = parameters_getDefaultPenalties(parameters_scoringMatrix); // Set default word size parameters_setDefault(parameters_wordSize, 3); parameters_setDefault(parameters_ungappedNormalizedTrigger, 22.0); parameters_setDefault(parameters_ungappedNormalizedDropoff, 7.0); parameters_setDefault(parameters_gappedNormalizedDropoff, 15.0); parameters_setDefault(parameters_gappedFinalNormalizedDropoff, 25.0); } // If using byte-packed scoring instead of semi-gapped if (parameters_bytepackedScoring) { // Set byte-packed default R value parameters_setDefault(parameters_semiGappedR1, 0.85); parameters_setDefault(parameters_semiGappedR2, 1.2); } else if (parameters_tableScoring) { // Otherwise if using table driven scoring parameters_setDefault(parameters_semiGappedR1, 1.0); parameters_setDefault(parameters_semiGappedR2, 1.5); } else if (parameters_semiGappedScoring) { // Otherwise set semi-gapped default R value parameters_setDefault(parameters_semiGappedR1, 0.68); parameters_setDefault(parameters_semiGappedR2, 1.2); } else { // Regular gapped scoring parameters_setDefault(parameters_semiGappedR1, 1.0); parameters_setDefault(parameters_semiGappedR2, 1.0); } // If no open gap value given at the command line, use default parameters_setDefault(parameters_startGap, defaultPenalties.startGap); // If no extend gap value given at the command line, use default parameters_setDefault(parameters_extendGap, defaultPenalties.extendGap); // Invert sign of gap existance penalty and extend gap penalty if either is negative if (parameters_startGap < 0) parameters_startGap = -parameters_startGap; if (parameters_extendGap < 0) parameters_extendGap = -parameters_extendGap; // Calculate openGap from startGap parameters_openGap = parameters_startGap + parameters_extendGap; // If no semi-gapped open gap value given at the command line, use default difference parameters_setDefault(parameters_semiGappedStartGap, parameters_startGap - 
defaultPenalties.semiGappedDifference); // Use the same gap extend value for semi-gapped alignment parameters_semiGappedExtendGap = parameters_extendGap; // Invert sign of semi-gapped gap existance penalty if negative if (parameters_semiGappedStartGap < 0) parameters_semiGappedStartGap = -parameters_semiGappedStartGap; // Calculate semiGappedOpenGap from semiGappedStartGap parameters_semiGappedOpenGap = parameters_semiGappedStartGap + parameters_semiGappedExtendGap; // If not CAFE nor nucleotide search #ifndef CAFEMODE if (alphabetType != encoding_nucleotide) { parameters_findScoringMatrix(); } #endif }