int main(int argc, char *argv[]) { char *filename, *sequence, *description, *sequenceCopy; int sequenceLength; // User must provide FASTA format file at command line if (argc < 2) { fprintf(stderr, "Useage: dust <FASTA file>\n"); exit(-1); } filename = argv[1]; // Initialize encoding routines encoding_initialize(encoding_nucleotide); // Open FASTA file for reading readFasta_open(filename); // Read each sequence from the file while (readFasta_readSequence()) { // Get sequence just read sequence = readFasta_sequenceBuffer; description = readFasta_descriptionBuffer; sequenceLength = readFasta_sequenceLength; // Make in-memory copy of it sequenceCopy = (char *)global_malloc(sequenceLength); strcpy(sequenceCopy, sequence); // Perform dust filtering dust_dustSequence(sequence); // Print description and filtering sequence printf(">%s\n%s\n", description, sequence); } }
/*
 * Pick query sequences out of a formatted database.
 *
 * Usage: indexdb <DB filename> <Query Length> <Number of Sequences>
 *
 * Walks the sequences of the opened volume in index order and, for every
 * sequence whose length is at least <Query Length>, calls print_sequence();
 * the first 50 matches are additionally passed to fprint_sequence() together
 * with their 1-based match number. Stops after <Number of Sequences> matches.
 *
 * Returns 0 on completion; exits with -1 when arguments are missing.
 */
int4 main(int4 argc, char* argv[])
{
    // User must provide database name, query length and sequence count
    if (argc < 4) {
        fprintf(stderr, "Useage: indexdb <DB filename> <Query Length> <Number of Sequences>\n");
        exit(-1);
    }

    char *filename = argv[1];
    int queryLen = atoi(argv[2]);
    int numSeqs = atoi(argv[3]);

    // Open sequence data file and read information
    encoding_initialize(encoding_protein);
    readdb_open_mem(filename);

    int ii;
    int queryCnt = 0;

    for (ii = 0; ii < readdb_numVolumeSequences && queryCnt < numSeqs; ii++) {
        if (readdb_sequenceData[ii].sequenceLength >= queryLen) {
            print_sequence(ii);
            if (queryCnt < 50)
                fprint_sequence(ii, queryCnt + 1);
            queryCnt++;
        }
    }

    // When the scan runs off the end of the volume, ii equals
    // readdb_numVolumeSequences and indexing readdb_sequenceData with it would
    // read past the last entry, so the diagnostic is only emitted when ii
    // still refers to a sequence of the volume.
    if (ii < readdb_numVolumeSequences) {
        fprintf(stderr, "maxQueryLength: %d\n", readdb_sequenceData[ii].sequenceLength);
    }

    // Close database
    readdb_close_mem();
    return 0;
}
int4 main(int argc, char* argv[]) { char *sequence, *filename; uint4 sequenceLength; int4 totalWilds = 0, alphabetType; struct memSingleBlock* wildcardEdits; struct wildcardEdit* wildcardEdit; char *wildcardData = NULL, *startWildcardData = NULL; // User must provide FASTA format file at command line if (argc < 2) { fprintf(stderr, "Useage: formatdb <FASTA file>\n"); exit(-1); } filename = argv[1]; // Initialize array to store wildcard edits wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10); // Determine if database is protein or nucleotide alphabetType = determineDbAlphabetType(filename); if (alphabetType == encoding_protein) { printf("PROTEIN database detected.\n"); } else if (alphabetType == encoding_nucleotide) { printf("NUCLEOTIDE database detected.\n"); } // Initialize codes array encoding_initialize(alphabetType); // Initialize writing to formatted database writedb_initialize(filename, alphabetType); // Open FASTA file for reading readFasta_open(filename); printf("Formatting database..."); fflush(stdout); // Move through the FASTA file reading descriptions and sequences while (readFasta_readSequence()) { // Get sequence just read sequence = readFasta_sequenceBuffer; sequenceLength = readFasta_sequenceLength; // Encode the sequence encoding_encodeSequence(sequence, sequenceLength, alphabetType); // Convert nucleotide sequences to byte-packed format if (alphabetType == encoding_nucleotide) { // Replace any wilds with a random character totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength); // Declare memory to hold wildcard data startWildcardData = global_realloc(startWildcardData, sizeof(char) * wildcardEdits->numEntries * 5); wildcardData = startWildcardData; // For each wildcard edit, encode details using chars and vbytes memSingleBlock_resetCurrent(wildcardEdits); while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL) { // Record wild character *wildcardData = wildcardEdit->code; 
wildcardData++; // Convert the position to a vbyte vbyte_putVbyte(wildcardData, wildcardEdit->position); } } else { startWildcardData = wildcardData = NULL; } // printf("[%s](%d)", readFasta_descriptionBuffer, readFasta_descriptionLength); fflush(stdout); // Add sequence to the formatted collection writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer, readFasta_descriptionLength, startWildcardData, wildcardData - startWildcardData, NULL, 0); // Print status dots if (writedb_sequenceCount % 10000 == 0) { printf("."); fflush(stdout); } } // Close fasta reader readFasta_close(); // Finalize writing to the formatted collection writedb_close(); printf("done.\n"); printf("%d sequences processed.\n", writedb_sequenceCount); printf("%llu letters processed.\n", writedb_numberOfLetters); printf("%d wildcards encoded.\n", totalWilds); printf("%d volume(s) created.\n", writedb_volume + 1); printf("Longest/shortest sequence was %d/%d letters\n", writedb_maximumSequenceLength, writedb_minimumSequenceLength); fflush(stdout); return 0; }
/*
 * chooseWilds: choose a set of cluster wildcards for a clustered database.
 *
 * Usage: chooseWilds <database> <Wildcard score constant> <Wildcards output file>
 *
 * Steps performed:
 *   1. Open the database and print its statistics.
 *   2. For every protein sequence cluster in every volume, fetch its children
 *      and feed them to wildcards_countOccurences().
 *   3. Build the default wildcard (a bit set for every letter code) and
 *      compute its average score.
 *   4. Greedily select wildcards_numClusterWildcards - 1 candidates, each
 *      round keeping the wildcard that gives the best candidate-set score.
 *   5. Hill-climb: repeatedly try replacing each position with every known
 *      wildcard and keep the best replacement while the score improves.
 *   6. Append the default wildcard, score the final set and write it with
 *      wildcards_outputWildcards().
 *
 * Returns 0 on completion; exits with -1 when arguments are missing.
 */
int4 main(int4 argc, char *argv[])
{
    unsigned char *filename, *sequence, code, *wildcardsFilename;
    uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
    uint4 encodedLength, numChildren, count;
    struct child *children;
    uint4 candidateNum, change, childNum;
    uint4 numWilds = 0;
    struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
    struct wild *wildSubset, *newCandidates, *bestNewCandidates;
    uint4 sizeWildSubset, numOccurences, numCandidates;
    float defaultWildscore, candidatesScore, bestScore;

    // User must provide database, scoring constant and output file
    if (argc < 4) {
        fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                        "<Wildcards output file>\n");
        exit(-1);
    }
    filename = (unsigned char *)argv[1];
    wildcards_scoringConstant = atof(argv[2]);
    wildcardsFilename = (unsigned char *)argv[3];

    readdb_open(filename);

    printf("Number of clusters = %u\n", readdb_numberOfClusters);
    printf("Number of sequences = %u\n", readdb_numberOfSequences);
    printf("Number of volumes = %u\n", readdb_numberOfVolumes);
    printf("Total number of letters = %llu\n", readdb_numberOfLetters);
    printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
    printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

    // Initialize codes array
    encoding_initialize(readdb_dbAlphabetType);

    // Load score matrix
    parameters_findScoringMatrix();
    wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

    // Count occurences of each wildcard set
    wildcards_initializeCountOccurences(readdb_longestSequenceLength);

    do {
        // Read each sequence in the collection
        while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                                   &descriptionLength, &encodedLength)) {
            // If a protein sequence cluster
            if (encoding_alphabetType == encoding_protein &&
                sequenceLength + 2 != encodedLength) {
                // Get the children
                children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                              descriptionStart, &numChildren);

                // Add to list of occurences
                wildcards_countOccurences(children, numChildren, sequenceLength);

                // Release the children; the sequence pointer is freed one
                // byte before the address stored in the child record
                childNum = 0;
                while (childNum < numChildren) {
                    free(children[childNum].edits);
                    free(children[childNum].sequence - 1);
                    childNum++;
                }
                free(children);
            }
        }
    } while (readdb_nextVolume());

    // Get final list of number of occurences of each wild
    wilds = wildcards_getOccurences(&numWilds);
    chooseWilds_printOccurenceMatrix(wilds, numWilds);

    // Build default wildcard: one bit set for every letter code
    defaultWild.code = 0;
    defaultWild.count = 0;
    code = 0;
    while (code < encoding_numLetters) {
        setbit(defaultWild.code, code);
        code++;
    }

    // Get average score for default wildcard
    wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds, &sizeWildSubset,
                                     &numOccurences);
    defaultWildscore =
        wildcards_averageResidueWildMatch(defaultWild, wildSubset, sizeWildSubset);
    printf("defaultWildScore=%f occurences=%u\n", defaultWildscore, numOccurences);

    // Build up list of wildcard candidates
    candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                              wildcards_numClusterWildcards);

    // Seed the best pick so it is never read uninitialized when no wildcard
    // scores above 0 in the first round; later rounds keep the previous pick,
    // exactly as before.
    bestNewCandidate = defaultWild;

    numCandidates = 0;
    while (numCandidates < wildcards_numClusterWildcards - 1) {
        // Explore each possible option to add to list of candidates
        count = 0;
        bestScore = 0;
        while (count < numWilds) {
            candidates[numCandidates] = wilds[count];

            // Score a set of candidates
            candidatesScore = wildcards_scoreCandidates(
                candidates, numCandidates + 1, wilds, numWilds, defaultWildscore);

            if (candidatesScore > bestScore) {
                bestScore = candidatesScore;
                bestNewCandidate = wilds[count];
            }
            count++;
        }

        printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
        wildcards_printWildcard(bestNewCandidate.code);

        candidates[numCandidates] = bestNewCandidate;
        numCandidates++;
    }

    newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                                 wildcards_numClusterWildcards);
    bestNewCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                                                     wildcards_numClusterWildcards);

    // Perform hill climbing; consider changing each position.
    // The copies below move exactly numCandidates elements. The earlier
    // expression "sizeof(struct wild) * wildcards_numClusterWildcards - 1"
    // subtracted one BYTE from the full array size instead of one element.
    // NOTE(review): bestNewCandidates is only written when some score exceeds
    // 0; confirm scores cannot be negative, otherwise the update below could
    // copy from a buffer that was never filled.
    change = 1;
    while (change) {
        change = 0;
        candidateNum = 0;
        bestScore = 0;
        while (candidateNum < numCandidates) {
            // Start with current candidates
            memcpy(newCandidates, candidates, sizeof(struct wild) * numCandidates);

            // Change current position to every possible candidate
            count = 0;
            while (count < numWilds) {
                newCandidates[candidateNum] = wilds[count];

                // Score a possible new set of candidates
                candidatesScore = wildcards_scoreCandidates(
                    newCandidates, numCandidates, wilds, numWilds, defaultWildscore);

                // Check if best new candidates
                if (candidatesScore > bestScore) {
                    bestScore = candidatesScore;
                    memcpy(bestNewCandidates, newCandidates,
                           sizeof(struct wild) * numCandidates);
                }
                count++;
            }
            candidateNum++;
        }

        // Update candidates when the best replacement beats the current set
        if (bestScore > wildcards_scoreCandidates(candidates, numCandidates, wilds,
                                                  numWilds, defaultWildscore)) {
            printf("New bestScore=%f\n", bestScore);
            memcpy(candidates, bestNewCandidates,
                   sizeof(struct wild) * numCandidates);
            change = 1;
        }

        candidateNum = 0;
        while (candidateNum < numCandidates) {
            wildcards_printWildcard(candidates[candidateNum].code);
            candidateNum++;
        }
    }

    // Print out final set of clusters with default wild added
    candidates[numCandidates] = defaultWild;
    numCandidates++;
    wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                              defaultWildscore);

    wildcards_outputWildcards(wildcardsFilename);

    printf("%u sequences read.\n", readdb_numberOfSequences);
    fflush(stdout);

    free(candidates);
    free(newCandidates);
    free(bestNewCandidates);

    return 0;
}