// Process a given query position list void qPosList_processList(int2* queryPositions, int2 numQueryPositions, int4 codeword) { int4 listCount = 0, queryPositionCount, subset, present; struct memSingleBlock* list; struct queryPosition* queryPosition = NULL; struct codeword* newCodeword; // Iterative through existing query positions lists (ordered from longest to shortest) while (listCount < qPosList_numQPosLists) { // Check for one that contains a subset of to-be-added query positions list = qPosList_qPosLists + listCount; // Start by assuming it is subset = 1; // Iterate through each query position in the current existing list memSingleBlock_resetCurrent(list); while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset) { // Iterate through each query position in the new list (which is sorted) queryPositionCount = 0; while (queryPositionCount < numQueryPositions) { // Found a match, break out and proceed to next position in current list if (queryPosition->queryPosition == queryPositions[queryPositionCount]) { break; } // The query position is not present in the new list, then existing list // is not a subset of the new one else if (queryPosition->queryPosition < queryPositions[queryPositionCount]) { subset = 0; break; } // Otherwise keep going queryPositionCount++; } // If we got to the end of the list, and didn't find a match, not a subset if (queryPositionCount == numQueryPositions) subset = 0; // If the query positions in the existing list processed so far match all of // the positions in the new list if (list->currentEntry == numQueryPositions && subset) { // We have a match, starting here newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->codeword = codeword; newCodeword->next = queryPosition->codewords; queryPosition->codewords = newCodeword; return; } } if (subset) { // If this existing list is a subset of the new list then add the new/additional // query positions to the end of it queryPosition = memSingleBlock_getLastEntry(list); // Iterate through each query position in the new list while (numQueryPositions > 0) { numQueryPositions--; present = 0; // Check if present in the existing list memSingleBlock_resetCurrent(list); while ((queryPosition = memSingleBlock_getCurrent(list)) != NULL && subset) { // Found it if (queryPosition->queryPosition == queryPositions[numQueryPositions]) { present = 1; break; } } // Not present - add to the existing list with a null reference codeword if (!present) { queryPosition = memSingleBlock_newEntry(list); queryPosition->queryPosition = queryPositions[numQueryPositions]; // No refering codeword for any of the positions except the last queryPosition->codewords = NULL; } queryPositionCount++; } // Get the last, new query position queryPosition = memSingleBlock_getLastEntry(list); // Add reference codeword to the last query position (will become first) newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->next = NULL; newCodeword->codeword = codeword; queryPosition->codewords = newCodeword; // Re-sort the lists of query positions from longest to shortest qsort(qPosList_qPosLists, qPosList_numQPosLists, sizeof(struct memSingleBlock), qPosList_compareList); return; } listCount++; } // Instead use a new list of query positions list = qPosList_qPosLists + qPosList_numQPosLists; list->numEntries = 0; qPosList_numQPosLists++; // And copy values into it while (numQueryPositions > 0) { numQueryPositions--; queryPosition = memSingleBlock_newEntry(list); queryPosition->queryPosition = queryPositions[numQueryPositions]; // No refering codeword for any of the positions except the last queryPosition->codewords = NULL; } // Reference at the last query position (will become the first) to the // new query position list's codeword newCodeword = global_malloc(sizeof(struct codeword)); newCodeword->next = NULL; newCodeword->codeword = codeword; queryPosition->codewords = newCodeword; // Sort the lists from longest to shortest qsort(qPosList_qPosLists, qPosList_numQPosLists, sizeof(struct memSingleBlock), qPosList_compareList); }
int4 main(int argc, char* argv[]) { char *sequence, *filename; uint4 sequenceLength; int4 totalWilds = 0, alphabetType; struct memSingleBlock* wildcardEdits; struct wildcardEdit* wildcardEdit; char *wildcardData = NULL, *startWildcardData = NULL; // User must provide FASTA format file at command line if (argc < 2) { fprintf(stderr, "Useage: formatdb <FASTA file>\n"); exit(-1); } filename = argv[1]; // Initialize array to store wildcard edits wildcardEdits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10); // Determine if database is protein or nucleotide alphabetType = determineDbAlphabetType(filename); if (alphabetType == encoding_protein) { printf("PROTEIN database detected.\n"); } else if (alphabetType == encoding_nucleotide) { printf("NUCLEOTIDE database detected.\n"); } // Initialize codes array encoding_initialize(alphabetType); // Initialize writing to formatted database writedb_initialize(filename, alphabetType); // Open FASTA file for reading readFasta_open(filename); printf("Formatting database..."); fflush(stdout); // Move through the FASTA file reading descriptions and sequences while (readFasta_readSequence()) { // Get sequence just read sequence = readFasta_sequenceBuffer; sequenceLength = readFasta_sequenceLength; // Encode the sequence encoding_encodeSequence(sequence, sequenceLength, alphabetType); // Convert nucleotide sequences to byte-packed format if (alphabetType == encoding_nucleotide) { // Replace any wilds with a random character totalWilds += encoding_replaceWildcards(wildcardEdits, sequence, sequenceLength); // Declare memory to hold wildcard data startWildcardData = global_realloc(startWildcardData, sizeof(char) * wildcardEdits->numEntries * 5); wildcardData = startWildcardData; // For each wildcard edit, encode details using chars and vbytes memSingleBlock_resetCurrent(wildcardEdits); while ((wildcardEdit = memSingleBlock_getCurrent(wildcardEdits)) != NULL) { // Record wild character *wildcardData = wildcardEdit->code; wildcardData++; // Convert the position to a vbyte vbyte_putVbyte(wildcardData, wildcardEdit->position); } } else { startWildcardData = wildcardData = NULL; } // printf("[%s](%d)", readFasta_descriptionBuffer, readFasta_descriptionLength); fflush(stdout); // Add sequence to the formatted collection writedb_addSequence(sequence, sequenceLength, readFasta_descriptionBuffer, readFasta_descriptionLength, startWildcardData, wildcardData - startWildcardData, NULL, 0); // Print status dots if (writedb_sequenceCount % 10000 == 0) { printf("."); fflush(stdout); } } // Close fasta reader readFasta_close(); // Finalize writing to the formatted collection writedb_close(); printf("done.\n"); printf("%d sequences processed.\n", writedb_sequenceCount); printf("%llu letters processed.\n", writedb_numberOfLetters); printf("%d wildcards encoded.\n", totalWilds); printf("%d volume(s) created.\n", writedb_volume + 1); printf("Longest/shortest sequence was %d/%d letters\n", writedb_maximumSequenceLength, writedb_minimumSequenceLength); fflush(stdout); return 0; }