Example #1
0
/*
 * dust: run dust filtering over every sequence in a FASTA file and print
 * each description followed by the filtered sequence on stdout.
 *
 * Usage: dust <FASTA file>
 *
 * Returns 0 once every sequence has been processed; exits with status -1
 * when no input file is named on the command line.
 */
int main(int argc, char *argv[]) {
  char *filename, *sequence, *description;

  // User must provide FASTA format file at command line
  if (argc < 2) {
    fprintf(stderr, "Useage: dust <FASTA file>\n");
    exit(-1);
  }
  filename = argv[1];

  // Initialize encoding routines
  encoding_initialize(encoding_nucleotide);

  // Open FASTA file for reading
  readFasta_open(filename);

  // Read each sequence from the file
  while (readFasta_readSequence()) {
    // Get sequence and description just read
    sequence = readFasta_sequenceBuffer;
    description = readFasta_descriptionBuffer;

    // The previous per-sequence copy of the buffer was dropped: it was
    // allocated one byte too small for strcpy's terminating NUL, was never
    // read, and was never released.

    // Perform dust filtering; the result is printed from the same buffer,
    // so the routine presumably filters in place (confirm in dust module).
    dust_dustSequence(sequence);

    // Print description and filtered sequence
    printf(">%s\n%s\n", description, sequence);
  }

  return 0;
}
Example #2
0
/*
 * indexdb: pick query sequences out of a formatted database.
 *
 * Usage: indexdb <DB filename> <Query Length> <Number of Sequences>
 *
 * Walks the sequences of the opened volume in order and emits, via
 * print_sequence(), each one whose length is at least <Query Length>,
 * stopping after <Number of Sequences> have been emitted. The first 50
 * emitted sequences are additionally passed to fprint_sequence() with a
 * 1-based ordinal. The length of the longest emitted sequence is reported
 * on stderr (0 when nothing was emitted).
 *
 * Returns 0 on completion; exits with status -1 on missing arguments.
 */
int4 main(int4 argc, char* argv[])
{
    // User must provide database name, minimum length and sequence count
    if (argc < 4)
    {
        fprintf(stderr, "Useage: indexdb <DB filename> <Query Length> <Number of Sequences>\n");
        exit(-1);
    }

    char *filename = argv[1];
    int queryLen = atoi(argv[2]);
    int numSeqs = atoi(argv[3]);

    // Open sequence data file and read information
    encoding_initialize(encoding_protein);
    readdb_open_mem(filename);

    int ii;
    int queryCnt = 0;
    // Longest sequence emitted so far. The old code read
    // readdb_sequenceData[ii] after the loop, which is one past the end of
    // the array when the loop exhausts the volume and, otherwise, is the
    // length of a sequence that was never emitted.
    int maxQueryLength = 0;

    for (ii = 0; ii < readdb_numVolumeSequences && queryCnt < numSeqs; ii++)
    {
        if (readdb_sequenceData[ii].sequenceLength >= queryLen)
        {
            int length = (int)readdb_sequenceData[ii].sequenceLength;

            print_sequence(ii);
            if (queryCnt < 50)
                fprint_sequence(ii, queryCnt + 1);

            if (length > maxQueryLength)
                maxQueryLength = length;

            queryCnt++;
        }
    }

    fprintf(stderr, "maxQueryLength: %d\n", maxQueryLength);

    // Close database
    readdb_close_mem();
    return 0;
}
Example #3
0
/*
 * formatdb: convert a FASTA file into the formatted database written by
 * the writedb module.
 *
 * Usage: formatdb <FASTA file>
 *
 * Each sequence is encoded for the detected alphabet. For nucleotide
 * input, wildcard edits reported by encoding_replaceWildcards() are
 * serialised (one code byte followed by a vbyte position per edit) and
 * handed to writedb_addSequence() together with the sequence. Summary
 * statistics are printed on stdout at the end.
 *
 * Returns 0 on completion; exits with status -1 when no file is given.
 */
int4 main(int argc, char* argv[])
{
    char *fastaPath;
    char *residues;
    uint4 numResidues;
    int4 dbAlphabet;
    int4 wildcardTotal = 0;
    struct memSingleBlock* edits;
    struct wildcardEdit* edit;
    char *editBuffer = NULL;   // start of the serialised wildcard edits
    char *editCursor = NULL;   // write position inside editBuffer

    // A FASTA file name is mandatory
    if (argc < 2)
    {
        fprintf(stderr, "Useage: formatdb <FASTA file>\n");
        exit(-1);
    }
    fastaPath = argv[1];

    // Container that collects the wildcard edits of one sequence
    edits = memSingleBlock_initialize(sizeof(struct wildcardEdit), 10);

    // Work out which alphabet the input uses and report it
    dbAlphabet = determineDbAlphabetType(fastaPath);

    if (dbAlphabet == encoding_protein)
    {
        printf("PROTEIN database detected.\n");
    }
    else if (dbAlphabet == encoding_nucleotide)
    {
        printf("NUCLEOTIDE database detected.\n");
    }

    // Set up encoding tables, the database writer and the FASTA reader
    encoding_initialize(dbAlphabet);
    writedb_initialize(fastaPath, dbAlphabet);
    readFasta_open(fastaPath);

    printf("Formatting database...");
    fflush(stdout);

    // One iteration per FASTA record
    while (readFasta_readSequence())
    {
        residues = readFasta_sequenceBuffer;
        numResidues = readFasta_sequenceLength;

        // Encode the sequence
        encoding_encodeSequence(residues, numResidues, dbAlphabet);

        if (dbAlphabet != encoding_nucleotide)
        {
            // No wildcard data is passed for other alphabets
            editBuffer = editCursor = NULL;
        }
        else
        {
            // Substitute wildcards and remember how many were replaced
            wildcardTotal += encoding_replaceWildcards(edits, residues, numResidues);

            // Reserve 5 bytes per recorded edit for its serialised form
            editBuffer = global_realloc(editBuffer,
                                        sizeof(char) * edits->numEntries * 5);
            editCursor = editBuffer;

            // Serialise every edit: the wildcard code, then its position
            memSingleBlock_resetCurrent(edits);
            for (edit = memSingleBlock_getCurrent(edits);
                 edit != NULL;
                 edit = memSingleBlock_getCurrent(edits))
            {
                *editCursor++ = edit->code;

                // Presumably advances editCursor past the bytes written;
                // the length passed to the writer below relies on it.
                vbyte_putVbyte(editCursor, edit->position);
            }
        }

        // Hand sequence, description and wildcard data to the writer
        writedb_addSequence(residues, numResidues, readFasta_descriptionBuffer,
                            readFasta_descriptionLength, editBuffer,
                            editCursor - editBuffer, NULL, 0);

        // Progress indicator: one dot per 10000 sequences
        if (writedb_sequenceCount % 10000 == 0)
        {
            printf(".");
            fflush(stdout);
        }
    }

    // Shut down the reader, then finalise the formatted collection
    readFasta_close();
    writedb_close();

    // Summary
    printf("done.\n");
    printf("%d sequences processed.\n", writedb_sequenceCount);
    printf("%llu letters processed.\n", writedb_numberOfLetters);
    printf("%d wildcards encoded.\n", wildcardTotal);
    printf("%d volume(s) created.\n", writedb_volume + 1);
    printf("Longest/shortest sequence was %d/%d letters\n",
           writedb_maximumSequenceLength, writedb_minimumSequenceLength);
    fflush(stdout);

    return 0;
}
Example #4
0
/*
 * chooseWilds: select a set of cluster wildcards for a clustered database.
 *
 * Usage: chooseWilds <database> <Wildcard score constant>
 *                    <Wildcards output file>
 *
 * Steps:
 *   1. Open the database, print its statistics and load the score matrix.
 *   2. For every protein cluster (an entry whose encoded length differs
 *      from sequenceLength + 2) count the wildcard occurrences among its
 *      children.
 *   3. Score the default wildcard, whose code has one bit set per letter
 *      of the alphabet.
 *   4. Greedily pick wildcards_numClusterWildcards - 1 candidates, each
 *      time adding the wildcard that scores best with those already picked.
 *   5. Hill-climb: repeatedly apply the best single-position replacement
 *      for as long as it improves on the current score.
 *   6. Append the default wildcard, rescore and write the result to the
 *      output file.
 *
 * Returns 0 on completion; exits with status -1 on missing arguments.
 */
int4 main(int4 argc, char *argv[]) {
    unsigned char *filename, *sequence, code, *wildcardsFilename;
    uint4 descriptionStart = 0, descriptionLength = 0, sequenceLength;
    uint4 encodedLength, numChildren, count;
    struct child *children;
    uint4 candidateNum, change, childNum, improved;
    uint4 numWilds = 0;
    struct wild *wilds, defaultWild, *candidates, bestNewCandidate;
    struct wild *wildSubset, *newCandidates, *bestNewCandidates;
    uint4 sizeWildSubset, numOccurences, numCandidates;
    float defaultWildscore, candidatesScore, bestScore;

    // User must provide database, scoring constant and output file
    if (argc < 4) {
        fprintf(stderr, "Useage: chooseWilds <database> <Wildcard score constant> "
                "<Wildcards output file>\n");
        exit(-1);
    }
    filename = argv[1];
    wildcards_scoringConstant = atof(argv[2]);
    wildcardsFilename = argv[3];

    readdb_open(filename);

    printf("Number of clusters = %u\n", readdb_numberOfClusters);
    printf("Number of sequences = %u\n", readdb_numberOfSequences);
    printf("Number of volumes = %u\n", readdb_numberOfVolumes);
    printf("Total number of letters = %llu\n", readdb_numberOfLetters);
    printf("Length of longest sequence = %u\n", readdb_longestSequenceLength);
    printf("Alphabet type = %s\n", encoding_alphabetTypes[readdb_dbAlphabetType]);

    // Initialize codes array
    encoding_initialize(readdb_dbAlphabetType);

    // Load score matrix
    parameters_findScoringMatrix();
    wildcards_scoreMatrix = scoreMatrix_load(parameters_scoringMatrixPath);

    // Count occurences of each wildcard set
    wildcards_initializeCountOccurences(readdb_longestSequenceLength);
    do {
        // Read each sequence in the current volume
        while (readdb_readSequence(&sequence, &sequenceLength, &descriptionStart,
                                   &descriptionLength, &encodedLength)) {
            // If a protein sequence cluster
            if (encoding_alphabetType == encoding_protein &&
                    sequenceLength + 2 != encodedLength) {
                // Get the children
                children = readdb_getChildren(sequence, sequenceLength, encodedLength,
                                              descriptionStart, &numChildren);

                // Add to list of occurences
                wildcards_countOccurences(children, numChildren, sequenceLength);

                // Release the per-child buffers, then the array itself.
                // The sequence pointer is freed at offset -1, so the
                // allocation presumably starts one byte before it.
                childNum = 0;
                while (childNum < numChildren) {
                    free(children[childNum].edits);
                    free(children[childNum].sequence - 1);
                    childNum++;
                }

                free(children);
            }
        }
    } while (readdb_nextVolume());

    // Get final list of number of occurences of each wild
    wilds = wildcards_getOccurences(&numWilds);

    chooseWilds_printOccurenceMatrix(wilds, numWilds);

    // Build default wildcard: one bit per letter of the alphabet
    defaultWild.code = 0;
    defaultWild.count = 0;
    code = 0;
    while (code < encoding_numLetters) {
        setbit(defaultWild.code, code);
        code++;
    }

    // Get average score for default wildcard
    wildSubset = wildcards_getSubset(defaultWild, wilds, numWilds,
                                     &sizeWildSubset, &numOccurences);
    defaultWildscore = wildcards_averageResidueWildMatch(defaultWild, wildSubset,
                       sizeWildSubset);
    printf("defaultWildScore=%f occurences=%u\n", defaultWildscore,
           numOccurences);

    // Build up list of wildcard candidates
    candidates = (struct wild *)global_malloc(sizeof(struct wild) *
                 wildcards_numClusterWildcards);

    // Give the "best so far" slot a defined value. Previously it was read
    // uninitialised whenever no wildcard scored above 0 in the first round
    // (including the case of an empty wildcard list).
    bestNewCandidate = defaultWild;

    numCandidates = 0;
    while (numCandidates < wildcards_numClusterWildcards - 1) {
        // Try every wildcard in the next free slot and keep the best one
        count = 0;
        bestScore = 0;
        while (count < numWilds) {
            candidates[numCandidates] = wilds[count];

            // Score the candidate set including the trial wildcard
            candidatesScore = wildcards_scoreCandidates(
                                  candidates, numCandidates + 1, wilds, numWilds, defaultWildscore);
            if (candidatesScore > bestScore) {
                bestScore = candidatesScore;
                bestNewCandidate = wilds[count];
            }

            count++;
        }

        printf("Score=%f Best new candidate (%u): ", bestScore, numCandidates);
        wildcards_printWildcard(bestNewCandidate.code);
        candidates[numCandidates] = bestNewCandidate;

        numCandidates++;
    }

    newCandidates = (struct wild *)global_malloc(sizeof(struct wild) *
                    wildcards_numClusterWildcards);
    bestNewCandidates = (struct wild *)global_malloc(
                            sizeof(struct wild) * wildcards_numClusterWildcards);

    // Perform hill climbing; consider changing each position
    change = 1;
    while (change) {
        change = 0;
        candidateNum = 0;
        bestScore = 0;
        improved = 0;
        while (candidateNum < numCandidates) {
            // Start with current candidates. The element count is
            // parenthesised: the former "size * n - 1" subtracted a single
            // byte rather than one element.
            memcpy(newCandidates, candidates,
                   sizeof(struct wild) * (wildcards_numClusterWildcards - 1));

            // Change current position to every possible candidate
            count = 0;
            while (count < numWilds) {
                newCandidates[candidateNum] = wilds[count];

                // Score a possible new set of candidates
                candidatesScore = wildcards_scoreCandidates(
                                      newCandidates, numCandidates, wilds, numWilds, defaultWildscore);

                // Check if best new candidates
                if (candidatesScore > bestScore) {
                    bestScore = candidatesScore;
                    memcpy(bestNewCandidates, newCandidates,
                           sizeof(struct wild) * (wildcards_numClusterWildcards - 1));
                    improved = 1;
                }

                count++;
            }

            candidateNum++;
        }

        // Update candidates. "improved" guarantees bestNewCandidates was
        // actually filled in this round; without it an unset buffer could
        // be copied when the current score is below the initial bestScore.
        if (improved &&
                bestScore > wildcards_scoreCandidates(candidates, numCandidates, wilds,
                        numWilds, defaultWildscore)) {
            printf("New bestScore=%f\n", bestScore);
            memcpy(candidates, bestNewCandidates,
                   sizeof(struct wild) * (wildcards_numClusterWildcards - 1));
            change = 1;
        }

        // Show the candidate set after this round
        candidateNum = 0;
        while (candidateNum < numCandidates) {
            wildcards_printWildcard(candidates[candidateNum].code);
            candidateNum++;
        }
    }

    // Final set of clusters with default wild added. The score call's
    // return value is unused; it is presumably made for the state that
    // wildcards_outputWildcards() then writes out (confirm in wildcards).
    candidates[numCandidates] = defaultWild;
    numCandidates++;
    wildcards_scoreCandidates(candidates, numCandidates, wilds, numWilds,
                              defaultWildscore);
    wildcards_outputWildcards(wildcardsFilename);

    printf("%u sequences read.\n", readdb_numberOfSequences);
    fflush(stdout);

    free(candidates);
    free(newCandidates);
    free(bestNewCandidates);

    return 0;
}