Beispiel #1
0
// Returns a hash mapping from sequence header to sequence data.
static stHash *readFastaFile(char *filename) {
    FILE *fasta = fopen(filename, "r");
    if (fasta == NULL) {
        st_errnoAbort("Could not open fasta file %s", filename);
    }
    stHash *headerToData = stHash_construct3(stHash_stringKey,
                                             stHash_stringEqualKey,
                                             free,
                                             free);
    struct List *seqs = constructEmptyList(0, NULL);
    struct List *seqLengths = constructEmptyList(0, free);
    struct List *headers = constructEmptyList(0, free);
    fastaRead(fasta, seqs, seqLengths, headers);

    for (int64_t i = 0; i < seqs->length; i++) {
        char *fullHeader = headers->list[i];
        stList *headerTokens = stString_splitByString(fullHeader, " ");
        char *usableHeader = stString_copy(stList_get(headerTokens, 0));
        stHash_insert(headerToData, usableHeader, seqs->list[i]);
        stList_destruct(headerTokens);
    }
    destructList(seqs);
    destructList(seqLengths);
    destructList(headers);

    return headerToData;
}
Beispiel #2
0
int main(int argc, char *argv[]) {
    FILE *fileHandle;
    struct List *seqs;
    struct List *seqLengths;
    struct List *seqNames;
    int32_t i;

    assert(argc == 3);

    seqs = constructEmptyList(0, free);
    seqLengths = constructEmptyList(0, (void (*)(void *))destructInt);
    seqNames = constructEmptyList(0, free);

    fileHandle = fopen(argv[1], "r");
    fastaRead(fileHandle, seqs, seqLengths, seqNames);
    fclose(fileHandle);

    fileHandle = fopen(argv[2], "w");
    for(i=0; i < seqs->length; i++) {
        assert(strlen(seqs->list[i]) == *((int32_t *)seqLengths->list[i]));
        fastaWrite(seqs->list[i], seqNames->list[i], fileHandle);
    }
    fclose(fileHandle);

    return 0;
}
Beispiel #3
0
int main(int argc, char *argv[]) {
    /*
     * Arguments/options
     */
    char *logLevelString = NULL;
    char *mfaFile = NULL;
    char *outputFile = NULL;
    char *treeFile = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = {
            { "logLevel", required_argument, 0, 'a' },
            { "mfaFile", required_argument, 0, 'b' },
            { "outputFile", required_argument, 0, 'd' },
            { "treeFile", optional_argument, 0, 't' },
            { "help", no_argument, 0, 'h' },
            { 0, 0, 0, 0 }
        };

        int option_index = 0;

        int key = getopt_long(argc, argv, "a:b:d:t:h", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                break;
            case 'b':
                mfaFile = stString_copy(optarg);
                break;
            case 'd':
                outputFile = stString_copy(optarg);
                break;
            case 't':
                treeFile = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    if (argc == 1) {
        usage();
        exit(1);
    }

    assert(mfaFile != NULL);
    assert(outputFile != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("MFA file  name : %s\n", mfaFile);
    st_logInfo("Output MAF file : %s\n", outputFile);
    st_logInfo("Tree file name: %s\n", treeFile == NULL ? "null" : treeFile);

    //////////////////////////////////////////////
    //Get the MFA alignment
    //////////////////////////////////////////////

    //get the alignment
    struct List *sequences = constructEmptyList(0, free);
    struct List *seqLengths = constructEmptyList(0, (void (*)(void *))destructInt);
    struct List *fastaNames = constructEmptyList(0, free);
    FILE *fileHandle = fopen(mfaFile, "r");
    if (fileHandle == NULL) {
        usage();
        exit(1);
    }
    fastaRead(fileHandle, sequences, seqLengths, fastaNames);
    fclose(fileHandle);

    //////////////////////////////////////////////
    //Get the tree alignment
    //////////////////////////////////////////////

    stTree *tree = NULL;
    LeafPtrArray *leafArray = NULL;

    int32_t leafCount = 0;
    if (treeFile != NULL) {
        tree = eTreeX_getTreeFromFile(treeFile);

        eTreeX_postOrderTraversal(tree, eTreeX_countLeaves, &leafCount);

        leafArray = eTreeX_constructLeafPtrArray(leafCount);
        eTreeX_postOrderTraversal(tree, eTreeX_getLeafArray, (void *) leafArray);
    }

    //////////////////////////////////////////////
    //Write the MFA alignment.
    //////////////////////////////////////////////

    fileHandle = fopen(outputFile, "w");
    //write the header.
    fprintf(fileHandle, "##maf version=1 scoring=NULL\n");
    fprintf(fileHandle, "# converted_from_MFA\n\n");

    //write the score line
    char *treeString = NULL;
    if (treeFile != NULL) {
        treeString = stTree_getNewickTreeString(tree);
        fprintf(fileHandle, "a score=0 tree=\"%s\"\n", treeString);
    }
    else {
        fprintf(fileHandle, "a score=0\n");
        leafCount = sequences->length;
    }

    //write the alignment
    int32_t i, j;
    int32_t ii;
    const char *label;
    for (ii=0; ii<leafCount; ii++) {
        if (treeFile != NULL) {
            label = stTree_getLabel((stTree *) leafArray->ptrArray[ii]);

            /* Do a brute force search to find the appropriate sequence that matches "label" */
            for (i=0; i<sequences->length; i++) {
                char *fastaHeader = fastaNames->list[i];
                char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader)));
                sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence.
                if (strcmp(label, sequenceName) == 0) {
                    free(sequenceName);
                    break;
                }
                free(sequenceName);
            }
        }
        else {
            i = ii;
        }

        char *sequence = sequences->list[i];
        int32_t seqLength = *((int32_t *)seqLengths->list[i]);
        assert(seqLength == (int32_t)strlen(sequence));
        char *fastaHeader = fastaNames->list[i];
        char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader)));
        sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence.
        int32_t length = 0;
        for (j=0; j<(int32_t)strlen(sequence); j++) {
            if (sequence[j] != '-') {
                length++;
            }
        }
        fprintf(fileHandle, "s\t%s\t%i\t%i\t%s\t%i\t%s\n", sequenceName, 0, length, "+", length, sequence);
        free(sequenceName);
    }

    fclose(fileHandle);

    //////////////////////////////////////////////
    //Clean up.
    //////////////////////////////////////////////

    free(mfaFile);
    free(outputFile);
    free(treeFile);

    if (treeFile != NULL) {
        stTree_destruct(tree);
        free(treeString);
        eTreeX_destructLeafPtrArray(leafArray);
    }

    destructList(sequences);
    destructList(seqLengths);
    destructList(fastaNames);

    return 0;
}