Exemplo n.º 1
0
// Read every sequence from *pios into sequences_ (id -> sequence string).
//
// The first record processed fixes the expected alignment width (num_char_)
// and, while is_dna_ is still set, clears it when that record's alphabet name
// is "AA". Every later record must have the same length; otherwise an error is
// printed and the program exits with status 1. On return num_taxa_ holds the
// number of distinct ids stored.
//
// For file type 2 the reader returns false while the last record is still held
// in `seq`, so that record is processed too. It goes through exactly the same
// path as the others, which means a file with a single record now sets
// num_char_ (and runs the alphabet check) instead of being rejected.
void SequenceCleaner::read_sequences (istream* pios) {
    Sequence seq;
    string retstring;
    int ft = test_seq_filetype_stream(*pios, retstring);
    bool first = true;
    bool more = true;
    
    while (more) {
        more = read_next_seq_from_stream(*pios, ft, retstring, seq);
        if (!more && ft != 2) {
            // only file type 2 leaves a pending record after the final read
            break;
        }
        sequences_[seq.get_id()] = seq.get_sequence();
        int num_current_char = seq.get_sequence().size();
        if (first) {
            // the first record defines the reference length
            num_char_ = num_current_char;
            if (is_dna_ && seq.get_alpha_name() == "AA") {
                is_dna_ = false;
            }
            first = false;
        } else if (num_current_char != num_char_) {
            cout << "Error: sequences are not all of the same length. Exiting."
                << endl;
            exit(1);
        }
    }
    num_taxa_ = sequences_.size();
}
Exemplo n.º 2
0
// Build a neighbour-joining object from the alignment on *pios.
//
// Sequences are stored in sequences_ keyed by id. The first record read inside
// the loop defines nchar_; every other record (including the pending record
// that file type 2 leaves behind after the last read) must match that length
// or the program prints a diagnostic and exits with status 1. ntax_ counts the
// records read. Afterwards the name key, distance matrix and tree are built.
NJOI::NJOI (istream* pios, int & threads):ntax_(0), nchar_(0), nthreads_(threads) {
    Sequence seq;
    string retstring;
    const int ft = test_seq_filetype_stream(*pios, retstring);
    
    // Abort when the record currently held in `seq` does not have nchar_ characters.
    auto require_expected_length = [&] () {
        if (static_cast<int>(seq.get_length()) == nchar_) {
            return;
        }
        cout << "Error: sequence " << seq.get_id() << " has "
            << seq.get_length() << " characters, was expecting " 
            << nchar_ << "." << endl << "Exiting." << endl;
        exit(1);
    };
    
    int seqcount = 0;
    bool have_reference = false;
    while (read_next_seq_from_stream(*pios, ft, retstring, seq)) {
        sequences_[seq.get_id()] = seq.get_sequence();
        if (have_reference) {
            require_expected_length();
        } else {
            // first record: remember its length as the reference
            nchar_ = seq.get_length();
            have_reference = true;
        }
        ++seqcount;
    }
    // file type 2 (fasta) still holds one unread record at this point
    if (ft == 2) {
        sequences_[seq.get_id()] = seq.get_sequence();
        require_expected_length();
        ++seqcount;
    }
    ntax_ = seqcount;
    set_name_key ();
    Matrix = BuildMatrix(sequences_);
    TREEMAKE(names_, name_key_, Matrix);
}
Exemplo n.º 3
0
int main(int argc, char * argv[]) {
    
    log_call(argc, argv);
    
    bool cfileset = false;
    bool tfileset = false;
    bool outfileset = false;
    
    char * treef = NULL;
    char * charf = NULL;
    char * outf = NULL;
    int analysis = 0;
    while (1) {
        int oi = -1;
        int c = getopt_long(argc, argv, "c:t:o:hV", long_options, &oi);
        if (c == -1) {
            break;
        }
        switch(c) {
            case 'c':
                cfileset = true;
                charf = strdup(optarg);
                check_file_exists(charf);
                break;
            case 't':
                tfileset = true;
                treef = strdup(optarg);
                check_file_exists(treef);
                break;
            case 'o':
                outfileset = true;
                outf = strdup(optarg);
                break;
            case 'h':
                print_help();
                exit(0);
            case 'V':
                cout << versionline << endl;
                exit(0);
            default:
                print_error(argv[0], (char)c);
                exit(0);
        }
    }

    istream * pios = NULL;
    istream * poos = NULL;
    ifstream * cfstr = NULL;
    ifstream * tfstr = NULL;

    ostream * poouts = NULL;
    ofstream * ofstr = NULL;
    

    if (tfileset == true) {
        tfstr = new ifstream(treef);
        poos = tfstr;
    } else {
        poos = &cin;
    }

    if (cfileset == true) {
        cfstr = new ifstream(charf);
        pios = cfstr;
    } else {
        cout << "you have to set a character file. Only a tree file can be read in through the stream;" << endl;
    }

    //out file
    //
    if (outfileset == true){
        ofstr = new ofstream(outf);
        poouts = ofstr;
    } else{
        poouts = &cout;
    }
    //

    string retstring;
    int ft = test_char_filetype_stream(*pios, retstring);
    if (ft != 1 && ft != 2) {
        cout << "only fasta and phylip (with spaces) supported so far" << endl;
        exit(0);
    }
    Sequence seq;
    vector <Sequence> seqs;
    map <string, int> seq_map;
    int y = 0;
    int nchars = 0 ;
    while (read_next_seq_char_from_stream(*pios, ft, retstring, seq)) {
        seqs.push_back(seq);
        nchars = seq.get_num_cont_char();
        seq_map[seq.get_id()] = y;
        seq.clear_cont_char();
        y++;
    }
    cout << "nchars: " <<  nchars << endl;
    
    if (ft == 2) {
        seqs.push_back(seq);
        seq_map[seq.get_id()] = y;
        seq.clear_cont_char();
    }
    //read trees
    TreeReader tr;
    vector<Tree *> trees;
    while (getline(*poos,retstring)) {
        if (retstring.size()<4){
            continue;
        }
        trees.push_back(tr.readTree(retstring));
    }
    int x = 0;
    //conduct analyses for each character
    for (int i=0; i < trees[x]->getExternalNodeCount(); i++) {
        vector<Superdouble> tv (nchars);
        for (int c=0; c < nchars; c++) {
            tv[c] = seqs[seq_map[trees[x]->getExternalNode(i)->getName()]].get_cont_char(c);
        }
        trees[x]->getExternalNode(i)->assocDoubleVector("val",tv);
    }
    for (int i=0; i < trees[x]->getInternalNodeCount(); i++) {
        vector<Superdouble> tv (nchars);
        for (int c=0; c < nchars; c++) {
            tv[c] = 0;
        }
        trees[x]->getInternalNode(i)->assocDoubleVector("val",tv);
    }
    float sigma = 1;
    cout << calc_bm_prune(trees[x], sigma) << endl;
    optimize_single_rate_bm_bl(trees[x]);
    (*poouts) << trees[x]->getRoot()->getNewick(true) << ";" << endl;
    cout << calc_bm_prune(trees[x], sigma) << endl;

    if (cfileset) {
        cfstr->close();
        delete pios;
    }
    if (tfileset) {
        tfstr->close();
        delete poos;
    }
    if (outfileset) {
        ofstr->close();
        delete poouts;
    }
    return EXIT_SUCCESS;
}
Exemplo n.º 4
0
// Read the alignment in file `seqf` into seqs_ and record its dimensions.
//
// File types, as reported by test_seq_filetype_stream: 0 = NEXUS, 1 = phylip,
// 2 = fasta. For phylip/NEXUS the declared taxon and character counts are
// checked against what is actually read. For fasta the first record defines
// the expected length and every later record must match it. That includes the
// record still pending after the final read, which was previously stored
// without a check; a one-record fasta file also left num_char_ at 0.
// Sequences are upper-cased when toupcase_ is set. Any inconsistency, or an
// unrecognised format, prints a message and exits with status 1.
//
// On success num_taxa_ and num_char_ are set, num_partitions_ is 1 and the
// alignment length is appended to partition_sizes_.
void SequenceConcatenater::read_sequences (string & seqf) {
    filename_ = seqf;
    string retstring;
    // stack stream: closed automatically on return, nothing to delete
    ifstream infile(filename_);
    ft_ = test_seq_filetype_stream(infile, retstring);
    Sequence seq;
    int counter = 0;
    int length = 0;
    
    // phylip (1) NEXUS (0)
    if (ft_ == 1 || ft_ == 0) {
        if (ft_ == 1) {
            vector <string> fileDim = tokenize(retstring);
            if (fileDim.size() < 2) {
                cout << "Could not read the taxon and character counts from the header of '"
                    << filename_ << "'. Exiting." << endl;
                exit(1);
            }
            num_taxa_ = stoi(fileDim[0]);
            num_char_ = stoi(fileDim[1]);
        } else {
            get_nexus_dimensions_file(seqf, num_taxa_, num_char_, interleave_);
        }
        if (!interleave_) {
            while (read_next_seq_from_stream(infile, ft_, retstring, seq)) {
                length = (int)seq.get_sequence().size();
                if (length != num_char_) {
                    cout << "Sequence '" << seq.get_id() << "' has " << length << " characters, but the file '"
                        << filename_ << "' specified " << num_char_ << " characters. Exiting." << endl;
                    exit(1);
                }
                if (toupcase_) {
                    seq.set_sequence(seq.seq_to_upper());
                }
                seqs_.push_back(seq);
                counter++;
            }
            if (counter != num_taxa_) {
                cout << "Read " << counter << " taxa, but the file '" << filename_ << "' specified "
                    << num_taxa_ << " taxa. Exiting." << endl;
                exit(1);
            }
        } else {
            seqs_ = read_interleaved_nexus_file(seqf, num_taxa_, num_char_);
            if (toupcase_) {
                // iterate over what was actually read rather than trusting num_taxa_
                for (auto & s : seqs_) {
                    s.set_sequence(s.seq_to_upper());
                }
            }
        }
        
    } else if (ft_ == 2) { // fasta
        bool first = true;
        bool more = true;
        // The reader returns false while the last record is still held in
        // `seq`, so the body runs once more after the final read.
        do {
            more = read_next_seq_from_stream(infile, ft_, retstring, seq);
            int curr = (int)seq.get_sequence().size();
            if (first) {
                length = curr;
                first = false;
            } else if (curr != length) {
                cout << "Error: current sequence has " << curr << " characters, but previous sequence had "
                    << length << " characters. Exiting." << endl;
                exit(1);
            }
            if (toupcase_) {
                seq.set_sequence(seq.seq_to_upper());
            }
            seqs_.push_back(seq);
            counter++;
        } while (more);
        num_taxa_ = counter;
        num_char_ = length;
    } else {
        cout << "I don't know what that alignment file format is! Exiting." << endl;
        exit(1);
    }
    num_partitions_ = 1;
    partition_sizes_.push_back(num_char_);
}
Exemplo n.º 5
0
int main(int argc, char * argv[]) {
    
    log_call(argc, argv);
    
    bool outfileset = false;
    bool sfileset = false;
    bool cfileset = false;
    bool nfileset = false;
    bool verbose = false;
    char * outf = NULL;
    char * seqf = NULL;
    string cnamef = "";
    string nnamef = "";
    while (1) {
        int oi = -1;
        int c = getopt_long(argc, argv, "s:c:n:o:vhV", long_options, &oi);
        if (c == -1) {
            break;
        }
        switch(c) {
            case 's':
                sfileset = true;
                seqf = strdup(optarg);
                check_file_exists(seqf);
                break;
            case 'c':
                cfileset = true;
                cnamef = strdup(optarg);
                check_file_exists(cnamef.c_str());
                break;
            case 'n':
                nfileset = true;
                nnamef = strdup(optarg);
                check_file_exists(nnamef.c_str());
                break;
            case 'o':
                outfileset = true;
                outf = strdup(optarg);
                break;
            case 'v':
                verbose = true;
                break;
            case 'h':
                print_help();
                exit(0);
            case 'V':
                cout << versionline << endl;
                exit(0);
            default:
                print_error(argv[0], (char)c);
                exit(0);
        }
    }
    
    if (sfileset && outfileset) {
        check_inout_streams_identical(seqf, outf);
    }
    
    istream * pios = NULL;
    ostream * poos = NULL;
    ifstream * fstr = NULL;
    ofstream * ofstr = NULL;
    
    if (!nfileset | !cfileset) {
        cout << "Must supply both name files (-c for current, -n for new)." << endl;
        exit(0);
    }
    
    if (sfileset == true) {
        fstr = new ifstream(seqf);
        pios = fstr;
    } else {
        pios = &cin;
        if (check_for_input_to_stream() == false) {
            print_help();
            exit(1);
        }
    }
    if (outfileset == true) {
        ofstr = new ofstream(outf);
        poos = ofstr;
    } else {
        poos = &cout;
    }
    
    Relabel rl (cnamef, nnamef, verbose);
    
    set <string> orig = rl.get_names_to_replace();
    
    Sequence seq;
    string retstring;
    
    int ft = test_seq_filetype_stream(*pios, retstring);
    
    bool success = false;
    
    while (read_next_seq_from_stream(*pios, ft, retstring, seq)) {
        string terp = seq.get_id();
        success = rl.relabel_sequence(seq);
        if (success) {
            orig.erase(terp);
        }
        (*poos) << ">" << seq.get_id() << endl;
        (*poos) << seq.get_sequence() << endl;
    }
// have to deal with last sequence outside while loop. fix this.
    if (ft == 2) {
        string terp = seq.get_id();
        success = rl.relabel_sequence(seq);
        if (success) {
            orig.erase(terp);
        }
        (*poos) << ">" << seq.get_id() << endl;
        (*poos) << seq.get_sequence() << endl;
    }
    
    if (orig.size() > 0) {
        if (verbose) {
            cerr << "The following names to match were not found in the alignment:" << endl;
            for (auto elem : orig) {
                cerr << elem << endl;
            }
        }
    }
    
    if (sfileset) {
        fstr->close();
        delete pios;
    }
    if (outfileset) {
        ofstr->close();
        delete poos;
    }
    return EXIT_SUCCESS;
}
Exemplo n.º 6
0
int main(int argc, char * argv[]) {
    
    log_call(argc, argv);
    
    bool outfileset = false;
    bool fileset = false;
    bool printpost = false;
    bool showancs = false;
    bool is_dna = true;
    float pinvar = 0.0;
    double tot;
    string yorn = "n";
    int seqlen = 1000;
    int pos = 0;
    int pos2 = 0;
    string infreqs;
    string inrates;
    string holdrates;
    string ancseq;
    char * outf = NULL;
    char * treef = NULL;
    vector <double> diag(20, 0.0);
    vector <double> basefreq(4, 0.25);
    vector <double> aabasefreq(20, 0.05);
    vector <double> userrates;
    vector <double> multirates;
    int nreps = 1; // not implemented at the moment
    int seed = -1;
    int numpars = 0;
    float alpha = -1.0;
    vector<vector <double>> dmatrix;
    vector< vector <double> > aa_rmatrix(20, vector<double>(20, 1));
        for (unsigned int i = 0; i < aa_rmatrix.size(); i++) {
        for (unsigned int j = 0; j < aa_rmatrix.size(); j++) {
            if (i == j) { // Fill Diagonal
                aa_rmatrix[i][j] = -19.0;
            }
        }
    }
    vector< vector <double> > rmatrix(4, vector<double>(4, 0.33));
    for (unsigned int i = 0; i < rmatrix.size(); i++) {
        for (unsigned int j = 0; j < rmatrix.size(); j++) {
            if (i == j) { // Fill Diagonal
                rmatrix[i][j] = -0.99;
            }
        }
    }
    /*dmatrix = aa_rmatrix;
    for (unsigned int i = 0; i < dmatrix.size(); i++) {
        for (unsigned int j = 0; j < dmatrix.size(); j++) {
            cout << dmatrix[i][j] << " ";
        }
        cout << "\n";
    }*/

    while (1) {
        int oi = -1;
        int c = getopt_long(argc, argv, "t:o:l:b:g:i:r:w:q:n:x:apcm:k:hV", long_options, &oi);
        if (c == -1) {
            break;
        }
        switch(c) {
            case 't':
                fileset = true;
                treef = strdup(optarg);
                check_file_exists(treef);
                break;
            case 'o':
                outfileset = true;
                outf = strdup(optarg);
                break;
            case 'b':
                infreqs = strdup(optarg);
                parse_comma_list(infreqs, basefreq);
                if (basefreq.size() != 4) {
                    cout << "Error: must provide 4 base frequencies (" << basefreq.size()
                        << " provided). Exiting." << endl;
                    exit(0);
                }
                if (!essentially_equal(sum(basefreq), 1.0)) {
                    cout << "Error: base frequencies must sum to 1.0. Exiting." << endl;
                    exit(0);
                }
                break;
            case 'l':
                seqlen = atoi(strdup(optarg));
                break;
            case 'a':
                showancs = true;
                break;
            case 'r':
                inrates = strdup(optarg);
                parse_comma_list(inrates, userrates);
                
                // NOTE: will have to alter this check for a.a., non-reversible, etc.
                if (userrates.size() != 6) {
                    cout << "Error: must provide 6 substitution parameters. " <<
                        "Only " << userrates.size() << " provided. Exiting." << endl;
                    exit(0);
                }
                // NOTE: this uses order: A,T,C,G for matrix, but
                //       A<->C,A<->G,A<->T,C<->G,C<->T,G<->T for subst. params.
                rmatrix[0][2] = userrates[0];
                rmatrix[2][0] = userrates[0];
                rmatrix[0][3] = userrates[1];
                rmatrix[3][0] = userrates[1];
                rmatrix[0][1] = userrates[2];
                rmatrix[1][0] = userrates[2];
                rmatrix[2][3] = userrates[3];
                rmatrix[3][2] = userrates[3];
                rmatrix[1][2] = userrates[4];
                rmatrix[2][1] = userrates[4];
                rmatrix[1][3] = userrates[5];
                rmatrix[3][1] = userrates[5];
                rmatrix[0][0] = (userrates[0]+userrates[1]+userrates[2]) * -1;
                rmatrix[1][1] = (userrates[2]+userrates[4]+userrates[5]) * -1;
                rmatrix[2][2] = (userrates[0]+userrates[3]+userrates[4]) * -1;
                rmatrix[3][3] = (userrates[1]+userrates[3]+userrates[5]) * -1;
                /*//Turn on to check matrix
                for (unsigned int i = 0; i < rmatrix.size(); i++) {
                   for (unsigned int j = 0; j < rmatrix.size(); j++) {
                      cout << rmatrix[i][j] << " ";
                   }
                    cout << "\n";
                }*/
                break;
            case 'w':
                inrates = strdup(optarg);
                parse_comma_list(inrates, userrates);
                is_dna = false;
                
                // NOTE: will have to alter this check for a.a., non-reversible, etc.
                if (userrates.size() != 190) {
                    cout << "Error: must provide 190 substitution parameters, I know its a stupidly large amount. " <<
                        "Only " << userrates.size() << " provided. Exiting." << endl;
                    exit(0);
                }
                pos = 0;
                pos2 = 1;
                //Fill the Matrix
                for (unsigned int i = 0; i < userrates.size(); i++){
                    aa_rmatrix[pos][pos2] = userrates[i];
                    aa_rmatrix[pos2][pos] = userrates[i];
                    pos2++;
                    if (pos2 == 20){
                        pos += 1;
                        pos2 = (pos + 1);
                    }
                }
                //Replace Diagonal
                for (unsigned int i = 0; i < aa_rmatrix.size(); i++) {
                    for (unsigned int j = 0; j < aa_rmatrix.size(); j++) {
                        if (i != j){
                            tot += aa_rmatrix[i][j];
                        }
                    }
                    aa_rmatrix[i][i] = (tot*-1);
                    tot = 0.0;
                }
                /*
                for (unsigned int i = 0; i < aa_rmatrix.size(); i++) {
                    for (unsigned int j = 0; j < aa_rmatrix.size(); j++) {
                        cout << aa_rmatrix[i][j] << " ";
                    }
                    cout << "\n";
                }*/
                break;
            case 'n':
                nreps = atoi(strdup(optarg));
                break;
            case 'x':
                seed = atoi(strdup(optarg));
                break;
            case 'q':
                is_dna = false;
                infreqs = strdup(optarg);
                parse_comma_list(infreqs, aabasefreq);
                if (aabasefreq.size() != 20) {
                    cout << "Error: must provide 20 base frequencies (" << aabasefreq.size()
                        << " provided). Exiting." << endl;
                    exit(0);
                }
                if (!essentially_equal(sum(aabasefreq), 1.0)) {
                    cout << "Error: base frequencies must sum to 1.0. Exiting." << endl;
                    exit(0);
                }
                break;
            case 'g':
                alpha = atof(strdup(optarg));
                break;
            case 'i':
                pinvar = atof(strdup(optarg));
                break;
            case 'p':
                printpost = true;
                break;
            case 'c':
                is_dna = false;
                break;
            case 'm':
                holdrates = strdup(optarg);
                parse_comma_list(holdrates, multirates);
                numpars = multirates.size();
                if ((numpars - 6) % 7 != 0) {
                    cout << "Error: must provide 6 background substitution "
                        << "parameters and 7 values (1 node id + 6 subst. par.) "
                        << "for each piecewise model. Exiting." << endl;
                    exit(0);
                }
                break;
            case 'k':
                ancseq = strdup(optarg);
                break;
            case 'h':
                print_help();
                exit(0);
            case 'V':
                cout << versionline << endl;
                exit(0);
            default:
                print_error(argv[0], (char)c);
                exit(0);
        }
    }
    
    if (fileset && outfileset) {
        check_inout_streams_identical(treef, outf);
    }
    
    if (is_dna) {
        dmatrix = rmatrix;
    } else {
        dmatrix = aa_rmatrix;
    }
    
    istream * pios = NULL;
    ostream * poos = NULL;
    ifstream * fstr = NULL;
    ofstream * ofstr = NULL;
    
    if (outfileset == true) {
        ofstr = new ofstream(outf);
        poos = ofstr;
    } else {
        poos = &cout;
    }
    if (fileset == true) {
        fstr = new ifstream(treef);
        pios = fstr;
    } else {
        pios = &cin;
        if (check_for_input_to_stream() == false) {
            print_help();
            exit(1);
        }
    }
    
    
    /*
     * Default Base Frequencies and Rate Matrix
     *
     */
    //vector <double> basefreq(4, 0.0);
    //basefreq[0] = .25;
    //basefreq[1] = .25;
    //basefreq[2] = .25;
    //basefreq[3] = 1.0 - basefreq[0] - basefreq[1] - basefreq[2];
    /*    
    vector< vector <double> > rmatrix(4, vector<double>(4, 0.33));
    for (unsigned int i = 0; i < rmatrix.size(); i++) {
        for (unsigned int j = 0; j < rmatrix.size(); j++) {
            if (i == j) {//Fill Diagnol
                rmatrix[i][j] = -1.0;
            }
        }
    }
    */
    string retstring;
    int ft = test_tree_filetype_stream(*pios, retstring);
    if (ft != 0 && ft != 1) {
        cerr << "this really only works with nexus or newick" << endl;
        exit(0);
    }
    
    // allow > 1 tree in input. passing but not yet using nreps
    int treeCounter = 0;
    bool going = true;
    if (ft == 1) { // newick. easy
        Tree * tree;
        while (going) {
            tree = read_next_tree_from_stream_newick (*pios, retstring, &going);
            if (tree != NULL) {
                //cout << "Working on tree #" << treeCounter << endl;
                SequenceGenerator SGen(seqlen, basefreq, dmatrix, tree, showancs,
                    nreps, seed, alpha, pinvar, ancseq, printpost, multirates, aabasefreq, is_dna);
                vector <Sequence> seqs = SGen.get_sequences();
                for (unsigned int i = 0; i < seqs.size(); i++) {
                    Sequence seq = seqs[i];
                    (*poos) << ">" << seq.get_id() << endl;
                    //cout << "Here" << endl;
                    (*poos) << seq.get_sequence() << endl;
                }
                delete tree;
                treeCounter++;
            }
        }
    } else if (ft == 0) { // Nexus. need to worry about possible translation tables
        map <string, string> translation_table;
        bool ttexists;
        ttexists = get_nexus_translation_table(*pios, &translation_table, &retstring);
        Tree * tree;
        while (going) {
            tree = read_next_tree_from_stream_nexus(*pios, retstring, ttexists,
                &translation_table, &going);
            if (going == true) {
                cout << "Working on tree #" << treeCounter << endl;
                SequenceGenerator SGen(seqlen, basefreq, dmatrix, tree, showancs,
                    nreps, seed, alpha, pinvar, ancseq, printpost, multirates, aabasefreq, is_dna);
                vector <Sequence> seqs = SGen.get_sequences();
                for (unsigned int i = 0; i < seqs.size(); i++) {
                    Sequence seq = seqs[i];
                    (*poos) << ">" << seq.get_id() << endl;
                    (*poos) << seq.get_sequence() << endl;
                }
                delete tree;
                treeCounter++;
            }
        }
    }
    
    return EXIT_SUCCESS;
}