/*
 * Populates this HMM from a textual model description read from `is`.
 *
 * Items are consumed in this sequence:
 *   1. the number of states, followed by a "states" token;
 *   2. a "schema" token and the schema itself (via loadSchema);
 *   3. a "transitions" token and the transition matrix;
 *   4. for every state q in 1..numStates-1: an "emissions" token, the
 *      state's emissionProb entry, and then for each discrete track its
 *      order, alphabet and one "<nmer> <value>" line per table entry;
 *   5. optionally, a line containing "foreground" followed by a line
 *      holding a comma-separated list of foreground state indices.
 *
 * Nothing is read for state 0 in step 4 (the state loop starts at 1).
 * discardInput() presumably consumes the named keyword from the stream --
 * its definition is outside this block, confirm there.
 *
 * Side effects: resizes/overwrites emissionProb, transitionProb,
 * discreteEmitProb, chains, orders, the schema alphabets, order (set to
 * the largest per-track order seen) and possibly foregroundStates;
 * finally calls logifyTransitions().
 */
void HMM::load(istream &is)
{
  order=0;

  // Load number of states and allocate arrays of that size
  is>>numStates;
  discardInput(is,"states");
  emissionProb.resize(numStates);
  transitionProb.resize(numStates,numStates);

  // Load schema
  discardInput(is,"schema");
  loadSchema(is);
  const int numDiscrete=schema.getNumDiscrete();
  discreteEmitProb.resize(numStates,numDiscrete);
  chains.resize(numDiscrete);
  orders.resize(numStates,numDiscrete);

  // Read transition and emission probabilities
  discardInput(is,"transitions");
  is >> transitionProb;
  String line;
  for(int q=1 ; q<numStates ; ++q) {
    discardInput(is,"emissions");
    emissionProb[q].load(is);
    for(int i=0 ; i<numDiscrete ; ++i) {
      // Order of this particular (state, track) emission table. Kept under
      // a distinct name so it cannot be confused with the member `order`,
      // which tracks the maximum over all tables.
      int trackOrder=0;
      is>>trackOrder;
      orders[q][i]=trackOrder;
      if(trackOrder>order) order=trackOrder;
      discardInput(is,"order");
      discardInput(is,"alphabet");
      Alphabet alphabet;
      alphabet.load(is);
      schema.getAlphabet(i)=alphabet;
      HigherOrderAlphabet H(alphabet,trackOrder+1);
      const int numNmers=H.getNumNmers();
      Array1D<double> &row=discreteEmitProb[q][i];
      row.resize(numNmers);

      // Read at most numNmers-1 table lines. The counter only bounds the
      // number of lines; the slot that gets written is determined by the
      // n-mer named on the line itself.
      for(NmerSymbol entry=1 ; entry<numNmers ; ++entry) {
        line.getline(is);
        line.trimWhitespace();
        if(is.eof() || line.isEmpty()) break; // end of this table
        BOOM::Vector<BOOM::String> &fields=*line.getFields();
        Sequence seq(fields[0],alphabet);
        NmerSymbol nmer=H.lookup(seq);
        row[nmer]=fields[1].asDouble(); // in log space
        delete &fields; // getFields() hands back a heap-allocated vector
      }
    }
  }

  // Load list of foreground states (if present).
  // The loop runs only while the stream is fully usable: testing eof()
  // alone would spin forever if an earlier extraction had left the stream
  // in a fail state, because no further read could ever reach end-of-file.
  while(is.good()) {
    line.getline(is);
    line.trimWhitespace();
    if(line.contains("foreground")) {
      line.getline(is);
      line.trimWhitespace();
      foregroundStates.clear();
      BOOM::Vector<BOOM::String> &fg=*line.getFields(",");
      int n=fg.size();
      for(int i=0 ; i<n ; ++i) foregroundStates+=fg[i].asInt();
      delete &fg;
    }
  }

  // Construct chains; every chain is built with the maximum order seen
  for(int i=0 ; i<numDiscrete ; ++i) {
    HigherOrderAlphabet H(schema.getAlphabet(i),order+1);
    chains[i]=NmerChain(H);
  }

  // Convert to log space
  logifyTransitions();
}
int main(int argc, char** argv) { // prints more information if the program was terminated due to an // exception. // see http://groups.yahoo.com/group/open-source-programmer/message/91 #ifdef __GNUC__ std::set_terminate (__gnu_cxx::__verbose_terminate_handler); #endif // parse command line arguments handleargs(argc, argv); //load an alphabet from the mass_file Alphabet alphabet; try { alphabet.load(alphabet_filename); } catch (IOException) { cerr << "Error loading alphabet \""<< alphabet_filename << "\"" << endl; exit(1); } cout << "Alphabet succesfully loaded:" << endl; cout << alphabet << endl; // copy over masses into a alphabet with fast access by character CharacterAlphabet fast_alphabet(alphabet); //now build some kind of fragmentizer cout << "Start building Fragmenter..." << endl; auto_ptr<Modifier<peaklist_type> > sort_modifier(new SortModifier<peaklist_type>); auto_ptr<Modifier<peaklist_type> > unification_modifier(new UnificationModifier<peaklist_type>); auto_ptr<MultiModifier<peaklist_type> > multi_modifier(new MultiModifier<peaklist_type>); multi_modifier->addModifier(sort_modifier); multi_modifier->addModifier(unification_modifier); fragmenter_type fragmenter(alphabet, cleavage_chars, prohibition_chars, with_cleavage_char); fragmenter.setModifier(auto_ptr<Modifier<peaklist_type> >(multi_modifier)); cout << "...done\n" << endl; //now that we hopefully have some fragmentizer, we should be able to start parsing the fasta file //first open the stream ifstream fasta_stream(database_filename.c_str()); if (!fasta_stream) { cerr << "Error: file \"" << database_filename << "\" could not be opened" << endl; exit(1); } //some vars //this maxlength thingie determines, how long such a sequence should be. 
0 means unlimited const unsigned int maxlength = 0; string line; string currentString(""); string currentID(""); //open file for output ofstream ofs(output_filename.c_str()); if (!ofs) { cerr << "Error: Could not open output file \""<< output_filename << "\"" << endl; exit(1); } //init the statistics vars long discarded_sequences = 0; long kept_sequences = 0; map_type letter_frequencies; //begin the looping while (getline(fasta_stream, line)) { if (line[0] == ' ') { continue; //FIXME this is not fasta file format } if(line[0] == '>') { // check if end of sequence (rather beginng of new) // close current sequence (if there is one) if((maxlength==0 || currentString.length()<=maxlength) && currentString.length()>0) { //here comes the sequence handling //FIXME we need string toUpper, but we dont have it if (handle_sequence (currentString, currentID, fast_alphabet, fragmenter, letter_frequencies, ofs)) ++kept_sequences; else ++discarded_sequences; currentString =""; } // begin new id sequence int k=1; while(isspace(line[k++])); currentID=""; for(unsigned int i=k-1;i<line.length() && !(isspace(line[i]));i++) { currentID += line[i]; } } else { // read sequence line for(unsigned int i=0;i<line.length();i++) { if(!isspace(line[i])) { currentString += line[i]; } } } } // handle last sequence if((maxlength==0 || currentString.length()<=maxlength) && currentString.length()>0) { //here we have to do some handling for the last sequence //FIXME we need string toUpper, but we dont have it if (handle_sequence (currentString, currentID, fast_alphabet, fragmenter, letter_frequencies, ofs)) ++kept_sequences; else ++discarded_sequences; } //logging cout << "\nAll sequences processed\n" << endl; //close streams fasta_stream.close(); ofs.close(); if (statistic_filename!="") { //do some statistics for the letter frequencies and sequence counts ofstream statistics(statistic_filename.c_str()); if (!statistics) { cerr << "Error opening output file for statistics \""<< statistic_filename << 
"\"" << endl; exit(1); } cout << "Writing statistics to \"" << statistic_filename << "\"" << endl; statistics << "# Kept " << kept_sequences << ", discarded " << discarded_sequences << ", from a total of " << kept_sequences + discarded_sequences << " sequences." << endl; statistics << "# Letter frequencies in valid sequences:" << endl; map_type::const_iterator it = letter_frequencies.begin(); for ( ; it != letter_frequencies.end(); ++it) { statistics << it->first << "\t" << it->second << endl; } statistics.close(); } }
int main(int argc, char **argv) { Options options; try { options.parse(argc, argv); } catch (TCLAP::ArgException &e) { throw ims::Exception("Error while parsing command line: " + e.error() + " for argument " + e.argId()); } Alphabet alphabet; try { alphabet.load(options.getAlphabetFileName()); } catch (IOException& ioe) { cerr << "can not read alphabet: " << ioe.what() << endl; return 1; } Weights weights(alphabet.getMasses(), 1 /*precision*/); // optimize alphabet by dividing by gcd weights.divideByGCD(); // get masses from -m options and/or from a file vector<double> masses = options.getMasses(); if (options.hasMassFile()) { MassesTextParser parser; try { parser.load(options.getMassFile()); } catch (IOException& ioe) { cerr << "can not read masses file: " << ioe.what() << endl; return 1; } vector<double> filemasses = parser.getElements(); // append to masses copy(filemasses.begin(), filemasses.end(), back_inserter(masses)); } if (masses.empty()) { cerr << "No input masses given to decompose (use -m or -f option)!\n"; return 1; } // maximal number of decompositions to show unsigned int maxNumber = options.getMaxNumberDecompositions(); printHeader(alphabet, weights, options.getMode()); if (options.getMode() == Options::GETNUMBER) { ClassicalDPMassDecomposer<> decomposer(weights); // loop through masses for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) { value_type mass = static_cast<value_type>(*it); decomposition_value_t number = decomposer.getNumberOfDecompositions(mass); cout << "# mass " << mass << " has " << number << " decompositions" << endl; } } else { // if any other tasks defined // use decomposer with residues IntegerMassDecomposer<> decomposer(weights); if (options.getMode() == Options::FINDALL) { // loop through masses for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) { value_type mass = static_cast<value_type>(*it); outputDecompositions(decomposer.getAllDecompositions(mass), alphabet, 
mass, maxNumber); cout << '\n'; } } else if (options.getMode() == Options::FINDONE) { // loop through masses for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) { value_type mass = static_cast<value_type>(*it); decomposition_t decomposition = decomposer.getDecomposition(mass); if (decomposition.size() == 0) { cout << "# mass " << mass << " has 0 decompositions\n\n"; } else { cout << "# mass " << mass << " has at least this decomposition:\n"; printDecomposition(alphabet, decomposition); cout << "\n\n"; } } } else if (options.getMode() == Options::ISDECOMPOSABLE) { // loop through masses for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) { value_type mass = static_cast<value_type>(*it); if (decomposer.exist(mass)) { cout << "# mass " << mass << " has at least one decomposition\n"; } else { cout << "# mass " << mass << " has no decompositions\n"; } } } } cout << "# done\n"; return 0; }