예제 #1
0
파일: HMM.C 프로젝트: bmajoros/MUMMIE
/**
 * Reads a model from a text stream into this HMM.
 *
 * The stream is consumed in this order:
 *   1. the number of states,
 *   2. the schema,
 *   3. the transition matrix,
 *   4. for every state q>=1 (state 0 gets no emission block): the state's
 *      emission distribution followed, for each discrete track, by the
 *      order, the alphabet and a table of "<nmer> <value>" lines that is
 *      terminated by a blank line or end of input,
 *   5. optionally, a line containing "foreground" followed by a line of
 *      comma-separated state indices.
 *
 * The member `order` ends up as the largest order read for any state and
 * track (0 if none was read); the chains are built with that order + 1.
 *
 * @param is  stream to read the model from
 * @throws String if an emission-table line has fewer than two fields
 */
void HMM::load(istream &is)
{
  order=0;

  // Load number of states and allocate arrays of that size
  is>>numStates;
  discardInput(is,"states");
  emissionProb.resize(numStates);
  transitionProb.resize(numStates,numStates);

  // Load schema
  discardInput(is,"schema");
  loadSchema(is);
  const int numDiscrete=schema.getNumDiscrete();
  discreteEmitProb.resize(numStates,numDiscrete);
  chains.resize(numDiscrete);
  orders.resize(numStates,numDiscrete);

  // Read transition and emission probabilities
  discardInput(is,"transitions");
  is >> transitionProb;
  String line;
  for(int q=1 ; q<numStates ; ++q) {
    discardInput(is,"emissions");
    emissionProb[q].load(is);
    for(int i=0 ; i<numDiscrete ; ++i) {
      // Order of this state/track pair; initialised so that a failed
      // extraction cannot leave an indeterminate value behind.
      int trackOrder=0;
      is>>trackOrder;
      orders[q][i]=trackOrder;
      if(trackOrder>order) order=trackOrder; // keep the maximum seen so far
      discardInput(is,"order");
      discardInput(is,"alphabet");
      Alphabet alphabet;
      alphabet.load(is);
      schema.getAlphabet(i)=alphabet;
      HigherOrderAlphabet H(alphabet,trackOrder+1);
      const int numNmers=H.getNumNmers();
      Array1D<double> &row=discreteEmitProb[q][i];
      row.resize(numNmers);

      // Read at most numNmers-1 table lines.  The loop counter only bounds
      // the number of lines; the slot that is written comes from looking up
      // the N-mer named on the line itself.
      for(NmerSymbol entry=1 ; entry<numNmers ; ++entry) {
        line.getline(is);
        line.trimWhitespace();
        if(is.eof() || line.isEmpty()) break; // blank line or EOF ends the table
        BOOM::Vector<BOOM::String> &fields=*line.getFields();
        if(fields.size()<2) {
          // Without this check fields[1] below would index past the end.
          delete &fields;
          throw String("HMM::load(): malformed emission line (expected an N-mer followed by a value)");
        }
        Sequence seq(fields[0],alphabet);
        NmerSymbol nmer=H.lookup(seq);
        row[nmer]=fields[1].asDouble(); // in log space
        delete &fields;
      }
    }
  }

  // Load list of foreground states (if present).  Testing good() rather
  // than only eof() guarantees termination when the stream has failed
  // without reaching end of file.
  while(is.good()) {
    line.getline(is);
    line.trimWhitespace();
    if(!line.contains("foreground")) continue;
    line.getline(is);
    line.trimWhitespace();
    foregroundStates.clear();
    BOOM::Vector<BOOM::String> &fg=*line.getFields(",");
    const int n=fg.size();
    for(int i=0 ; i<n ; ++i) foregroundStates+=fg[i].asInt();
    delete &fg;
  }

  // Construct chains
  for(int i=0 ; i<numDiscrete ; ++i) {
    HigherOrderAlphabet H(schema.getAlphabet(i),order+1);
    chains[i]=NmerChain(H);
  }

  // Convert to log space
  logifyTransitions();
}
예제 #2
0
int main(int argc, char** argv)
{
	// prints more information if the program was terminated due to an
	// exception.
	// see http://groups.yahoo.com/group/open-source-programmer/message/91
	#ifdef __GNUC__
	std::set_terminate (__gnu_cxx::__verbose_terminate_handler);
	#endif
	// parse command line arguments
	handleargs(argc, argv);

	//load an alphabet from the mass_file
	Alphabet alphabet;
	try {
		alphabet.load(alphabet_filename);
	}
	catch (IOException) {
		cerr << "Error loading alphabet \""<< alphabet_filename << "\"" << endl;
		exit(1);
	}

	cout << "Alphabet succesfully loaded:" << endl;
	cout << alphabet << endl;
	
	// copy over masses into a alphabet with fast access by character
	CharacterAlphabet fast_alphabet(alphabet);

	//now build some kind of fragmentizer
	cout << "Start building Fragmenter..." << endl;
	auto_ptr<Modifier<peaklist_type> > sort_modifier(new SortModifier<peaklist_type>);
	auto_ptr<Modifier<peaklist_type> > unification_modifier(new UnificationModifier<peaklist_type>);
	auto_ptr<MultiModifier<peaklist_type> > multi_modifier(new MultiModifier<peaklist_type>);
	multi_modifier->addModifier(sort_modifier);
	multi_modifier->addModifier(unification_modifier);
	fragmenter_type fragmenter(alphabet, cleavage_chars, prohibition_chars, with_cleavage_char);
	fragmenter.setModifier(auto_ptr<Modifier<peaklist_type> >(multi_modifier));
	cout << "...done\n" << endl;


	//now that we hopefully have some fragmentizer, we should be able to start parsing the fasta file
	//first open the stream
	ifstream fasta_stream(database_filename.c_str());
	if (!fasta_stream) {
		cerr << "Error: file \"" << database_filename << "\" could not be opened" << endl;
		exit(1);
	}

	//some vars
	//this maxlength thingie determines, how long such a sequence should be. 0 means unlimited
	const unsigned int maxlength = 0;
	string line;
	string currentString("");
	string currentID("");

	//open file for output
	ofstream ofs(output_filename.c_str());
	if (!ofs) {
		cerr << "Error: Could not open output file \""<< output_filename << "\"" << endl;
		exit(1);
	}

	//init the statistics vars
	long discarded_sequences = 0;
	long kept_sequences = 0;
	map_type letter_frequencies;

	//begin the looping
	while (getline(fasta_stream, line)) {
		if (line[0] == ' ') {
			continue; //FIXME this is not fasta file format
		}
		if(line[0] == '>') {          // check if end of sequence (rather beginng of new)
            // close current sequence (if there is one)
			if((maxlength==0 || currentString.length()<=maxlength) && currentString.length()>0) {
				//here comes the sequence handling
				//FIXME we need string toUpper, but we dont have it
				if (handle_sequence (currentString, currentID, fast_alphabet, fragmenter, letter_frequencies, ofs))
					++kept_sequences;
				else
					++discarded_sequences;
				currentString ="";
			}
            // begin new id sequence
			int k=1;
			while(isspace(line[k++]));
			currentID="";
			for(unsigned int i=k-1;i<line.length() && !(isspace(line[i]));i++) {
				currentID += line[i];
			}
		}
		else {
            // read sequence line
			for(unsigned int i=0;i<line.length();i++) {
				if(!isspace(line[i])) {
					currentString += line[i];
				}
			}
		}
	}
    // handle last sequence
	if((maxlength==0 || currentString.length()<=maxlength) && currentString.length()>0) {
		//here we have to do some handling for the last sequence
		//FIXME we need string toUpper, but we dont have it
		if (handle_sequence (currentString, currentID, fast_alphabet, fragmenter, letter_frequencies, ofs))
			++kept_sequences;
		else
			++discarded_sequences;

	}
	
	//logging
	cout << "\nAll sequences processed\n" << endl;

	//close streams
	fasta_stream.close();
	ofs.close();

	if (statistic_filename!="") {
	//do some statistics for the letter frequencies and sequence counts
		ofstream statistics(statistic_filename.c_str());
		if (!statistics) {
			cerr << "Error opening output file for statistics \""<< statistic_filename << "\"" << endl;
			exit(1);
		}
		cout << "Writing statistics to \"" << statistic_filename << "\"" << endl;
		statistics << "# Kept " << kept_sequences << ", discarded " << discarded_sequences << ", from a total of "
				<< kept_sequences + discarded_sequences << " sequences." << endl;
		statistics << "# Letter frequencies in valid sequences:" << endl;
		map_type::const_iterator it = letter_frequencies.begin();
		for ( ; it != letter_frequencies.end(); ++it) {
			statistics << it->first << "\t" << it->second << endl;
		}
		statistics.close();
	}

}
예제 #3
0
int main(int argc, char **argv) {
	Options options;
	try {
		options.parse(argc, argv);
	} catch (TCLAP::ArgException &e) {
		throw ims::Exception("Error while parsing command line: " + e.error() + " for argument " + e.argId());
	}
	Alphabet alphabet;
	try {
		alphabet.load(options.getAlphabetFileName());
	} catch (IOException& ioe) {
		cerr << "can not read alphabet: " << ioe.what() << endl;
		return 1;
	}
	Weights weights(alphabet.getMasses(), 1 /*precision*/);

	// optimize alphabet by dividing by gcd
	weights.divideByGCD();

	// get masses from -m options and/or from a file
	vector<double> masses = options.getMasses();
	if (options.hasMassFile()) {
		MassesTextParser parser;
		try {
			parser.load(options.getMassFile());
		} catch (IOException& ioe) {
			cerr << "can not read masses file: " << ioe.what() << endl;
			return 1;
		}
		vector<double> filemasses = parser.getElements();
		// append to masses
		copy(filemasses.begin(), filemasses.end(), back_inserter(masses));
	}
	if (masses.empty()) {
		cerr << "No input masses given to decompose (use -m or -f option)!\n";
		return 1;
	}

	// maximal number of decompositions to show
	unsigned int maxNumber = options.getMaxNumberDecompositions();

	printHeader(alphabet, weights, options.getMode());

	if (options.getMode() == Options::GETNUMBER) {
		ClassicalDPMassDecomposer<> decomposer(weights);

		// loop through masses
		for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) {
			value_type mass = static_cast<value_type>(*it);

			decomposition_value_t number = decomposer.getNumberOfDecompositions(mass);
			cout << "# mass " << mass << " has " << number << " decompositions" << endl;
		}
	} else { // if any other tasks defined
		// use decomposer with residues
		IntegerMassDecomposer<> decomposer(weights);

		if (options.getMode() == Options::FINDALL) {
			// loop through masses
			for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) {
				value_type mass = static_cast<value_type>(*it);
				outputDecompositions(decomposer.getAllDecompositions(mass), alphabet, mass, maxNumber);
				cout << '\n';
			}
		}
		else if (options.getMode() == Options::FINDONE) {
			// loop through masses
			for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) {
				value_type mass = static_cast<value_type>(*it);
				decomposition_t decomposition = decomposer.getDecomposition(mass);
				if (decomposition.size() == 0) {
					cout << "# mass " << mass << " has 0 decompositions\n\n";
				} else {
					cout << "# mass " << mass << " has at least this decomposition:\n";
					printDecomposition(alphabet, decomposition);
					cout << "\n\n";
				}
			}
		}
		else if (options.getMode() == Options::ISDECOMPOSABLE) {
			// loop through masses
			for (vector<double>::const_iterator it = masses.begin(); it != masses.end(); ++it) {
				value_type mass = static_cast<value_type>(*it);
				if (decomposer.exist(mass)) {
					cout << "# mass " << mass << " has at least one decomposition\n";
				} else {
					cout << "# mass " << mass << " has no decompositions\n";
				}
			}
		}
	}
	cout << "# done\n";

	return 0;
}