Beispiel #1
0
/*
 * Ask the server behind `fd` for field number `num` of `db_class` and store
 * the returned name and id in db_class->fields[num].
 *
 * Returns the result of the "id" lookup in the reply (0 is treated as
 * success by this function), or -1 when the request does not fit into the
 * send buffer.
 */
int ssds_field_data(struct db_class *db_class, int fd, int num)
{
  int ret;
  int len;
  char *myargv[16];
  int argc;
  char rbuf[132] = "";   /* empty reply if ssds_send() writes nothing */
  char sbuf[132];
  struct db_field *field;

  field = db_class->fields[num];

  /* The class name has no visible length limit here: bound the write. */
  len = snprintf(sbuf, sizeof sbuf,
                 "msg 5678:class:%s:action:get_field_num:num:%d:",
                 db_class->name, num);
  if (len < 0 || (size_t)len >= sizeof sbuf) {
    printf(" Client request too long for class (%s) \n", db_class->name);
    return -1;
  }

  ret = ssds_send(fd, sbuf, rbuf);
  if (ret < 0)
    printf(" Client res error %d \n", ret);

  argc = get_argc(rbuf, myargv, 16);
  get_arg_char(argc, myargv, "field", field->name, 32);
  ret = get_arg_int(argc, myargv, "id", &field->id);
  if (ret == 0) {
    printf(" ..  Client field data class name (%s) id %d field %s (id %d)\n",
           db_class->name, db_class->id,
           field->name, field->id);
  }
  return ret;
}
Beispiel #2
0
/*
 * Handle the "page info" inspect command: read the page number argument
 * from pCmd, look the page up in pCtxt->pDB and print it.
 *
 * Returns 0 on success, EINVAL when either argument is NULL, otherwise the
 * error reported by get_arg_int() or mdb_page_info().
 */
int
mdb_exec_pg_info(
    PMDB_INSPECT_CMD pCmd,
    PMDB_INSPECT_CONTEXT pCtxt
    )
{
    int error = 0;
    int page_number = 0;
    MDB_page *page = NULL;

    if (pCmd == NULL || pCtxt == NULL)
    {
        error = EINVAL;
        bail_on_error(error);
    }

    /* Which page was requested? */
    error = get_arg_int(pCmd, CMD_ARG_PGNO, &page_number);
    bail_on_error(error);

    /* Fetch it from the database attached to the context. */
    error = mdb_page_info(pCtxt->pDB, page_number, &page);
    bail_on_error(error);

    mdb_print_page(page);

cleanup:
    return error;

error:
    goto cleanup;
}
Beispiel #3
0
/*
 * Ask the server behind `fd` how many classes it holds.
 *
 * Returns the integer stored under "reply" in the answer. The result starts
 * out as -1 so that the return value is well defined even when
 * get_arg_int() fails without storing anything.
 */
int ssds_num_classes(int fd)
{
  int res;
  int num = -1;
  char *myargv[16];
  int argc;
  char rbuf[132] = "";   /* empty reply if ssds_send() writes nothing */
  char sbuf[132];

  snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_classes:");
  res = ssds_send(fd, sbuf, rbuf);
  if (res < 0)
    printf(" Client res error %d \n", res);

  argc = get_argc(rbuf, myargv, 16);
  res = get_arg_int(argc, myargv, "reply", &num);
  if (res < 0)
    printf(" Client res error %d \n", res);

  return num;
}
Beispiel #4
0
/*
 * Ask the server behind `fd` for item number `num` of `db_class` and store
 * the returned name and id in db_class->items[num].
 *
 * Returns the result of the "id" lookup in the reply (0 is treated as
 * success by this function), or -1 when the request does not fit into the
 * send buffer.
 */
int ssds_item_data(struct db_class *db_class, int fd, int num)
{
  int ret;
  int len;
  char *myargv[16];
  int argc;
  char rbuf[132] = "";   /* empty reply if ssds_send() writes nothing */
  char sbuf[132];
  struct db_item *item;

  item = db_class->items[num];

  /* The class name has no visible length limit here: bound the write. */
  len = snprintf(sbuf, sizeof sbuf,
                 "msg 5678:class:%s:action:get_item_num:num:%d:",
                 db_class->name, num);
  if (len < 0 || (size_t)len >= sizeof sbuf) {
    printf(" Client request too long for class (%s) \n", db_class->name);
    return -1;
  }

  ret = ssds_send(fd, sbuf, rbuf);
  if (ret < 0)
    printf(" Client res error %d \n", ret);

  argc = get_argc(rbuf, myargv, 16);
  get_arg_char(argc, myargv, "item", item->name, 32);
  ret = get_arg_int(argc, myargv, "id", &item->id);
  if (ret == 0) {
    printf(" ..  Client item data class name (%s) id %d item %s (id %d)\n",
           db_class->name, db_class->id,
           item->name, item->id);
  }

  return ret;
}
Beispiel #5
0
/*
 * Ask the server behind `fd` for class number `num` and fill `db_class`
 * with the returned name, id, item count and field count.
 *
 * Returns the sum of the three get_arg_int() results for "id", "items" and
 * "fields"; 0 is treated as success by this function.
 */
int ssds_class_data(struct db_class *db_class, int fd, int num)
{
  int ret;
  char *myargv[16];
  int argc;
  char rbuf[132] = "";   /* empty reply if ssds_send() writes nothing */
  char sbuf[132];

  snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_class_num:num:%d:", num);
  ret = ssds_send(fd, sbuf, rbuf);
  if (ret < 0)
    printf(" Client res error %d \n", ret);

  argc = get_argc(rbuf, myargv, 16);
  get_arg_char(argc, myargv, "class", db_class->name, 32);
  ret = get_arg_int(argc, myargv, "id", &db_class->id);
  ret += get_arg_int(argc, myargv, "items", &db_class->num_items);
  ret += get_arg_int(argc, myargv, "fields", &db_class->num_fields);
  if (ret == 0) {
    printf(" Client class name (%s) id %d \n", db_class->name, db_class->id);
  }

  return ret;
}
Beispiel #6
0
// Reads the integer argument named arg_0 and maps it to a range value:
// Range_UNDEF when the argument was not set, Range_FULL when it is non-zero,
// Range_TV otherwise.
ConvStep::Range	Convert::retrieve_range (const ::VSFormat &fmt, const ::VSMap &in, ::VSMap &out, const char arg_0 [])
{
	assert (arg_0 != 0);

	bool           defined_flag = false;
	const int      val = get_arg_int (
		in, out, arg_0,
		vsutl::is_full_range_default (fmt) ? 1 : 0,
		0, &defined_flag
	);

	if (! defined_flag)
	{
		return (ConvStep::Range_UNDEF);
	}
	if (val != 0)
	{
		return (ConvStep::Range_FULL);
	}

	return (ConvStep::Range_TV);
}
Beispiel #7
0
/*
 * Look up the integer stored under `rkey` in the shared generic reply
 * (gen_argc / gen_argv). When gen_rbuf is empty, ssds_get_gen_buf() is
 * called first with ssfd, sdef and skey.
 *
 * Returns the value found when get_arg_int() reports a non-negative
 * result, otherwise that negative result. While the cl_db counter is
 * non-zero a trace line is printed and the counter is decremented.
 */
int ssds_get_gen_num(int ssfd, char *sdef, char *skey, char *rkey)
{
  int num = 0;   /* defined value for the trace line if the lookup fails */
  int ret;

  if (gen_rbuf[0] == '\0') {
    ssds_get_gen_buf(ssfd, sdef, skey);
  }
  ret = get_arg_int(gen_argc, gen_argv, rkey, &num);
  if (cl_db) {
    printf("ret %d key (%s) num %d\n", ret, rkey, num);
  }
  if (ret >= 0) {
    ret = num;
  }
  if (cl_db) {
    cl_db--;
  }
  return ret;
}
Beispiel #8
0
/*
 * Ask the server behind `fd` how many items class number `class_num` has.
 *
 * Returns the integer stored under "reply" in the answer. The result starts
 * out as -1 so that the return value is well defined even when
 * get_arg_int() fails without storing anything.
 */
int ssds_num_items(int fd, int class_num)
{
  int res;
  int num = -1;
  char *myargv[16];
  int argc;
  char rbuf[132] = "";   /* empty reply if ssds_send() writes nothing */
  char sbuf[132];

  snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_items:class_num:%d:",
           class_num);
  res = ssds_send(fd, sbuf, rbuf);
  if (res < 0)
    printf(" Client res error %d \n", res);

  argc = get_argc(rbuf, myargv, 16);
  /* A failed lookup is deliberately not reported here (the original
     diagnostic was disabled); the caller sees it as num == -1. */
  (void) get_arg_int(argc, myargv, "reply", &num);

  return num;
}
Beispiel #9
0
/*
 * Set up replica-exchange (REMD) state for simulation `s` from the
 * key/value argument arrays and attach it to s->privdata.
 *
 * With a single replica (ensemble_size <= 1) only the "not doing REMD"
 * marker is stored and 0 is returned. Otherwise the exchange frequency,
 * start step and one temperature per replica (keys "T0", "T1", ...) are
 * read, rank 0 opens the trace log named by the "log" key, and the
 * temperature of this rank is handed to s->plugin_set_temperature().
 *
 * Returns 0 on success, -1 when memory cannot be allocated or when rank 0
 * has no usable log file. Invalid temperatures terminate the process.
 */
int do_remd_setup( struct aceplug_sim_t *s, int argc, char **argkey, char**argval ) {

// MJH REMD setup start
  int i;
  struct remd_t *p;

  /* calloc: fields that are not assigned below (notably on the
     single-simulation early return) start out zeroed, not indeterminate */
  p = calloc( 1, sizeof *p );
  if( p == NULL ) {
    fprintf( stderr, "# REMD : out of memory\n" );
    return -1;
  }
  p->doing_remd = 0;
  p->exchange_type = EXCH_TEMP;

  s->privdata = p;
  if( s->ensemble_size > 1 ) {
    printf( "# PLUMED Ensemble replica exchange on %d replicas.\n", s->ensemble_size );
    p->doing_remd = 1;
  }
  else {
    printf( "# PLUMED Metadynamics on single simulation.\n" );
    return 0;
  }

  p->count = 0;
  p->whoami = s->ensemble_rank;
  p->exchangefreq = get_arg_int( argc, argkey, argval, "exchangefreq" );

  p->flog = NULL;
  if( s->ensemble_rank == 0 ) {
    for( i = 0; i < argc; i++ ) {
      if( !strcmp( argkey[i], "log" ) ) {
        /* a later "log" key replaces an earlier one: do not leak the handle */
        if( p->flog != NULL ) {
          fclose( p->flog );
        }
        p->flog = fopen( argval[i], "w" ); // don't try to append, even if this is a reset (we can't tell)
      }
    }
  }

  if( p->flog == NULL && !s->ensemble_rank ) {
    fprintf( stderr, "# REMD : and log=filename for trace log\n");
    return -1;
  }

  p->exchange_type = EXCH_SYSTEM;
  p->startstep     = get_startstep( argc, argkey, argval );

  p->Tall = malloc( sizeof(double) * s->ensemble_size );
  if( p->Tall == NULL ) {
    fprintf( stderr, "# REMD : out of memory\n" );
    return -1;
  }

  printf( "#REMD Setup:\n# Replica\tTemperature\n" );
  for( i = 0; i < s->ensemble_size; i++ ) {
    char tstring[10];
    snprintf( tstring, 9, "T%d", i );
    p->Tall[i] = get_arg( argc, argkey, argval, tstring );
    printf( "# %d\t%f\n", i, p->Tall[i] );
  }
  printf("# \n");

  /* temperatures must be positive and strictly increasing with the replica index */
  for( i = 0; i < s->ensemble_size; i++ ) {
    if( p->Tall[i] <= 0. || ( i > 0 && ( p->Tall[i] <= p->Tall[i-1] ) ) ) {
      printf("# Invalid temperature for replica %d (%f)\n", i, p->Tall[i] );
      exit(-1);
    }
  }
  printf("# Rank %d Temperature %f\n", s->ensemble_rank, p->Tall[ s->ensemble_rank ] );

  s->plugin_set_temperature( p->Tall[ s->ensemble_rank ] );

  return 0;
// MJH REMD setup done
}
Beispiel #10
0
void	Convert::retrieve_output_colorspace (const ::VSMap &in, ::VSMap &out, ::VSCore &core, const ::VSFormat &fmt_src)
{
	const ::VSFormat *   fmt_dst_ptr = &fmt_src;

	// Full colorspace
	int            csp_dst = get_arg_int (in, out, "csp", ::pfNone);
	if (csp_dst != ::pfNone)
	{
		fmt_dst_ptr = _vsapi.getFormatPreset (csp_dst, &core);
		if (fmt_dst_ptr == 0)
		{
			throw_inval_arg ("unknown output colorspace.");
		}
	}

	int            col_fam  = fmt_dst_ptr->colorFamily;
	int            spl_type = fmt_dst_ptr->sampleType;
	int            bits     = fmt_dst_ptr->bitsPerSample;
	int            ssh      = fmt_dst_ptr->subSamplingW;
	int            ssv      = fmt_dst_ptr->subSamplingH;

	// Color family
	_col_fam = get_arg_int (in, out, "col_fam", col_fam);

	// Chroma subsampling
	std::string    css (get_arg_str (in, out, "css", ""));
	if (! css.empty ())
	{
		const int      ret_val = vsutl::conv_str_to_chroma_subspl (ssh, ssv, css);
		if (ret_val != 0)
		{
			throw_inval_arg ("unsupported css value.");
		}
	}

	// Destination bit depth and sample type
	bool           bits_def_flag = false;
	bool           flt_def_flag = false;
	int            flt = (spl_type != ::stInteger) ? 1 : 0;
	bits = get_arg_int (in, out, "bits", bits, 0, &bits_def_flag);
	flt  = get_arg_int (in, out, "flt" , flt,  0, &flt_def_flag );
	spl_type = (flt != 0) ? ::stFloat : ::stInteger;

	if (flt_def_flag && ! bits_def_flag)
	{
		if (spl_type == ::stFloat)
		{
			bits = 32;
		}
		else
		{
			if (bits > 16)
			{
				throw_inval_arg (
					"Cannot deduce the output bitdepth. Please specify it."
				);
			}
		}
	}
	else if (bits_def_flag && ! flt_def_flag)
	{
		if (bits >= 32)
		{
			spl_type = ::stFloat;
		}
		else
		{
			spl_type = ::stInteger;
		}
	}

	// Combines the modified parameters and validates the format
	try
	{
		fmt_dst_ptr = register_format (
			_col_fam,
			spl_type,
			bits,
			ssh,
			ssv,
			core
		);
	}
	catch (std::exception &)
	{
		throw;
	}
	catch (...)
	{
		fmt_dst_ptr = 0;
	}

	if (fmt_dst_ptr == 0)
	{
		throw_rt_err (
			"couldn\'t get a pixel format identifier for the output clip."
		);
	}

	_vi_out.format = fmt_dst_ptr;
}
Beispiel #11
0
// Builds the "matrix" filter from the argument map `in`.
// The constructor validates the source clip format, resolves the output
// format, assembles the conversion matrix _mat_main either from the preset
// names ("mat", "mats", "matd") or from the custom "coef" list, reads the
// range arguments ("fulls", "fulld") and finally calls prepare_coef ().
// Invalid arguments are reported through throw_inval_arg () / throw_rt_err ();
// if an error is set on `out` at the end, -1 is thrown.
Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSCore &core, const ::VSAPI &vsapi)
:	vsutl::FilterBase (vsapi, "matrix", ::fmParallel, 0)
,	_clip_src_sptr (vsapi.propGetNode (&in, "clip", 0, 0), vsapi)
,	_vi_in (*_vsapi.getVideoInfo (_clip_src_sptr.get ()))
,	_vi_out (_vi_in)
,	_sse_flag (false)
,	_sse2_flag (false)
,	_avx_flag (false)
,	_avx2_flag (false)
,	_range_set_src_flag (false)
,	_range_set_dst_flag (false)
,	_full_range_src_flag (false)
,	_full_range_dst_flag (false)
/*,	_mat_main ()*/
,	_csp_out (fmtcl::ColorSpaceH265_UNSPECIFIED)
,	_plane_out (get_arg_int (in, out, "singleout", -1))
,	_proc_uptr ()
{
	assert (&in != 0);
	assert (&out != 0);
	assert (&core != 0);
	assert (&vsapi != 0);

	// CPU feature flags, forwarded to the processing object
	vsutl::CpuOpt  cpu_opt (*this, in, out);
	_sse_flag  = cpu_opt.has_sse ();
	_sse2_flag = cpu_opt.has_sse2 ();
	_avx_flag  = cpu_opt.has_avx ();
	_avx2_flag = cpu_opt.has_avx2 ();

	_proc_uptr = std::unique_ptr <fmtcl::MatrixProc> (new fmtcl::MatrixProc (
		_sse_flag, _sse2_flag, _avx_flag, _avx2_flag
	));

	// Checks the input clip
	if (_vi_in.format == 0)
	{
		throw_inval_arg ("only constant pixel formats are supported.");
	}

	const ::VSFormat &   fmt_src = *_vi_in.format;

	// No chroma subsampling on input
	if (fmt_src.subSamplingW != 0 || fmt_src.subSamplingH != 0)
	{
		throw_inval_arg ("input must be 4:4:4.");
	}
	if (fmt_src.numPlanes != NBR_PLANES)
	{
		throw_inval_arg ("greyscale format not supported as input.");
	}
	// Accepted: integer 8 to 12 bits or 16 bits, float 32 bits
	if (   (   fmt_src.sampleType == ::stInteger
	        && (   fmt_src.bitsPerSample <  8
	            || fmt_src.bitsPerSample > 12)
	        && fmt_src.bitsPerSample != 16)
	    || (   fmt_src.sampleType == ::stFloat
	        && fmt_src.bitsPerSample != 32))
	{
		throw_inval_arg ("pixel bitdepth not supported.");
	}

	// Only the upper bound of "singleout" is checked here.
	// NOTE(review): the message gives the valid range as 0 to 3 while the
	// test rejects values >= NBR_PLANES -- confirm against NBR_PLANES.
	if (_plane_out >= NBR_PLANES)
	{
		throw_inval_arg (
			"singleout is a plane index and must be -1 or ranging from 0 to 3."
		);
	}

	// Destination colorspace
	// force_col_fam_flag is an output of get_output_colorspace (); _plane_out
	// is passed by reference and may be modified by the call.
	bool           force_col_fam_flag;
	const ::VSFormat *   fmt_dst_ptr = get_output_colorspace (
		in, out, core, fmt_src, _plane_out, force_col_fam_flag
	);

	if (   fmt_dst_ptr->colorFamily != ::cmGray
	    && fmt_dst_ptr->colorFamily != ::cmRGB
	    && fmt_dst_ptr->colorFamily != ::cmYUV
	    && fmt_dst_ptr->colorFamily != ::cmYCoCg)
	{
		throw_inval_arg ("unsupported color family for output.");
	}
	// Same bitdepth rule as for the input
	if (   (   fmt_dst_ptr->sampleType == ::stInteger
	        && (   fmt_dst_ptr->bitsPerSample <  8
	            || fmt_dst_ptr->bitsPerSample > 12)
	        && fmt_dst_ptr->bitsPerSample != 16)
	    || (   fmt_dst_ptr->sampleType == ::stFloat
	        && fmt_dst_ptr->bitsPerSample != 32))
	{
		throw_inval_arg ("output bitdepth not supported.");
	}
	// Output must keep the sample type and subsampling of the input and
	// must not have a lower bitdepth.
	if (   fmt_dst_ptr->sampleType    != fmt_src.sampleType
	    || fmt_dst_ptr->bitsPerSample <  fmt_src.bitsPerSample
	    || fmt_dst_ptr->subSamplingW  != fmt_src.subSamplingW
	    || fmt_dst_ptr->subSamplingH  != fmt_src.subSamplingH)
	{
		throw_inval_arg (
			"specified output colorspace is not compatible with the input."
		);
	}

	// Preliminary matrix test: deduce the target color family if unspecified
	// Only done when exactly one of "mat", "mats", "matd" is defined.
	if (   ! force_col_fam_flag
	    && fmt_dst_ptr->colorFamily != ::cmGray)
	{
		int               def_count = 0;
		def_count += is_arg_defined (in, "mat" ) ? 1 : 0;
		def_count += is_arg_defined (in, "mats") ? 1 : 0;
		def_count += is_arg_defined (in, "matd") ? 1 : 0;
		if (def_count == 1)
		{
			// Each call uses the previous value as default, so tmp_mat ends
			// up holding the single defined string.
			std::string    tmp_mat (get_arg_str (in, out, "mat", ""));
			tmp_mat = get_arg_str (in, out, "mats", tmp_mat);
			tmp_mat = get_arg_str (in, out, "matd", tmp_mat);

			fmtcl::ColorSpaceH265   tmp_csp =
				find_cs_from_mat_str (*this, tmp_mat, false);

			fmt_dst_ptr = find_dst_col_fam (tmp_csp, fmt_dst_ptr, fmt_src, core);
		}
	}

	// Output format is validated.
	_vi_out.format = fmt_dst_ptr;
	const ::VSFormat &fmt_dst = *fmt_dst_ptr;

	const int      nbr_expected_coef = NBR_PLANES * (NBR_PLANES + 1);

	// Set as soon as _mat_main has been filled by a preset or by "coef"
	bool           mat_init_flag = false;

	// Matrix presets
	// "mat" is the default for "mats" when the source is YUV, and for "matd"
	// when the destination is YUV or gray.
	std::string    mat (get_arg_str (in, out, "mat", ""));
	std::string    mats ((   fmt_src.colorFamily == ::cmYUV ) ? mat : "");
	std::string    matd ((   fmt_dst.colorFamily == ::cmYUV
	                      || fmt_dst.colorFamily == ::cmGray) ? mat : "");
	mats = get_arg_str (in, out, "mats", mats);
	matd = get_arg_str (in, out, "matd", matd);
	if (! mats.empty () || ! matd.empty ())
	{
		fstb::conv_to_lower_case (mats);
		fstb::conv_to_lower_case (matd);
		select_def_mat (mats, fmt_src);
		select_def_mat (matd, fmt_dst);

		fmtcl::Mat4    m2s;
		fmtcl::Mat4    m2d;
		make_mat_from_str (m2s, mats, true);
		make_mat_from_str (m2d, matd, false);
		_csp_out = find_cs_from_mat_str (*this, matd, false);

		// Product of the destination-side and source-side matrices
		_mat_main = m2d * m2s;

		mat_init_flag = true;
	}

	// Range
	// The *_range_set_*_flag members are passed as the last argument of
	// get_arg_int () and are filled by that call.
	_full_range_src_flag = (get_arg_int (
		in, out, "fulls" ,
		vsutl::is_full_range_default (fmt_src) ? 1 : 0,
		0, &_range_set_src_flag
	) != 0);
	_full_range_dst_flag = (get_arg_int (
		in, out, "fulld",
		vsutl::is_full_range_default (fmt_dst) ? 1 : 0,
		0, &_range_set_dst_flag
	) != 0);

	// Custom coefficients
	// When present, they overwrite any matrix built from the presets above.
	const int      nbr_coef = _vsapi.propNumElements (&in, "coef");
	const bool     custom_mat_flag = (nbr_coef > 0);
	if (custom_mat_flag)
	{
		if (nbr_coef != nbr_expected_coef)
		{
			throw_inval_arg ("coef has a wrong number of elements.");
		}

		for (int y = 0; y < NBR_PLANES + 1; ++y)
		{
			for (int x = 0; x < NBR_PLANES + 1; ++x)
			{
				// Identity by default
				_mat_main [y] [x] = (x == y) ? 1 : 0;

				// Cells covered by the list: rows below the destination
				// plane count, columns for the source planes plus the last
				// column (x == NBR_PLANES).
				if (   (x < fmt_src.numPlanes || x == NBR_PLANES)
				    &&  y < fmt_dst.numPlanes)
				{
					int            err = 0;
					const int      index = y * (fmt_src.numPlanes + 1) + x;
					const double   c = _vsapi.propGetFloat (&in, "coef", index, &err);
					if (err != 0)
					{
						throw_rt_err ("error while reading the matrix coefficients.");
					}
					_mat_main [y] [x] = c;
				}
			}
		}

		mat_init_flag = true;
	}

	if (! mat_init_flag)
	{
		throw_inval_arg (
			"you must specify a matrix preset or a custom coefficient list."
		);
	}

	prepare_coef (fmt_dst, fmt_src);

	// An error set on the output map at this point aborts the construction.
	if (_vsapi.getError (&out) != 0)
	{
		throw -1;
	}
}
Beispiel #12
0
// Resolves the output pixel format from the source format and the optional
// "csp", "col_fam" and "bits" arguments.
// force_col_fam_flag is set to true when the color family was imposed by a
// valid "csp" preset or by "col_fam". When plane_out is non-negative the
// output family is forced to gray; when the family is gray and plane_out is
// negative, plane_out is set to 0.
// Returns the registered format; failures are reported through
// throw_inval_arg () / throw_rt_err ().
const ::VSFormat *	Matrix::get_output_colorspace (const ::VSMap &in, ::VSMap &out, ::VSCore &core, const ::VSFormat &fmt_src, int &plane_out, bool &force_col_fam_flag) const
{
	assert (&in != 0);
	assert (&out != 0);
	assert (&core != 0);
	assert (&fmt_src != 0);
	assert (&plane_out != 0);

	force_col_fam_flag = false;

	// Starting point: the source format, or the preset given with "csp"
	const ::VSFormat *   base_ptr = &fmt_src;
	const int      preset = get_arg_int (in, out, "csp", ::pfNone);
	if (preset != ::pfNone)
	{
		base_ptr = _vsapi.getFormatPreset (preset, &core);
		if (base_ptr == 0)
		{
			throw_inval_arg ("unknown output colorspace.");
		}
		else
		{
			force_col_fam_flag = true;
		}
	}

	int            family      = base_ptr->colorFamily;
	const int      sample_type = base_ptr->sampleType;
	int            depth       = base_ptr->bitsPerSample;
	const int      sub_w       = base_ptr->subSamplingW;
	const int      sub_h       = base_ptr->subSamplingH;

	// Explicit color family
	if (is_arg_defined (in, "col_fam"))
	{
		force_col_fam_flag = true;
		family = get_arg_int (in, out, "col_fam", family);
	}

	// Single-plane output and gray family go together
	if (plane_out < 0)
	{
		if (family == ::cmGray)
		{
			plane_out = 0;
		}
	}
	else
	{
		family = ::cmGray;
	}

	// Destination bit depth
	depth = get_arg_int (in, out, "bits", depth);

	// Registers the combined format. Standard exceptions propagate, anything
	// else is turned into the runtime error below.
	const ::VSFormat *   result_ptr = 0;
	try
	{
		result_ptr = register_format (
			family, sample_type, depth, sub_w, sub_h, core
		);
	}
	catch (std::exception &)
	{
		throw;
	}
	catch (...)
	{
		result_ptr = 0;
	}

	if (result_ptr == 0)
	{
		throw_rt_err (
			"couldn\'t get a pixel format identifier for the output clip."
		);
	}

	return (result_ptr);
}
Beispiel #13
0
// Builds the "transfer" filter from the argument map `in`: reads the curve
// names and level arguments, validates the source clip format, resolves the
// output format and finally calls init_table ().
// Invalid arguments are reported through throw_inval_arg ().
Transfer::Transfer (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSCore &core, const ::VSAPI &vsapi)
:	vsutl::FilterBase (vsapi, "transfer", ::fmParallel, 0)
,	_clip_src_sptr (vsapi.propGetNode (&in, "clip", 0, 0), vsapi)
,	_vi_in (*_vsapi.getVideoInfo (_clip_src_sptr.get ()))
,	_vi_out (_vi_in)
,	_sse2_flag (false)
,	_avx2_flag (false)
,	_transs (get_arg_str (in, out, "transs", ""))
,	_transd (get_arg_str (in, out, "transd", ""))
,	_contrast (get_arg_flt (in, out, "cont", 1))
,	_gcor (get_arg_flt (in, out, "gcor", 1))
,	_lvl_black (get_arg_flt (in, out, "blacklvl", 0))
,	_full_range_src_flag (get_arg_int (in, out, "fulls", 1) != 0)
,	_full_range_dst_flag (get_arg_int (in, out, "fulld", 1) != 0)
,	_curve_s (fmtcl::TransCurve_UNDEF)
,	_curve_d (fmtcl::TransCurve_UNDEF)
,	_loglut_flag (false)
,	_plane_processor (vsapi, *this, "transfer", true)
,	_lut_uptr ()
{
	assert (&in != 0);
	assert (&out != 0);
	assert (&core != 0);
	assert (&vsapi != 0);

	// Curve names are handled in lower case
	fstb::conv_to_lower_case (_transs);
	fstb::conv_to_lower_case (_transd);

	// CPU feature flags
	vsutl::CpuOpt  cpu_opt (*this, in, out);
	_sse2_flag = cpu_opt.has_sse2 ();
	_avx2_flag = cpu_opt.has_avx2 ();

	// Source clip: constant format, gray or RGB
	if (_vi_in.format == 0)
	{
		throw_inval_arg ("only constant pixel formats are supported.");
	}

	const ::VSFormat &   fmt_src = *_vi_in.format;

	const bool     src_fam_ok_flag =
		(   fmt_src.colorFamily == ::cmGray
		 || fmt_src.colorFamily == ::cmRGB);
	if (! src_fam_ok_flag)
	{
		throw_inval_arg ("unsupported color family.");
	}

	// Source bitdepth: integer 8 to 16 bits, float 32 bits
	bool           src_depth_bad_flag = false;
	if (fmt_src.sampleType == ::stInteger)
	{
		src_depth_bad_flag =
			(fmt_src.bitsPerSample < 8 || fmt_src.bitsPerSample > 16);
	}
	else if (fmt_src.sampleType == ::stFloat)
	{
		src_depth_bad_flag = (fmt_src.bitsPerSample != 32);
	}
	if (src_depth_bad_flag)
	{
		throw_inval_arg ("pixel bitdepth not supported.");
	}

	// Destination colorspace
	const ::VSFormat& fmt_dst =
		get_output_colorspace (in, out, core, fmt_src);

	// Destination bitdepth: integer 16 bits, float 32 bits
	bool           dst_depth_bad_flag = false;
	if (fmt_dst.sampleType == ::stInteger)
	{
		dst_depth_bad_flag = (fmt_dst.bitsPerSample != 16);
	}
	else if (fmt_dst.sampleType == ::stFloat)
	{
		dst_depth_bad_flag = (fmt_dst.bitsPerSample != 32);
	}
	if (dst_depth_bad_flag)
	{
		throw_inval_arg ("output bitdepth not supported.");
	}

	// Output format is validated.
	_vi_out.format = &fmt_dst;

	init_table ();
}
Beispiel #14
0
// Resolves the output pixel format: same color family and subsampling as the
// source, with sample type and bitdepth adjusted by the optional "flt" and
// "bits" arguments. When only one of the two is given, the other one is
// deduced (float implies 32 bits; 32 bits or more implies float).
// Any failure while registering the format is reported via throw_rt_err ().
const ::VSFormat &	Transfer::get_output_colorspace (const ::VSMap &in, ::VSMap &out, ::VSCore &core, const ::VSFormat &fmt_src) const
{
	assert (&in != 0);
	assert (&out != 0);
	assert (&core != 0);
	assert (&fmt_src != 0);

	// Sentinel used as default value to detect arguments left undefined
	const int      undef    = -666666666;
	const int      arg_flt  = get_arg_int (in, out, "flt" , undef);
	const int      arg_bits = get_arg_int (in, out, "bits", undef);
	const bool     flt_given_flag  = (arg_flt  != undef);
	const bool     bits_given_flag = (arg_bits != undef);

	const int      family      = fmt_src.colorFamily;
	int            sample_type = fmt_src.sampleType;
	int            depth       = fmt_src.bitsPerSample;
	const int      sub_w       = fmt_src.subSamplingW;
	const int      sub_h       = fmt_src.subSamplingH;

	// Data type
	if (flt_given_flag)
	{
		if (arg_flt == 0)
		{
			sample_type = ::stInteger;
		}
		else
		{
			sample_type = ::stFloat;
			if (! bits_given_flag)
			{
				depth = 32;
			}
		}
	}

	// Bitdepth
	if (bits_given_flag)
	{
		depth = arg_bits;
		if (! flt_given_flag)
		{
			sample_type = (depth < 32) ? ::stInteger : ::stFloat;
		}
	}

	// Combines the modified parameters and validates the format
	const ::VSFormat *   result_ptr = 0;
	try
	{
		result_ptr = register_format (
			family, sample_type, depth, sub_w, sub_h, core
		);
	}
	catch (...)
	{
		result_ptr = 0;
	}
	if (result_ptr == 0)
	{
		throw_rt_err (
			"couldn\'t get a pixel format identifier for the output clip."
		);
	}

	return (*result_ptr);
}
/*
 * phyloFit entry point: parses the command line into a phyloFit_struct,
 * reads the alignment (from --msa or the first non-option argument) and
 * hands everything to run_phyloFit().
 */
int main(int argc, char *argv[]) {
  char *msa_fname = NULL, *alph = "ACGT";
  msa_format_type input_format = UNKNOWN_FORMAT;
  /* getopt_long() returns an int; storing it in a char makes the
     comparison with -1 fail forever where char is unsigned */
  int c;
  int opt_idx, seed=-1;
  String *optstr;
  List *tmplist = NULL; 
  struct phyloFit_struct *pf;
  FILE *infile;
  
  struct option long_opts[] = {
    {"msa", 1, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"subst-mod", 1, 0, 's'},
    {"msa-format", 1, 0, 'i'},
    {"nrates", 1, 0, 'k'},
    {"alpha", 1, 0, 'a'},
    {"features", 1, 0, 'g'},
    {"catmap", 1, 0, 'c'},
    {"log", 1, 0, 'l'},
    {"out-root", 1, 0, 'o'},
    {"EM", 0, 0, 'E'},
    {"error", 1, 0, 'e'},
    {"precision", 1, 0, 'p'},
    {"do-cats", 1, 0, 'C'},
    {"non-overlapping", 0, 0, 'V'},
    {"markov", 0, 0, 'N'},
    {"reverse-groups", 1, 0, 'R'},
    {"init-model", 1, 0, 'M'},
    {"init-random", 0, 0, 'r'},
    {"init-parsimony", 0, 0, 'y'},
    {"print-parsimony", 1, 0, 'Y'},
    {"lnl", 0, 0, 'L'},
    {"scale-only", 0, 0, 'B'},
    {"scale-subtree", 1, 0, 'S'},
    {"estimate-freqs", 0, 0, 'F'},
    {"sym-freqs", 0, 0, 'W'},
    {"no-freqs", 0, 0, 'f'},
    {"no-rates", 0, 0, 'n'},
    {"no-opt", 1, 0, 'O'},
    {"min-informative", 1, 0, 'I'},
    {"gaps-as-bases", 0, 0, 'G'},     
    {"quiet", 0, 0, 'q'},
    {"help", 0, 0, 'h'},
    {"windows", 1, 0, 'w'},
    {"windows-explicit", 1, 0, 'v'},
    {"ancestor", 1, 0, 'A'},
    {"post-probs", 0, 0, 'P'},
    {"expected-subs", 0, 0, 'X'},
    {"expected-total-subs", 0, 0, 'Z'},
    {"expected-subs-col", 0, 0, 'J'},
    {"column-probs", 0, 0, 'U'},
    {"rate-constants", 1, 0, 'K'},
    {"ignore-branches", 1, 0, 'b'},
    {"clock", 0, 0, 'z'},
    {"alt-model", 1, 0, 'd'},
    {"label-branches", 1, 0, 0},
    {"label-subtree", 1, 0, 0},
    {"selection", 1, 0, 0},
    {"bound", 1, 0, 'u'},
    {"seed", 1, 0, 'D'},
    {0, 0, 0, 0}
  };

  // NOTE: remaining shortcuts left: HjQx

  pf = phyloFit_struct_new(0);

  while ((c = getopt_long(argc, argv, "m:t:s:g:c:C:i:o:k:a:l:w:v:M:p:A:I:K:S:b:d:O:u:Y:e:D:GVENRqLPXZUBFfnrzhWyJ", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 't':
      if (optarg[0] == '(')        /* in this case, assume topology given
                                   at command line */
        pf->tree = tr_new_from_string(optarg);
      else 
        pf->tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 's':
      pf->subst_mod = tm_get_subst_mod_type(optarg);
      if (pf->subst_mod == UNDEF_MOD) 
        die("ERROR: illegal substitution model.     Type \"phyloFit -h\" for usage.\n");
      break;
    case 'g':
      pf->gff = gff_read_set(phast_fopen(optarg, "r"));
      break;
    case 'c':
      pf->cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      pf->cats_to_do_str = get_arg_list(optarg);
      break;
    case 'V':
      pf->nonoverlapping = TRUE;
      break;
    case 'o':
      pf->output_fname_root = optarg;
      break;
    case 'k':
      pf->nratecats = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'a':
      pf->alpha = get_arg_dbl(optarg);
      break;
    case 'R':
      pf->reverse_group_tag = optarg;
      break;
    case 'i':
      input_format = msa_str_to_format(optarg);
      if (input_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.    Type 'phyloFit -h' for usage.\n");
      break;
    case 'l':
      /* "-" sends the log to stderr */
      if (!strcmp(optarg, "-"))
        pf->logf = stderr;
      else pf->logf = phast_fopen(optarg, "w+");
      break;
    case 'N':
      pf->use_conditionals = 1;
      break;
    case 'w':
      /* expects exactly two integers: window size and window shift */
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) != 2 ||
          str_as_int(lst_get_ptr(tmplist, 0), &(pf->window_size)) != 0 ||
          str_as_int(lst_get_ptr(tmplist, 1), &(pf->window_shift)) != 0) 
        die("ERROR: illegal arguments to --windows.\n");
      lst_free_strings(tmplist);
      lst_free(tmplist);
      break;
    case 'v':
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) % 2 != 0) 
        die("ERROR: argument to --windows-explicit must be a list of even length.\n");
      pf->window_coords = str_list_as_int(tmplist);
      lst_free(tmplist);
      break;
    case 'E':
      pf->use_em = TRUE;
      break;
    case 'e':
      pf->error_fname=optarg;
      break;
    case 'p':
      if (!strcmp(optarg, "LOW")) pf->precision = OPT_LOW_PREC;
      else if (!strcmp(optarg, "MED")) pf->precision = OPT_MED_PREC;
      else if (!strcmp(optarg, "HIGH")) pf->precision = OPT_HIGH_PREC;
      else if (!strcmp(optarg, "VERY_HIGH")) pf->precision = OPT_VERY_HIGH_PREC;
      else die("ERROR: --precision must be LOW, MED, or HIGH.\n\n");
      break;
    case 'M':
      pf->input_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 'r':
      pf->random_init = TRUE;
      break;
    case 'y':
      pf->init_parsimony = TRUE;
      break;
    case 'Y':
      pf->init_parsimony = TRUE;
      pf->parsimony_cost_fname = optarg;
      pf->parsimony_only=TRUE;
      break; 
    case 'L':
      pf->likelihood_only = TRUE;
      break;
    case 'q':
      pf->quiet = TRUE;
      break;
    case 'P':
      pf->do_bases = TRUE;
      break;
    case 'X':
      pf->do_expected_nsubst = TRUE;
      break;
    case 'Z':
      pf->do_expected_nsubst_tot = TRUE;
      break;
    case 'J':
      pf->do_expected_nsubst_col = TRUE;
      break;
    case 'U':
      pf->likelihood_only = TRUE;        /* force -L */
      pf->nsites_threshold = 0;        /* also force this; typical use is
                                   with small number of tuples, no
                                   tuple_idx */
      pf->do_column_probs = TRUE;
      break;
    case 'A':
      pf->root_seqname = optarg;
      break;
    case 'I':
      pf->nsites_threshold = get_arg_int(optarg);
      break;
    case 'G':
      pf->gaps_as_bases = TRUE;
      alph = "ACGT-";
      break;
    case 'B':
      pf->estimate_scale_only = TRUE;
      break;
    case 'S':
      pf->subtree_name = optarg;
      break;       
    case 'F':
      pf->estimate_backgd = TRUE;
      break;
    case 'W':
      pf->estimate_backgd = TRUE;
      pf->symfreq = TRUE;
      break;
    case 'f':
      pf->no_freqs = TRUE;
      break;
    case 'n':
      pf->no_rates = TRUE;
      break;
    case 'K':
      tmplist = get_arg_list(optarg);
      pf->rate_consts = str_list_as_dbl(tmplist);
      pf->nratecats = lst_size(pf->rate_consts);
      pf->use_em = 1;
      lst_free_strings(tmplist); lst_free(tmplist);
      break;
    case 'b':
      pf->ignore_branches = get_arg_list(optarg);
      break;
    case 'z':
      pf->assume_clock = TRUE;
      break;
    case 'O':
      if (pf->nooptstr == NULL) 
        pf->nooptstr = str_new_charstr(optarg);
      else die("ERROR: no-opt argument can only be used once!  parameters can be comma-separated list.");
      break;
    case 'd':
      if (pf->alt_mod_str == NULL) {
        pf->alt_mod_str = lst_new_ptr(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->alt_mod_str, optstr);
      break;
    case 0:
      /* long options without a short equivalent are told apart by name */
      if (strcmp(long_opts[opt_idx].name, "label-branches") == 0 ||
          strcmp(long_opts[opt_idx].name, "label-subtree") == 0) {
        optstr = str_new_charstr(optarg);
        if (pf->label_str == NULL) {
          pf->label_str = lst_new_ptr(3);
          pf->label_type = lst_new_int(3);
        }
        lst_push_ptr(pf->label_str, optstr);
        lst_push_int(pf->label_type, 
                     strcmp(long_opts[opt_idx].name, "label-branches") == 0 ? 
                     BRANCH_TYPE : SUBTREE_TYPE);
      }
      else if (strcmp(long_opts[opt_idx].name, "selection") == 0) {
        pf->selection = get_arg_dbl(optarg);
        pf->use_selection = TRUE;
      }
      else {
        die("ERROR: unknown option.  Type 'phyloFit -h' for usage.\n");
      }
      break;
    case 'u':
      if (pf->bound_arg == NULL) 
        pf->bound_arg = lst_new_ptr(1);
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->bound_arg, optstr);
      break;
    case 'D':
      seed = get_arg_int_bounds(optarg, 1, INFTY);
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: illegal argument.     Type 'phyloFit -h' for usage.\n");
    }
  }

  set_seed(seed);

  /* without --msa, the alignment is the first non-option argument */
  if (msa_fname == NULL) {
    if (optind >= argc) 
      die("ERROR: missing alignment filename.  Type 'phyloFit -h' for usage.\n");
    msa_fname = argv[optind];
    pf->msa_fname = msa_fname;
  }

  infile = phast_fopen(msa_fname, "r");

  if (input_format == UNKNOWN_FORMAT)
    input_format = msa_format_for_content(infile, 1);

  if (pf->nonoverlapping && (pf->use_conditionals || pf->gff != NULL || 
                             pf->cats_to_do_str || input_format == SS))
    die("ERROR: cannot use --non-overlapping with --markov, --features,\n--msa-format SS, or --do-cats.\n");


  /* read alignment */
  if (!pf->quiet) fprintf(stderr, "Reading alignment from %s ...\n", msa_fname);
  if (input_format == MAF) {
    pf->msa = maf_read(infile, NULL, 
                       tm_order(pf->subst_mod) + 1, 
                       NULL, pf->gff, pf->cm, 
                       pf->nonoverlapping ? tm_order(pf->subst_mod) + 1 : -1, 
                       FALSE, pf->reverse_group_tag, NO_STRIP, FALSE);
    if (pf->gaps_as_bases) 
      msa_reset_alphabet(pf->msa, alph);
  }
  else 
    pf->msa = msa_new_from_file_define_format(infile, 
                                input_format, alph);

  /* set up for categories */
  /* first label sites, if necessary */
  pf->label_categories = (input_format != MAF);

  run_phyloFit(pf);

  if (pf->logf != NULL && pf->logf != stderr && pf->logf != stdout)
    phast_fclose(pf->logf);
  if (!pf->quiet) fprintf(stderr, "Done.\n");
  sfree(pf);
  
  return 0;
}
Beispiel #16
0
/*
 * Gets the classes, then the number of fields and items of each class,
 * from the server behind `fd`, printing what it finds.
 *
 * Returns the result of the last get_arg_int() call that was made.
 */
int test_ssds(int fd)
{
  int ret;
  int res;
  int num;
  int i;
  int id;
  char *myargv[16];
  int argc;
  int fields;
  int items;
  char myclass[32] = "";   /* printable even if no "class" field arrives */
  char rbuf[132] = "";     /* empty reply if ssds_send() writes nothing */
  char sbuf[132];

  ret = 0;
  num = -1;

  snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_classes:");
  res = ssds_send(fd, sbuf, rbuf);
  if (res < 0)
    printf(" Client res error %d \n", res);

  argc = get_argc(rbuf, myargv, 16);
  ret = get_arg_int(argc, myargv, "reply", &num);

  for (i = 0; i < num; i++) {
    snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_class_num:num:%d:", i);
    res = ssds_send(fd, sbuf, rbuf);
    if (res < 0)
      printf(" Client res error %d \n", res);
    argc = get_argc(rbuf, myargv, 16);
    get_arg_char(argc, myargv, "class", myclass, 32);
    ret = get_arg_int(argc, myargv, "id", &id);
    if (ret == 0) {
      /* -1 is printed for a count whose "reply" field cannot be read */
      fields = -1;
      items = -1;

      printf(" Client class name (%s) id %d \n", myclass, id);
      snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_fields:id:%d:", id);
      res = ssds_send(fd, sbuf, rbuf);
      if (res < 0)
        printf(" Client res error %d \n", res);

      argc = get_argc(rbuf, myargv, 16);
      get_arg_char(argc, myargv, "class", myclass, 32);
      ret = get_arg_int(argc, myargv, "reply", &fields);

      snprintf(sbuf, sizeof sbuf, "msg 5678:action:get_items:id:%d:", id);
      res = ssds_send(fd, sbuf, rbuf);
      if (res < 0)
        printf(" Client res error %d \n", res);
      argc = get_argc(rbuf, myargv, 16);
      get_arg_char(argc, myargv, "class", myclass, 32);
      ret = get_arg_int(argc, myargv, "reply", &items);

      printf(" Client class (%s) id %d num fields %d num items %d\n",
             myclass, id, fields, items);

    } else {
      printf(" Client class name NOT_FOUND \n");
    }
  }

  return ret;
}
Beispiel #17
0
/*
 * hmm_train entry point.
 *
 * For each alignment named with -m, reads the alignment and the
 * matching annotation file from -g, labels the alignment sites by
 * category using the category map from -c, and accumulates transition,
 * state and begin counts.  The HMM estimated from the cumulative counts
 * is written to stdout.  With -I (which requires -t) categories are
 * additionally expanded by gap pattern.
 *
 * Returns 0 on success; fatal problems are reported through die().
 */
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    int c;                      /* must be int: getopt() returns int, and
                                   where char is unsigned a char can never
                                   compare equal to -1 */
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    /* NOTE(review): the option string accepts -P and -G but there are no
       cases for them below, so they are silently ignored -- confirm
       whether that is intended */
    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else {
                /* close the tree file once it has been parsed */
                F = phast_fopen(optarg, "r");
                tree = tr_new_from_file(F);
                phast_fclose(F);
            }
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    /* NOTE(review): because -m is required by the first check, the third
       check makes any use of -M fatal, so the "lengths only" branches
       further down look unreachable -- confirm the intended semantics */
    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        /* close the annotation file after reading so that one handle is
           not leaked per alignment */
        F = phast_fopen(gff_fname->chars, "r");
        gff = gff_read_set(F);
        phast_fclose(F);

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        if (msa_gap_patterns != NULL) {
            sfree(msa_gap_patterns);
            msa_gap_patterns = NULL;  /* don't carry a dangling pointer
                                         into the next iteration */
        }
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
/*
 * eval_predictions entry point.
 *
 * Compares each file of real annotations (-r) with the corresponding
 * file of predictions (-p), considering only features accepted by
 * is_exon() for the feature list (-f, defaulting to GFF_CDS_TYPE).
 * Predicted exons are classified against real exons (CR, PC, OL, WE;
 * unmatched real exons are ME), nucleotide-level true/false positives
 * are tallied, and one line of statistics is printed per file pair,
 * followed by a TOTAL line when more than one pair was given.  -l gives
 * the sequence length(s), -d dumps individual exons using the given
 * prefix, and -n enables "nearly correct" counting with the given
 * threshold.
 *
 * Returns 0 on success; fatal problems are reported through die().
 */
int main(int argc, char* argv[]) {
  FILE* F;
  GFF_Set *gff_real=NULL, *gff_pred=NULL;
  int c;                        /* must be int: getopt() returns int, and
                                   where char is unsigned a char can never
                                   compare equal to -1 */
  List *real_fname_list = NULL, *pred_fname_list = NULL, 
    *feat_list = NULL, *seq_len_list = NULL, *l = NULL;
  int nfile, i, j;
  char *prefix = NULL;
  int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, 
    tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, 
    tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, 
    tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, 
    nc_threshold = 0;

  while ((c = getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) {
    switch(c) {
    case 'r':
      real_fname_list = get_arg_list(optarg);
      break;
    case 'p':
      pred_fname_list = get_arg_list(optarg);
      break;
    case 'l':
      l = get_arg_list(optarg);
      /* convert to ints */
      seq_len_list = lst_new_int(lst_size(l));
      for (i = 0; i < lst_size(l); i++) {
        int tmp;
        if (str_as_int((String*)lst_get_ptr(l, i), 
                       &tmp) != 0) {
          die("ERROR: Bad integer in <seq_len_list>.\n"); 
        }
        lst_push_int(seq_len_list, tmp);
      }
      break;
    case 'f':
      feat_list = get_arg_list(optarg);
      break;
    case 'd':
      dump_exons = 1;
      prefix = optarg;
      break;
    case 'n':
      /* switching from -1 to 0 enables "nearly correct" counting */
      nnc = tot_nnc = 0;
      nc_threshold = get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"eval_predictions -h\" for help.\n");
    }
  }

  set_seed(-1);

  /* default feature list: just the CDS type */
  if (feat_list == NULL) {
    feat_list = lst_new_ptr(1);
    lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE));
  }
  
  if (real_fname_list == NULL || pred_fname_list == NULL || 
      seq_len_list == NULL) {
    die("ERROR: Must specify -r, -p, and -l.  Try \"eval_predictions -h\" for help.\n");
  }

  if (lst_size(real_fname_list) != lst_size(pred_fname_list)) {
    die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n.");
  }

  /* a single sequence length is applied to every file pair */
  if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1)
    for (i = 1; i < lst_size(real_fname_list); i++)
      lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0));
  else if (lst_size(seq_len_list) != lst_size(real_fname_list))
    die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n");

  /* print header */
  printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE");
  if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp");
  printf("\n");

  for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) {
    int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, 
      npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen,
      already_counted_real;
    String *real_fname, *pred_fname;
    GFF_Feature *feat_real, *feat_pred=NULL;

    real_fname = (String*)lst_get_ptr(real_fname_list, nfile);
    F = phast_fopen(real_fname->chars, "r");
    if ((gff_real = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          real_fname->chars);
    }
    phast_fclose(F);

    pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile);
    F = phast_fopen(pred_fname->chars, "r");
    if ((gff_pred = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          pred_fname->chars);
    }
    phast_fclose(F);

    seqlen = lst_get_int(seq_len_list, nfile);

    /* sort ungrouped -- only cds exons will be considered, and each
       one will be considered individually */
    gff_ungroup(gff_real); 
    gff_ungroup(gff_pred);
    gff_sort(gff_real);
    gff_sort(gff_pred);

    nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = 
      nolp = tp = fp = nreal_pos = npred_pos = 0;
    if (nnc != -1) nnc = 0;
    i = j = 0;                  /* i indexes real features, j predictions */
    already_counted_real = 0;
    while (i < lst_size(gff_real->features)) {
      feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i);
      if (!is_exon(feat_real, feat_list)) { i++; continue; }

      len_real = feat_real->end - feat_real->start + 1;

      if (!already_counted_real) {
        nexons_real++;
        nreal_pos += len_real;
      }

      /* look at all predicted exons up to and overlapping this real exon */
      while (j < lst_size(gff_pred->features)) {
        feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j);
        if (!is_exon(feat_pred, feat_list)) {
          j++;
          continue;
        }
        else if (feat_pred->start > feat_real->end) {
          /* next prediction lies entirely beyond this real exon */
          if (!already_counted_real) {
            nme++;
            if (dump_exons) dump(prefix, feat_real, NULL, ME, -1);
          }
          break;
        }

        /* otherwise we have a predicted exon to count (start of pred
           <= end of real) */
        nexons_pred++;
        len_pred = feat_pred->end - feat_pred->start + 1;
        npred_pos += len_pred;
        j++;                    /* we'll be done with this prediction
                                   one way or another; next time
                                   through look at a new one */

        if (feat_pred->end < feat_real->start) { /* WE */
          nwe++;
          fp += len_pred;
          if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0);
        }
        else if (feat_pred->start == feat_real->start && /* CR */
                 feat_pred->end == feat_real->end) {
          ncr++;
          tp += len_pred;
          if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1);
          break;
        }
        else if (feat_pred->start == feat_real->start || /* PC */
                 feat_pred->end == feat_real->end) {
          pred_type type;
          npca++;
          npcp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;
          if (len_pred < len_real) 
            tp += len_pred;
          else {
            tp += len_real;
            fp += (len_pred - len_real);
          }
          if (dump_exons) dump(prefix, feat_real, feat_pred, type, 
                               min(1, (double)len_real/len_pred));
          break;
        }
        else {                  /* OL */
          int overlap_size;
          pred_type type;
          nola++;
          nolp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          /* NOTE(review): this is the overlap branch, yet the dumped
             type is PC -- looks copied from the PC branch; confirm
             whether an OL type was intended */
          else type = PC;

          overlap_size = min(feat_pred->end, feat_real->end) - 
            max(feat_pred->start, feat_real->start) + 1;
          tp += overlap_size;
          fp += len_pred - overlap_size;
          if (dump_exons) dump(prefix, feat_real, feat_pred, type,
                               (double)overlap_size/len_pred);
          break;
        }
        /* NOTE: I'm ignoring the possibility that a predicted exon
           could be a PC and/or OL with respect to multiple real
           exons.  The effect on the exon-level stats will be fairly
           minor (at worst a predicted exon is scored as an OL when it
           should be scored as an PC, and a real exon is erroneously
           counted as a ME), but the effect on the nucleotide-level Sn
           and Sp could conceivably be significant.  */
      }

      /* if we have counted at least one prediction (and thus failed
         to reach the end of the list), but the last prediction did
         not extend as far as the end of the real exon, then delay
         moving on to the next real exon */
      if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) 
          already_counted_real = 1;
      else {
        /* if we reached the end of the list of predictions, then it
           must not have contained any exons, and the real exon in
           question is a ME (if it hasn't already been counted) */
        if (j == lst_size(gff_pred->features) && !already_counted_real) 
          nme++; 

        i++;
        already_counted_real = 0;
      }
    }
    
    /* any remaining predictions must be wrong */
    /* NOTE(review): unlike the WE case above, fp and npred_pos are not
       updated here and nothing is dumped -- confirm whether the
       nucleotide-level totals should include these predictions */
    for (; j < lst_size(gff_pred->features); j++) {
      if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), 
                  feat_list)) {
        nexons_pred++;
        nwe++;
      }
    }

    compute_and_print_stats(stdout, real_fname, pred_fname, 
                            tp, fp, nreal_pos, npred_pos, seqlen, ncr, 
                            npca, nola, nme, npcp, nolp, nwe, 
                            nexons_real, nexons_pred, nnc);

    /* fold this file pair into the running totals */
    tot_tp += tp;
    tot_fp += fp;
    tot_nreal_pos += nreal_pos;
    tot_npred_pos += npred_pos;
    tot_seqlen += seqlen;
    tot_ncr += ncr;
    tot_npca += npca;
    tot_nola += nola;
    tot_nme += nme;
    tot_npcp += npcp;
    tot_nolp += nolp;
    tot_nwe += nwe;
    tot_nexons_real += nexons_real;
    tot_nexons_pred += nexons_pred;
    if (nnc != -1) tot_nnc += nnc;

    if (dump_exons && SUMF != NULL)
      fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos);

    gff_free_set(gff_real);
    gff_free_set(gff_pred);
  }

  if (lst_size(real_fname_list) > 1)
    compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), 
                            tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, 
                            tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, 
                            tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, 
                            tot_nexons_pred, tot_nnc);

  return 0;
}
Beispiel #19
0
int main(int argc, char *argv[]) {
    char c;
    List *l;
    int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1,
                      winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves,
                      refidx = 1, base_by_base = FALSE, windowWig = FALSE;
    TreeModel **backgd_mods = NULL, **feat_mods = NULL;
    HMM *backgd_hmm = NULL, *feat_hmm = NULL;
    msa_format_type inform = UNKNOWN_FORMAT;
    GFF_Set *features = NULL;
    MSA *msa, *msa_compl=NULL;
    double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions,
           *winscore_pos=NULL, *winscore_neg=NULL;
    int *no_alignment=NULL;
    List *pruned_names;
    char *msa_fname;
    FILE *infile;

    int opt_idx;
    struct option long_opts[] = {
        {"background-mods", 1, 0, 'b'},
        {"background-hmm", 1, 0, 'B'},
        {"feature-mods", 1, 0, 'f'},
        {"feature-hmm", 1, 0, 'F'},
        {"features", 1, 0, 'g'},
        {"window", 1, 0, 'w'},
        {"window-wig", 1, 0, 'W'},
        {"base-by-base", 0, 0, 'y'},
        {"msa-format", 1, 0, 'i'},
        {"refidx", 1, 0, 'r'},
        {"output-bed", 0, 0, 'd'},
        {"verbose", 0, 0, 'v'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'B':
            backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'b':
            l = get_arg_list(optarg);
            backgd_nmods = lst_size(l);
            backgd_mods = smalloc(backgd_nmods * sizeof(void*));
            for (i = 0; i < backgd_nmods; i++)
                backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'F':
            feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'f':
            l = get_arg_list(optarg);
            feat_nmods = lst_size(l);
            feat_mods = smalloc(feat_nmods * sizeof(void*));
            for (i = 0; i < feat_nmods; i++)
                feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1);
            lst_free_strings(l);
            lst_free(l);
            break;
        case 'g':
            features = gff_read_set(phast_fopen(optarg, "r"));
            break;
        case 'w':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            break;
        case 'W':
            winsize = get_arg_int(optarg);
            if (winsize <= 0) die("ERROR: window size must be positive.\n");
            windowWig = TRUE;
            break;
        case 'y':
            base_by_base = TRUE;
            break;
        case 'i':
            inform = msa_str_to_format(optarg);
            if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n");
            break;
        case 'r':
            refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'd':
            bed_output = 1;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case 'v':
            verbose = 1;
            break;
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    set_seed(-1);

    if (backgd_mods == NULL || feat_mods == NULL)
        die("ERROR: -b and -f required.  Try '%s -h'.\n", argv[0]);

    if (backgd_nmods == 1 && backgd_hmm == NULL)
        backgd_hmm = hmm_create_trivial();
    else if (backgd_hmm == NULL)
        die("ERROR: -B required.  Try '%s -h'.\n", argv[0]);

    if (feat_nmods == 1 && feat_hmm == NULL)
        feat_hmm = hmm_create_trivial();
    else if (feat_hmm == NULL)
        die("ERROR: -F required.  Try '%s -h'.\n", argv[0]);

    if ((winsize == -1 && features == NULL && !base_by_base) ||
            (winsize != -1 && features != NULL) ||
            (winsize != -1 && base_by_base) ||
            (features != NULL && base_by_base))
        die("ERROR: must specify exactly one of -g, -w, and -y.  Try '%s -h'.\n", argv[0]);

    if (backgd_hmm->nstates != backgd_nmods)
        die("ERROR: number of states must equal number of tree models for background.\n");

    if (feat_hmm->nstates != feat_nmods)
        die("ERROR: number of states must equal number of tree models for features.\n");

    if (features != NULL && lst_size(features->features) == 0)
        die("ERROR: empty features file.\n");

    if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1))
        die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n");

    if (optind != argc - 1)
        die("ERROR: too few arguments.  Try '%s -h'.\n", argv[0]);

    if (verbose) fprintf(stderr, "Reading alignment ...\n");
    msa_fname = argv[optind];
    infile = phast_fopen(msa_fname, "r");
    if (inform == UNKNOWN_FORMAT)
        inform = msa_format_for_content(infile, 1);
    if (inform == MAF)
        msa = maf_read(infile, NULL, 1, NULL, NULL,
                       NULL, -1, TRUE, NULL, NO_STRIP, FALSE);
    else
        msa = msa_new_from_file_define_format(infile, inform, NULL);
    if (msa_alph_has_lowercase(msa)) msa_toupper(msa);
    msa_remove_N_from_alph(msa);

    /* need ordered representation of alignment */
    if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) )
        die("ERROR: ordered sufficient statistics are required.\n");

    pruned_names = lst_new_ptr(msa->nseqs);
    for (i = 0; i < backgd_nmods; i++) {
        old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(backgd_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    for (i = 0; i < feat_nmods; i++) {
        old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2;
        tm_prune(feat_mods[i], msa, pruned_names);
        if (lst_size(pruned_names) >= old_nleaves)
            die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1);
        else if (lst_size(pruned_names) > 0) {
            fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1);
            for (j = 0; j < lst_size(pruned_names); j++)
                fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars,
                        j < lst_size(pruned_names) - 1 ? ", " : ").\n");
        }
        lst_free_strings(pruned_names);
    }
    lst_free(pruned_names);

    /* first have to subtract offset from features, if necessary */
    if (msa->idx_offset != 0 && features != NULL) {
        for (i = 0; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            f->start -= msa->idx_offset;
            f->end -= msa->idx_offset;
        }
    }

    /* convert to coord frame of alignment */
    if (features != NULL && refidx != 0) {
        if (verbose) fprintf(stderr, "Mapping coordinates ...\n");
        msa_map_gff_coords(msa, features, refidx, 0, 0);
        if (lst_size(features->features) == 0)
            die("ERROR: no features within coordinate range of alignment.\n");
    }

    /* Make a reverse complemented copy of the alignment.  The two
       strands will be processed separately, to avoid problems with
       overlapping features, etc. */
    if (!base_by_base) {          /* skip in base by base case */
        if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n");
        msa_compl = msa_create_copy(msa, 0);
        /* temporary workaround: make sure reverse complement not based on
           sufficient stats */
        if (msa_compl->seqs == NULL) ss_to_msa(msa_compl);
        if (msa_compl->ss != NULL) {
            ss_free(msa_compl->ss);
            msa_compl->ss = NULL;
        }
        msa_reverse_compl(msa_compl);
    }

    /* allocate memory for computing scores */
    backgd_emissions = smalloc(backgd_nmods * sizeof(void*));
    for (i = 0; i < backgd_nmods; i++)
        backgd_emissions[i] = smalloc(msa->length * sizeof(double));
    feat_emissions = smalloc(feat_nmods * sizeof(void*));
    for (i = 0; i < feat_nmods; i++)
        feat_emissions[i] = smalloc(msa->length * sizeof(double));
    max_nmods = max(backgd_nmods, feat_nmods);
    dummy_emissions = smalloc(max_nmods * sizeof(void*));
    mem = smalloc(max_nmods * sizeof(void*));
    /* memory for forward algorithm -- each block must be as large as
       the largest feature */
    if (features != NULL) {
        for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) {
            GFF_Feature *f = lst_get_ptr(features->features, i);
            if (f->end - f->start + 1 > memblocksize)
                memblocksize = f->end - f->start + 1;
        }
    }
    else memblocksize = winsize;  /* -1 if base-by-base mode */

    if (memblocksize > 0)
        for (i = 0; i < max_nmods; i++)
            mem[i] = smalloc(memblocksize * sizeof(double));

    if (winsize != -1) {
        winscore_pos = smalloc(msa->length * sizeof(double));
        winscore_neg = smalloc(msa->length * sizeof(double));
        no_alignment = smalloc(msa->length * sizeof(int));

        for (i = 0; i < msa->length; i++) {
            winscore_pos[i] = winscore_neg[i] = NEGINFTY;
            if (refidx == 0)
                no_alignment[i] = FALSE;
            else
                no_alignment[i] = msa_missing_col(msa, refidx, i);
        }
    }

    /* the rest will be repeated for each strand */
    for (strand = 1; strand <= 2; strand++) {
        MSA *thismsa = strand == 1 ? msa : msa_compl;
        double *winscore = strand == 1 ? winscore_pos : winscore_neg;

        if (base_by_base && strand == 2) break; /* don't do second pass in
                                               base_by_base case */

        if (verbose) fprintf(stderr, "Processing %c strand ...\n",
                                 strand == 1 ? '+' : '-');

        /* set up dummy categories array, so that emissions are only
           computed where needed */
        thismsa->categories = smalloc(thismsa->length * sizeof(int));
        thismsa->ncats = 1;
        if (winsize != -1) {
            if (strand == 1)
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[i] ? 0 : 1;
            else
                for (i = 0; i < thismsa->length; i++)
                    thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 0 : 1;
        }
        else if (features != NULL) {
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0;
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                if (f->start <= 0 || f->end <= 0) {
                    fprintf(stderr, "WARNING: feature out of range ('");
                    gff_print_feat(stderr, f);
                    fprintf(stderr, "')\n");
                    continue;
                }

                if (strand == 1 && f->strand != '-')
                    for (j = f->start - 1; j < f->end; j++)
                        thismsa->categories[j] = 1;
                else if (strand == 2 && f->strand == '-')
                    for (j = thismsa->length - f->end;
                            j < thismsa->length - f->start + 1; j++)
                        thismsa->categories[j] = 1;
            }
        }
        else {                      /* base-by-base scores */
            for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1;
        }
        if (thismsa->ss != NULL) ss_update_categories(thismsa);

        /* compute emissions */
        for (i = 0; i < backgd_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1);
            tl_compute_log_likelihood(backgd_mods[i], thismsa,
                                      backgd_emissions[i], NULL, 1, NULL);
        }
        for (i = 0; i < feat_nmods; i++) {
            if (verbose)
                fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1);
            tl_compute_log_likelihood(feat_mods[i], thismsa,
                                      feat_emissions[i], NULL, 1, NULL);
        }

        /* now compute scores */
        if (winsize != -1) {        /* windows case */
            int winstart;
            if (verbose) fprintf(stderr, "Computing scores ...\n");

            for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) {
                int centeridx = winstart + winsize/2;

                if (strand == 2) centeridx = thismsa->length - centeridx - 1;

                if (no_alignment[centeridx]) continue;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][winstart]);
                winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions,
                                                  winsize, mem);

                if (winscore[centeridx] <= NEGINFTY) {
                    winscore[centeridx] = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][winstart]);
                winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions,
                                                   winsize, mem);

                if (winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY;
            }
        }
        else if (features != NULL) { /* features case */
            if (verbose) fprintf(stderr, "Computing scores ...\n");
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                int s, e;

                if ((strand == 1 && f->strand == '-') ||
                        (strand == 2 && f->strand != '-') ||
                        f->start <= 0 || f->end <= 0 || f->end - f->start < 0)
                    continue;

                /* effective coords */
                if (f->strand == '-') {
                    s = thismsa->length - f->end + 1;
                    e = thismsa->length - f->start + 1;
                }
                else {
                    s = f->start;
                    e = f->end;
                }

                f->score_is_null = 0;

                for (j = 0; j < feat_nmods; j++)
                    dummy_emissions[j] = &(feat_emissions[j][s-1]);
                f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score <= NEGINFTY) {
                    f->score = NEGINFTY;
                    continue;
                }

                for (j = 0; j < backgd_nmods; j++)
                    dummy_emissions[j] = &(backgd_emissions[j][s-1]);
                f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem);

                if (f->score < NEGINFTY) f->score = NEGINFTY;
            }
        }
    }

    if (verbose) fprintf(stderr, "Generating output ...\n");

    if (winsize != -1 && windowWig == FALSE) { /* standard windows output */
        for (i = 0, j = 0; i < msa->length; i++) {
            if (no_alignment[i] == FALSE)
                printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i],
                       winscore_neg[i]);
            if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++;
        }
    }
    else if (windowWig == TRUE) { /* windows with wig output */
        int last = NEGINFTY;
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) {
                    if (j > last + 1)
                        printf("fixedStep chrom=%s start=%d step=1\n",
                               refidx > 0 ? msa->names[refidx-1] : "alignment",
                               j + msa->idx_offset + 1);
                    printf("%.3f\n", winscore_pos[i]);
                    last = j;
                }
                j++;
            }
        }
    }
    else if (features != NULL) {  /* features output */
        /* return to coord frame of reference seq (also, replace offset) */
        if (refidx != 0)
            msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset);
        else if (msa->idx_offset != 0) {
            for (i = 0; i < lst_size(features->features); i++) {
                GFF_Feature *f = lst_get_ptr(features->features, i);
                f->start += msa->idx_offset;
                f->end += msa->idx_offset;
            }
        }

        if (bed_output)
            gff_print_bed(stdout, features, FALSE);
        else
            gff_print_set(stdout, features);
    }
    else {           /* base-by-base scores */
        /* in this case, we can just output the difference between the emissions */
        printf("fixedStep chrom=%s start=%d step=1\n",
               refidx > 0 ? msa->names[refidx-1] : "alignment",
               msa->idx_offset + 1);
        for (i = 0, j = 0; i < msa->length; i++) {
            if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) {
                printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]);
                j++;
            }
        }
    }

    if (verbose) fprintf(stderr, "\nDone.\n");

    return 0;
}
Beispiel #20
0
/*
 * Execute the "list" inspect command.
 *
 * Reads the required CMD_ARG_TYPE argument and the optional
 * CMD_ARG_VERBOSE flag from pCmd, then dispatches on the type:
 *   ARG_TYPE_ENV   - mdb_list_open_dbs(pCtxt)
 *   ARG_TYPE_PAGE  - reads CMD_ARG_PGNO and prints that single page
 *                    (pgno is passed as both range bounds)
 *   ARG_TYPE_SUBDB - reads the optional CMD_ARG_SUBDB name and prints
 *                    the sub-database
 * The verbose flag selects the *_v variant of the page/subdb call.
 *
 * Returns 0 on success, EINVAL when pCmd or pCtxt is NULL, otherwise the
 * first non-zero error reported by a helper.  Any other type value is a
 * no-op that returns 0.
 */
int
mdb_exec_list(
    PMDB_INSPECT_CMD pCmd,
    PMDB_INSPECT_CONTEXT pCtxt
    )
{
    int error = 0;
    ARG_TYPE type = ARG_TYPE_INVALID;
    int type_arg = ARG_TYPE_INVALID;
    int pgno = 0;
    int verbose = 0;
    const char *subdb = NULL;

    if(!pCmd || !pCtxt)
    {
        error = EINVAL;
        bail_on_error(error);
    }

    /*
     * get_arg_int is called elsewhere with an int * destination, so read
     * the value into a plain int and convert afterwards instead of handing
     * it a pointer to an ARG_TYPE object.
     * NOTE(review): assumes ARG_TYPE is an enum/integer type -- confirm.
     */
    error = get_arg_int(pCmd, CMD_ARG_TYPE, &type_arg);
    bail_on_error(error);
    type = (ARG_TYPE)type_arg;

    error = get_arg_int_opt(pCmd, CMD_ARG_VERBOSE, 0, &verbose);
    bail_on_error(error);

    switch(type)
    {
        case ARG_TYPE_ENV:
            error = mdb_list_open_dbs(pCtxt);
            bail_on_error(error);
        break;
        case ARG_TYPE_PAGE:
            error = get_arg_int(pCmd, CMD_ARG_PGNO, &pgno);
            bail_on_error(error);

            if(verbose)
            {
                error = mdb_page_get_v(
                            pCtxt->pDB,
                            pgno,
                            pgno,
                            mdb_print_page,
                            mdb_print_page_v);
                bail_on_error(error);
            }
            else
            {
                error = mdb_page_get(pCtxt->pDB, pgno, pgno, mdb_print_page);
                bail_on_error(error);
            }
        break;
        case ARG_TYPE_SUBDB:
            error = get_arg_str_opt(pCmd, CMD_ARG_SUBDB, NULL, &subdb);
            bail_on_error(error);
            if(verbose)
            {
                error = mdb_subdb_get_v(
                            pCtxt->pDB,
                            subdb,
                            mdb_print_subdb_v);
                bail_on_error(error);
            }
            else
            {
                error = mdb_subdb_get(
                            pCtxt->pDB,
                            subdb,
                            mdb_print_subdb);
                bail_on_error(error);
            }
        break;
        default:
            /* Any other type: nothing to list; error stays 0 as before. */
        break;
    }

cleanup:
    return error;

error:
    goto cleanup;
}
/*
 * Program entry point.
 *
 * Usage (positional arguments after the options): <gff-file> <msa-file>.
 *
 * Reads a GFF feature set and an alignment, maps the feature coordinates
 * into the alignment's coordinate frame, groups the features (by
 * "transcript_id" unless --groupby is given) and examines every group:
 *   - a group always fails if ref_seq_okay() rejects it (BAD_REF);
 *   - if any of the conservation options was given (check_alignment),
 *     start/stop codons, splice sites, frame shifts, nonsense mutations
 *     and the fraction of Ns are checked as well.
 * Features of groups that pass are printed to stdout unless --no-output;
 * features of failed groups go to the --discards file when one was given.
 * Optional human-readable log, machine log and stats files are written
 * when requested.
 *
 * Returns 0 on completion; fatal conditions are reported through die().
 */
int main(int argc, char *argv[]) {

  int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0,
    offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0,
    check_alignment = 0, splice_strict = 0;
  int ncons_tested, nkept, nconserved_exons;
  int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES];
  double Nfrac = 0.05;
  int c;                        /* getopt_long() returns int; storing it in
                                   a char makes the "!= -1" test below never
                                   fail where plain char is unsigned */
  MSA *msa;
  GFF_Set *gff;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  List *keepers, *problems = lst_new_ptr(10), 
    *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), 
    *discards=NULL, *intron_splice = lst_new_ptr(10);
  char *rseq_fname = NULL;
  FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL;
  cds_gap_type fshift_mode = FSHIFT_BAD;
  char *groupby = "transcript_id";
  msa_coord_map *map;
  int *countNs, *countCDSs;
  FILE *infile;
  char *msa_fname;

  struct option long_opts[] = {
    {"start", 0, 0, 's'},
    {"stop", 0, 0, 't'},
    {"splice", 0, 0, 'l'},
    {"nonsense", 0, 0, 'n'},
    {"fshift", 0, 0, 'f'},
    {"conserved", 0, 0, 'c'},
    {"N-limit", 1, 0, 'N'},
    {"clean-gaps", 0, 0, 'e'},
    {"indel-strict", 0, 0, 'I'},
    {"splice-strict", 0, 0, 'C'},
    {"groupby", 1, 0, 'g'},
    {"msa-format", 1, 0, 'i'},
    {"refseq", 1, 0, 'r'},
    {"offset5", 1, 0, 'o'},
    {"offset3", 1, 0, 'p'},
    {"no-output", 0, 0, 'x'},
    {"discards", 1, 0, 'd'},
    {"log", 1, 0, 'L'},
    {"machine-log", 1, 0, 'M'},
    {"stats", 1, 0, 'S'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* 'o' and 'p' are not in the short-option string, so --offset5 and
     --offset3 are reachable through their long names only */
  while ((c = getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", 
                          long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      check_alignment = check_start = 1;
      break;
    case 't':
      check_alignment = check_stop = 1;
      break;
    case 'l':
      check_alignment = check_splice = 1;
      break;
    case 'n':
      check_alignment = check_nonsense = 1;
      break;
    case 'f':
      check_alignment = 1;
      fshift_mode = FSHIFT_OK;
      break;
    case 'c':
      check_alignment = check_start = check_stop = check_splice = check_nonsense = 1;
      if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK;
      break;
    case 'N':
      Nfrac = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'e':
      check_alignment = 1;
      if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS;
      break;
    case 'I':
      check_alignment = 1;
      fshift_mode = NOVRLP_CLN_GAPS;
      indel_strict = 1;
      break;
    case 'C':
      check_alignment = check_splice = splice_strict = 1;
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n");
      break;
    case 'r':
      rseq_fname = optarg;
      break;
    case 'o':
      offset5 = get_arg_int(optarg);
      break;
    case 'p':
      offset3 = get_arg_int(optarg);
      break;
    case 'L':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'M':
      mlogf = phast_fopen(optarg, "w+");
      break;
    case 'S':
      statsf = phast_fopen(optarg, "w+");
      break;
    case 'd':
      discardf = phast_fopen(optarg, "w+");
      break;
    case 'x':
      no_output = 1;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: Bad argument.  Try the --help option.\n");
    }
  }

  /* two positional arguments are required: GFF file and alignment file */
  if (optind + 1 >= argc ) {
    die("ERROR:  Missing required arguments.  Try the --help option.\n");
  }
  
  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));
  msa_fname = argv[optind+1];
  infile = phast_fopen(msa_fname, "r");
  if (msa_format == UNKNOWN_FORMAT)
    msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) {
    msa = maf_read(infile, 
                   rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 
                   1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); 
  }
  else {
    msa = msa_new_from_file_define_format(infile,
                            msa_format, NULL); 
    if (msa->ss == NULL) 
      ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0);
  }
  if (!msa->ss->tuple_idx)
    die("ERROR: need ordered tuples\n");
  msa_remove_N_from_alph(msa);  /* for backward compatibility (old SS files) */

  if (msa->idx_offset != 0) {   /* avoids offset problem */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start -= msa->idx_offset;
      f->end -= msa->idx_offset;
    }
  }

  /* set up coordinate map; assume GFF is for sequence 1 */
  map = msa_build_coord_map(msa, 1);

  /* convert all features */
  for (i = 0; i < lst_size(gff->features); i++) {
    GFF_Feature *f = lst_get_ptr(gff->features, i);
    int newstart, newend;
 
    if (f->start < 0 || f->end < f->start)
      die("ERROR: bad feature in GFF (start=%d, end=%d).\n",
          f->start, f->end);

    newstart = msa_map_seq_to_msa(map, f->start);
    newend = msa_map_seq_to_msa(map, f->end);

    if (newstart < 0 || newend < newstart)
      die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n",
          f->start, f->end);

    f->start = newstart;
    f->end = newend;
  }

  gff_group(gff, groupby);      /* do this after coord conversion, or
                                   group coords and feature coords
                                   will be out of sync */

  keepers = lst_new_ptr(lst_size(gff->features));
  if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features));

  ncons_tested = nkept = nconserved_exons = 0;
  for (i = 0; i < NTYPES; i++) nconsid[i] = 0;
  for (i = 0; i < NTYPES; i++) nfail[i] = 0;
  for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0;  

  /* per-sequence counters, reset for every group that is examined */
  countNs = smalloc(msa->nseqs * sizeof(int));
  countCDSs = smalloc(msa->nseqs * sizeof(int));

  for (i = 0; i < lst_size(gff->groups); i++) {
    GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i);
    List *gfeatures = group->features;
    GFF_Feature *feat;
    status_type status = OKAY;
    cds_gap_type gt = FSHIFT_BAD;
    problems_clear(problems);

    /* make sure have frame info for CDSs */
    for (j = 0; j < lst_size(gfeatures); j++) {
      feat = lst_get_ptr(gfeatures, j);
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && 
          feat->frame == GFF_NULL_FRAME)
        die("ERROR: Missing frame info for CDS.\n");
    }

    /* First, exclude stop codons from cds's, if necessary (simplifies
       the detection of nonsense mutations). */
    exclude_stops(group, starts_adjusted, ends_adjusted);

    /* In all cases, discard any group for which the reference sequence
       doesn't have valid splice sites or start/stop codons, or has a
       premature stop codon */
    if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict,
                      problems)) {
      status = BAD_REF;
      nfail[BAD_REF]++;
    }
    else
      /* Everything else counts as a potentially valid group */
      ncons_tested++;

    if (status == OKAY && check_alignment) {      
                                /* only bother with below if
                                   interested in cross-species
                                   conservation */

      /* Check first to make sure there's alignment across species in
         the cds; if not, there's no need to look at individual
         features. */
      for (j = 0; j < lst_size(gfeatures); j++) { 
        feat = lst_get_ptr(gfeatures, j);
        if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) &&
            is_incomplete_alignment(feat, msa)) {
          status = NO_ALN;
          nfail[NO_ALN]++;
          problem_add(problems, feat, NO_ALN, -1, -1);
          break;
        }
      }

      if (status == OKAY) {     /* we have alignment and agreement
                                   with the ref seq; now check feature
                                   by feature  */

        lst_clear(intron_splice);
        for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0;

        for (j = 0; j < lst_size(gfeatures); j++) {
          feat = lst_get_ptr(gfeatures, j);

          if (feat->end - 1 >= msa->length) 
            die("ERROR: feature extends beyond alignment (%d >= %d).\n",
                feat->end - 1, msa->length);
        
          if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) {

            nconsid[BAD_START]++;

            if (!is_conserved_start(feat, msa)) {
              status = BAD_START;
              problem_add(problems, feat, BAD_START, -1, -1);
            }
          }

          else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) {

            nconsid[BAD_STOP]++;

            if (!is_conserved_stop(feat, msa)) {
              status = BAD_STOP;
              problem_add(problems, feat, BAD_STOP, -1, -1);
            }
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5)) {

            nconsid[BAD_5_SPLICE]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE;
              problem_add(problems, feat, BAD_5_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5_UTR)) {

            nconsid[BAD_5_SPLICE_UTR]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE_UTR;
              problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {

            nconsid[BAD_3_SPLICE]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE;
              problem_add(problems, feat, BAD_3_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          /* 3' UTR splice sites: this branch must test SPLICE_3_UTR; a
             second test for SPLICE_3 can never be reached, so
             BAD_3_SPLICE_UTR would never be counted.
             NOTE(review): assumes SPLICE_3_UTR is defined alongside
             SPLICE_5_UTR -- confirm. */
          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_3_UTR)) {

            nconsid[BAD_3_SPLICE_UTR]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE_UTR;
              problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
 
            if (fshift_mode > FSHIFT_BAD 
                && (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) {
              if (status == OKAY || status == NONSENSE) status = FSHIFT;
            }

            if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) {
              if (status == OKAY) status = NONSENSE;
            }

            if (Nfrac < 1) 
              get_N_counts(countNs, countCDSs, feat, msa);
          }
        } /* end loop through features in group */

        /* still have to make sure splice sites are paired correctly
           (GT-AG, GC-AG, AT-AC) */
        if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 &&
            !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) 
          status = BAD_INTRON;

        /* also check fraction of Ns */
        if (Nfrac < 1) {
          enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY;
          for (j = 0; j < msa->nseqs; j++) {
            if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL;
            if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN;
          }
          if (Nstatus == MY_FAIL) {
            problem_add(problems, NULL, TOO_MANY_Ns, -1, -1);
            if (status == OKAY) status = TOO_MANY_Ns;
          }
          else if (Nstatus == MY_WARN) 
            problem_add(problems, NULL, WARN_Ns, -1, -1);
        }

        /* if collecting stats, record counts for failures */
        if (statsf != NULL) {
          if (status != OKAY) {
            for (j = 0; j < lst_size(problems); j++) {
              struct Problem *problem = lst_get_ptr(problems, j);
              status_type ftype = problem->status;
              if ((ftype == FSHIFT || ftype == NONSENSE) && 
                  status != FSHIFT && status != NONSENSE)
                continue;       /* don't count secondary frame shifts
                                   and nonsense mutations */ 

              if (ftype == BAD_INTRON && j % 2 == 0)
                continue;       /* only count one of every pair of these */

              nfail[ftype]++;
            }
          }

          /* also keep track of the total number of "conserved exons", and
             the number having each kind of gap */
          if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) {
            nconserved_exons++;
            nce_gap_type[gt]++;     /* number of conserved exons having
                                       given type of gaps */
          }
        }
      } /* end if (status == OKAY) [checks for conserved features] */
    } /* end if (status == OKAY && check_alignment) [all cross-species
         checks] */

    /* now we have looked at the whole group; we just need to do some
       final accounting and logging */

    if (status == OKAY) {
      nkept++;
      if (!no_output) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++)
          lst_push_ptr(keepers, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL && lst_size(problems) > 0) /* warnings only */
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL) {
        /* no problem, need to add an okay status to log */
        problem_add(problems, NULL, OKAY, -1, -1);
        write_machine_log(mlogf, group, problems, map); /* may include
                                                           warnings */
      }
    }
    else {
      if (discardf != NULL) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++) 
          lst_push_ptr(discards, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL) 
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL)
        write_machine_log(mlogf, group, problems, map);
    }
  } /* end loop over groups */

  /* write main output and discards */
  if (!no_output || discardf != NULL) {
    /* first map features back to coord frame of reference seq. */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset;
      f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset;
    }

    /* the same GFF_Set is reused for printing: its feature list is
       swapped for the list of kept (then discarded) features */
    if (!no_output) {
      gff->features = keepers;
      gff_print_set(stdout, gff);
    }

    if (discardf != NULL) {
      gff->features = discards;
      gff_print_set(discardf, gff);
    }
  }


  /* dump counts to stats file */
  if (statsf != NULL) {
    fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", 
            "total", "nbad_ref", "nconsid", "nkept", "nno_aln", 
            "nbad_starts", "(out of)", "nbad_stops", "(out of)", 
            "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", 
            "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", 
            "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", 
            "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok");
    fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", 
            nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, 
            nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], 
            nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], 
            nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE],
            nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR],
            nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], 
            nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], 
            nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], 
            nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], 
            nce_gap_type[FSHIFT_OK]);
    fprintf(statsf, "%s", STATS_DESCRIPTION);
  }

  if (logf != NULL) phast_fclose(logf);
  if (mlogf != NULL) phast_fclose(mlogf);
  if (statsf != NULL) phast_fclose(statsf);
  if (discardf != NULL) phast_fclose(discardf);

  return 0;
}
Beispiel #22
0
int main(int argc, char* argv[]) {
  char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL;
  String *refseq = NULL, *currRefseq;
  int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1;
  char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL;
  List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL;
  MafBlock *block;
  FILE *mfile, *outfile=NULL, *masked_file=NULL;
  int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0;
  int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE;
  int lastStart = -1, gffSearchIdx=0;
  GFF_Set *gff = NULL, *gffSub;
  GFF_Feature *feat;
  CategoryMap *cm = NULL;
  int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0;
  List *outfileList=NULL;
  Hashtable *outfileHash=NULL;//, *specNameHash=NULL;
  msa_format_type output_format = MAF;
  MSA *msa = NULL;//, **catMsa;
  char *mask_features_spec_arg=NULL;
  List *mask_features_spec=NULL;
  

  struct option long_opts[] = {
    {"start", 1, 0, 's'},
    {"end", 1, 0, 'e'},
    {"seqs", 1, 0, 'l'},
    {"exclude", 0, 0, 'x'},
    {"order", 1, 0, 'O'},
    {"split", 1, 0, 'S'},
    {"out-root", 1, 0, 'r'},
    {"out-root-digits", 1, 0, 'd'},
    {"no-refseq", 0, 0, 'n'},
    {"features", 1, 0, 'g'},
    {"by-category", 0, 0, 'L'},
    {"do-cats", 1, 0, 'C'},
    {"catmap", 1, 0, 'c'},
    {"by-group", 1, 0, 'P'},
    {"mask-bases", 1, 0, 'b'},
    {"masked-file", 1, 0, 'm'},
    {"strip-i-lines", 0, 0, 'I'},
    {"strip-e-lines", 0, 0, 'E'},
    {"mask-features", 1, 0, 'M'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };


  while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      startcol = get_arg_int(optarg);
      break;
    case 'e':
      endcol = get_arg_int(optarg);
      break;
    case 'l':
      seqlist_str = get_arg_list(optarg);
      break;
    case 'O':
      order_list = get_arg_list(optarg);
      break;
    case 'x':
      include = FALSE;
      break;
    case 'S':
      splitInterval = atoi(optarg);
      break;
    case 'r':
      out_root_fname = optarg;
      break;
    case 'd':
      sprintf(splitFormat, "%%s%%.%si.%%s", optarg);
      break;
    case 'n':
      useRefseq = FALSE;
      break;
    case 'g':
      gff = gff_read_set(phast_fopen(optarg, "r"));
      gff_sort(gff);
      stripILines=TRUE;
      stripELines=TRUE;
      break;
    case 'c':
      cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      cats_to_do_str = get_arg_list(optarg);
      break;
    case 'L':
      by_category = TRUE;
      break;
    case 'P':
      group_tag = optarg;
      break;
    case 'b':
      base_mask_cutoff = atoi(optarg);
      break;
    case 'm':
      masked_fn = optarg;
      break;
    case 'M':
      mask_features_spec_arg = optarg;
      break;
    case 'E':
      stripELines=TRUE;
      break;
    case 'I':
      stripILines=TRUE;
      break;
    case 'o':
      output_format = msa_str_to_format(optarg);
      if (output_format == UNKNOWN_FORMAT) 
	die("ERROR: bad output format.  Try \"maf_parse -h\" for help.\n");
      if (output_format != MAF)
	die("Sorry, only MAF format output has been implemented right now.\n");
      break;
    case 'p':
      pretty_print = TRUE;
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Bad argument.  Try 'maf_parse -h' for help.\n");
    }
  }

  if (optind >= argc) 
    die("Missing alignment filename.  Try 'maf_parse -h' for help.\n");
  else if (optind == argc - 1) 
    maf_fname = argv[optind];
  else 
    die("ERROR: Too many arguments.  Try 'maf_parse -h' for help.\n");
  
  set_seed(-1);

  if (startcol < 1 || (endcol != -1 && endcol < startcol))
    die("ERROR: must have 1 <= start <= end <= [msa_length]\n");

  if ((group_tag != NULL || by_category) && gff == NULL)
    die("ERROR: --by-category and --by-group require --features.  Try \"maf_parse -h\""
	" for help.\n");

  if (group_tag != NULL && by_category) 
    die("ERROR: --by-category and --by-group cannot be used together.  Try \"maf_parse -h\""
	" for help.\n");
  
  if (splitInterval != -1 && gff != NULL)
    die("ERROR: can't use --split and --features together.  Try \"maf_parse -h\""
	"for help\n");

  if (group_tag != NULL || by_category) {
    outfileList = lst_new_ptr(10);
    outfileHash = hsh_new(100);
  }

  if (gff != NULL && cm == NULL) 
    cm = cm_new_from_features(gff);

  if (cats_to_do_str != NULL) {
    cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE);
    if (gff != NULL) 
      gff_filter_by_type(gff, cats_to_do, 0, NULL);
  }

  if (masked_fn != NULL) {
    if (base_mask_cutoff == -1)
      die("ERROR: need to use --mask-bases with --masked-file");
    masked_file = phast_fopen(masked_fn, "w");
  }

  if (mask_features_spec_arg != NULL) {
    if (gff==NULL)
      die("ERROR: need --features with --mask-features");
    mask_features_spec = lst_new_ptr(10);
    str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec);
    for (i=0; i < lst_size(mask_features_spec); i++) {
      fprintf(stderr, "masking species %s within features\n", 
	      ((String*)lst_get_ptr(mask_features_spec, i))->chars);
    }
  }

  /* Check to see if --do-cats names a feature which is length 1. 
     If so, set output_format to SS ? or FASTA ? */
  
  mfile = phast_fopen(maf_fname, "r");
  block = mafBlock_read_next(mfile, NULL, NULL);

  if (splitInterval == -1 && gff==NULL) {
    //TODO: do we want to copy header from original MAF in this case?
    mafBlock_open_outfile(NULL, argc, argv);
  }

  while (block != NULL) {
    if (order_list != NULL)
      mafBlock_reorder(block, order_list);
    if (seqlist_str != NULL)
      mafBlock_subSpec(block, seqlist_str, include);
    if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) 
      goto get_next_block;
    if (stripILines)
      mafBlock_strip_iLines(block);
    if (stripELines)
      mafBlock_strip_eLines(block);
    if (base_mask_cutoff != -1)
      mafBlock_mask_bases(block, base_mask_cutoff, masked_file);
    //TODO: still need to implement (either here or elsewhere)
    //    if (indel_mask_cutoff != -1) 
    //      mafBlock_mask_indels(block, indel_mask_cutoff, mfile);

    if (useRefseq) {  //get refseq and check that it is consistent in MAF file
      currRefseq = mafBlock_get_refSpec(block);
      if (refseq == NULL) 
	refseq = str_new_charstr(currRefseq->chars);
      else if (str_compare(refseq, currRefseq)!=0)
	die("Error: refseq not consistent in MAF (got %s, %s)\n",
	    refseq->chars, currRefseq->chars);
    }
    
    if (startcol != 1 || endcol != -1) 
      if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx))
	goto get_next_block;

    currSize = mafBlock_get_size(block, refseq);
    if (useRefseq) {
      currStart = mafBlock_get_start(block, refseq);
      if (currStart < lastIdx && sortWarned == 0) {
	fprintf(stderr, "Warning: input MAF not sorted with respect to refseq.  Output files may not represent contiguous alignments. (%i, %i)\n", lastIdx, currStart);
	sortWarned = 1;
      }
    }
    else currStart = lastIdx;

    if (currStart < lastStart) gffSearchIdx = 0;
    lastStart = currStart;
    
    lastIdx = currStart + currSize;

    //split by length
    if (splitInterval != -1) {
      if (currLen == -1 || currLen+currSize > splitInterval) {
	sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx,
		msa_suffix_for_format(output_format));
	if (output_format == MAF) {
	  if (outfile != NULL) mafBlock_close_outfile(outfile);
	  outfile = mafBlock_open_outfile(outfilename, argc, argv);
	}
	else if (output_format != MAF && msa != NULL) {
	  //	  msa_print_to_filename(msa, outfilename, output_format, pretty_print);
	  msa_free(msa);
	  msa = NULL;
	}
	currLen = 0;
      }
      currLen += currSize;
    }
    else outfile = stdout;
    if (gff != NULL && mask_features_spec != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx,
					       &gffSearchIdx);
      if (gffSub != NULL) {
	mafBlock_mask_region(block, gffSub, mask_features_spec);
	gff_free_set(gffSub);
      }
      mafBlock_print(outfile, block, pretty_print);


    } else if (gff != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, 
					       &gffSearchIdx);
      if (gffSub != NULL) {
	if (by_category) gff_group_by_feature(gffSub);
	else if (group_tag != NULL) gff_group(gffSub, group_tag);
	gff_sort(gffSub);
	gff_flatten_within_groups(gffSub);
	for (i=0; i<lst_size(gffSub->features); i++) {
	  feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i);
	  MafBlock *subBlock = mafBlock_copy(block);
	  mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0);
	  if (by_category) 
	    outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname,
				  argc, argv);
	  else if (group_tag != NULL) 
	    outfile = get_outfile(outfileList, outfileHash, 
				  gff_group_name(gffSub, feat), out_root_fname,
				  argc, argv);
	  else outfile = stdout;
	  if (output_format == MAF)
	    mafBlock_print(outfile, subBlock, pretty_print);
	  //	  else msa_add_mafBlock(msa);
	  mafBlock_free(subBlock);
	}
	gff_free_set(gffSub);
      }
    }
    else {
      if (output_format == MAF) 
	mafBlock_print(outfile, block, pretty_print);
      //      else msa = msa_add_mafBlock(mafBlock, msa, );
    }
    
  get_next_block:
    mafBlock_free(block);
    block = mafBlock_read_next(mfile, NULL, NULL);
  }

  if (masked_file != NULL) fclose(masked_file);

  if (output_format == MAF) {
    if (by_category || group_tag != NULL)
      close_outfiles(outfileList, outfileHash);
    else if (outfile!=NULL) mafBlock_close_outfile(outfile);
  } else {
    msa_print(stdout, msa, output_format, pretty_print);
    msa_free(msa);
  }
  if (gff != NULL) gff_free_set(gff);
  phast_fclose(mfile);
  return 0;
}
/* Entry point: read a feature set (-f) and an alignment (-m), transform the
   feature coordinates through the alignment, and print the resulting
   feature set to stdout.

   Options handled here:
     -m <file>   alignment file (required)
     -f <file>   feature file (required)
     -s <int>    source reference index passed to msa_map_gff_coords
                 (default -1)
     -d <int>    destination reference index (default 0)
     -i <fmt>    alignment format; guessed from file content when omitted
     -p <int>    positive offset
     -n <int>    negative offset (stored as -1 * value)
     -h          print usage and exit

   Returns 0 on success; usage errors exit with status 1. */
int main(int argc, char* argv[]) {
  FILE* F;
  MSA *msa;
  msa_format_type format = UNKNOWN_FORMAT;
  int src_ref = -1, dest_ref = 0, offset = 0;
  char *msa_fname = NULL, *feat_fname = NULL;
  GFF_Set *gff;
  /* getopt() returns int.  Storing the result in a plain char breaks the
     comparison against -1 on platforms where char is unsigned, so the
     option loop would never terminate there. */
  int c;

  while ((c = getopt(argc, argv, "hm:f:s:d:i:p:n:")) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 'f':
      feat_fname = optarg;
      break;
    case 's':
      src_ref = get_arg_int(optarg);
      break;
    case 'd':
      dest_ref = get_arg_int(optarg);
      break;
    case 'i':
      format = msa_str_to_format(optarg);
      if (format == UNKNOWN_FORMAT) die("ERROR: bad alignment format.\n");
      break;
    case 'p':
      offset = get_arg_int(optarg);
      break;
    case 'n':
      /* -n is the mirror image of -p: same variable, opposite sign */
      offset = -1 * get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(1);
    case '?':
      print_usage();
      exit(1);
    }
  }

  /* both input files are mandatory */
  if (msa_fname == NULL || feat_fname == NULL) {
    print_usage();
    exit(1);
  }

  set_seed(-1);

  F = phast_fopen(feat_fname, "r");
  if ((gff = gff_read_set(F)) == NULL) { 
    die("ERROR: error reading %s.\n", feat_fname);
  }
  phast_fclose(F);

  /* handle case of local alignment specially -- avoid representing
     the alignment explicitly */
  F = phast_fopen(msa_fname, "r");
  if (format == UNKNOWN_FORMAT)
    format = msa_format_for_content(F, 1);
  if (format == LAV) {
    LocalPwAlignment *lpwa = NULL;

    fprintf(stderr, "WARNING: in local alignment mode, coordinates may only be mapped from query (reference) sequence to target (aligned) sequence.\n"); 

    lpwa = la_read_lav(F, 0);
    la_gff_transform(lpwa, gff);
    /* NOTE(review): F is not closed and lpwa is not released on this
       path; harmless at process exit, but confirm whether la_read_lav
       takes ownership of F before adding a phast_fclose here. */
  }

  else {                        /* normal alignment */
    msa = msa_new_from_file_define_format(F, format, NULL);
    phast_fclose(F);

    msa_map_gff_coords(msa, gff, src_ref, dest_ref, offset);
    msa_free(msa);
  }

  gff_print_set(stdout, gff);

  gff_free_set(gff);

  return 0;
}
Beispiel #24
0
int main(int argc, char *argv[]) {
  TreeNode *tree = NULL;
  TreeModel *backgd_mod = NULL;
  int i, j,
    size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, 
    nrestarts = 10, npseudocounts = 5, nsamples = -1, 
    nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0,
    nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, 
    suppress_stdout = 0;
  List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl;
  List *msas, *motifs;
  SeqSet *seqset = NULL;
  PooledMSA *pmsa = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  Vector *backgd_mnmod = NULL;
  Hashtable *hash=NULL;
  String *output_prefix = str_new_charstr("phastm.");
  double *has_motif = NULL;
  double prior = PRIOR;
  char c;
  GFF_Set *bedfeats = NULL;

  while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) {
    switch (c) {
    case 't':
      tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) 
	die("ERROR: bad input format.\n");
      break;
    case 'b':
      backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 's':
      break;
    case 'k':
      size = get_arg_int(optarg);
      break;
    case 'm':
      meme_mode = 1;
      break;
    case 'd':
      pos_examples = get_arg_list(optarg);
      break;
    case 'p':
      profile_mode = 1;
      break;
    case 'n':
      nrestarts = get_arg_int(optarg);
      break;
    case 'I':
      init_list = get_arg_list(optarg);
      break;
    case 'P':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n");
      nmostprevalent = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nmostprevalent > 0 && tuple_size > 0))
	die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", 
	    nmostprevalent, tuple_size);
      lst_free(tmpl);
      break;
    case 'R':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n");
      nsamples = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nsamples > 0 && tuple_size > 0))
	die("ERROR nsamples=%i tuple_sizse=%i\n", nsamples, tuple_size);
      lst_free(tmpl);
      break;
    case 'c':
      npseudocounts = get_arg_int(optarg);
      break;
    case 'w':
      nbest = get_arg_int(optarg);
      break;
    case 'S':
      sample_parms = 1;
      break;
    case 'B':
      nmotifs = get_arg_int(optarg);
      break;
    case 'o': 
      str_free(output_prefix);
      output_prefix = str_new_charstr(optarg);
      str_append_char(output_prefix, '.'); 
      break;
    case 'H': 
      do_html = 1;
      break;
    case 'D': 
      do_bed = 1;
      break;
    case 'x':
      suppress_stdout = 1;
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("ERROR: List of alignment files required.  Try '%s -h'.\n", argv[0]);

  if ((nsamples > 0 && nmostprevalent > 0) || 
      (nsamples > 0 && init_list != NULL) || 
      (nmostprevalent > 0 && init_list != NULL)) 
    die("ERROR: -I, -P, and -R are mutually exclusive.");

  set_seed(-1);
    
  msa_name_list = get_arg_list(argv[optind]);

  if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree;

  if (tree == NULL && !meme_mode && !profile_mode) 
    die("ERROR: Must specify -t, -m, or -p.\n");

  if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && 
      !sample_parms)
    nrestarts = 1;

  if (pos_examples != NULL) {
    hash = hsh_new(lst_size(pos_examples));
    for (i = 0; i < lst_size(pos_examples); i++)
      hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1);
    has_motif = smalloc(lst_size(msa_name_list) * sizeof(double));
  }

  /* open all MSAs */
  msas = lst_new_ptr(lst_size(msa_name_list));
  fprintf(stderr, "Reading alignment(s) ...\n");
  for (i = 0, j = 0; i < lst_size(msa_name_list); i++) {
    String *name = lst_get_ptr(msa_name_list, i);
    FILE *mfile = phast_fopen(name->chars, "r");
    msa_format_type temp_format;
    MSA *msa;
    if (msa_format == UNKNOWN_FORMAT)
      temp_format = msa_format_for_content(mfile, 1);
    else temp_format = msa_format;
    msa = msa_new_from_file_define_format(mfile, temp_format, NULL);
    phast_fclose(mfile);
    if (nseqs == -1) nseqs = msa->nseqs;
    if (!meme_mode &&
        (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 ||
        msa->nseqs != nseqs)) {
      fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars);
      msa_free(msa);
      continue;
    }

    if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
    msa_remove_N_from_alph(msa); /* Ns can be a problem */
    lst_push_ptr(msas, msa);
    if (has_motif != NULL) {
      int k, hm = (hsh_get_int(hash, name->chars) == 1);
      if (meme_mode) {          /* here need to record at individ seq level */
        has_motif = srealloc(has_motif, 
                             (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */
        for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm;
      }
      else has_motif[j++] = hm;
    }
  }
  if (!meme_mode) {
    fprintf(stderr, "Extracting and pooling sufficient statistics ...\n");
    pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0);
    msa_remove_N_from_alph(pmsa->pooled_msa);
  }

  /* obtain individual sequences, if necessary */
  if (nmostprevalent > 0 || nsamples > 0 || meme_mode) {
    if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n");
    else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n");
    seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size);
                                /* for now, assume 1st seq is reference */
    msa_remove_N_from_alph(seqset->set); 
  }

  if (nmostprevalent > 0) {
    fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", 
            nmostprevalent, tuple_size);
    init_list = lst_new_ptr(nmostprevalent);
    mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent);
  }
  else if (nsamples > 0) {
    fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size);
    init_list = lst_new_ptr(nsamples);
    mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples);
  }

  /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */
  if (meme_mode && backgd_mod != NULL && has_motif == NULL)
    backgd_mnmod = backgd_mod->backgd_freqs;

  /* estimate background model, if necessary */
  else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) {
    fprintf(stderr, "Fitting background model%s ...\n", 
            has_motif == NULL ? "" : " (for use in initialization)");
                                /* if discriminative, be clear
                                   backgd isn't really part of the
                                   estimation procedure */
    if (meme_mode) {
      backgd_mnmod = vec_new(strlen(seqset->set->alphabet));
      mtf_estim_backgd_mn(seqset, backgd_mnmod);
    }
    else {
      backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, 
                          pmsa->pooled_msa->alphabet, 1, 0, NULL, -1);
      tm_fit(backgd_mod, pmsa->pooled_msa, 
             tm_params_init(backgd_mod, .1, 5, 0), 
             -1, OPT_MED_PREC, NULL, 0, NULL);
    }
  }

  /* select subset of init strings, if necessary */
  if (nbest > 0 && init_list != NULL) {
    fprintf(stderr, "Winnowing candidate start strings ...\n");
    tmpl = lst_new_ptr(nbest);
    mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa,
                      init_list, nbest, tmpl, !meme_mode, size, tree,
                      meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                      has_motif);
    lst_free(init_list);
    init_list = tmpl;
  }

  /* Now find motifs */
  motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, 
                    !meme_mode, size, nmotifs, tree,
                    meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, 
                    has_motif, prior, nrestarts, init_list, sample_parms, 
                    npseudocounts);
     
  fprintf(stderr, "\n\n");
  if (do_bed)
    bedfeats = gff_new_set_init("phast_motif", "0.1b");

  /* generate output */
  for (i = 0; i < lst_size(motifs); i++) {
    Motif *m = lst_get_ptr(motifs, i);

    if (!suppress_stdout) {
      if (lst_size(motifs) > 1) 
        printf("\n**********\nMOTIF #%d\n**********\n\n", i+1);

      mtf_print(stdout, m);
    }

    if (do_html) {
      String *fname = str_dup(output_prefix);
      str_append_int(fname, i+1);
      str_append_charstr(fname, ".html");
      mtf_print_html(phast_fopen(fname->chars, "w+"), m);
      str_free(fname);
    }

    if (do_bed) 
      mtf_add_features(m, bedfeats);
  }
  if (do_html) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "index.html");
    mtf_print_summary_html(phast_fopen(fname->chars, "w+"), 
                           motifs, output_prefix);
    str_free(fname);
  }
  if (do_bed) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "bed");
    gff_print_bed(phast_fopen(fname->chars, "w+"),
                  bedfeats, FALSE);
    str_free(fname);
  }

  return 0;
}