void DynamicProgram<T>::min(Parts& parts, vector2DMat& scores, vector4DMat& Ix, vector4DMat& Iy, vector4DMat& Ik, vector2DMat& rootv, vector2DMat& rooti) {

	// initialize the outputs, preallocate vectors to make them thread safe
	// TODO: better initialisation of Ix, Iy, Ik
	const int nscales = scores.size();
	const int ncomponents = parts.ncomponents();
	Ix.resize(nscales, vector3DMat(ncomponents));
	Iy.resize(nscales, vector3DMat(ncomponents));
	Ik.resize(nscales, vector3DMat(ncomponents));
	rootv.resize(nscales, vectorMat(ncomponents));
	rooti.resize(nscales, vectorMat(ncomponents));

	// for each scale, and each component, update the scores through message passing
	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	for (int nc = 0; nc < nscales*ncomponents; ++nc) {

		// calculate the inner loop variables from the dual variables
		const int n = floor(nc / ncomponents);
		const int c = nc % ncomponents;

		// allocate the inner loop variables
		Ix[n][c].resize(parts.nparts(c));
		Iy[n][c].resize(parts.nparts(c));
		Ik[n][c].resize(parts.nparts(c));
		vectorMat ncscores(scores[n].size());

		for (int p = parts.nparts(c)-1; p > 0; --p) {

			// get the component part (which may have multiple mixtures associated with it)
			ComponentPart cpart = parts.component(c, p);
			int nmixtures       = cpart.nmixtures();
			Ix[n][c][p].resize(nmixtures);
			Iy[n][c][p].resize(nmixtures);
			Ik[n][c][p].resize(nmixtures);

			// intermediate results for mixtures of this part
			vectorMat scoresp;
			vectorMat Ixp;
			vectorMat Iyp;

			for (int m = 0; m < nmixtures; ++m) {

				// raw score outputs
				Mat score_in, score_dt, Ix_dt, Iy_dt;
				if (cpart.score(ncscores, m).empty()) {
					score_in = cpart.score(scores[n], m);
				} else {
					score_in = cpart.score(ncscores, m);
				}

				// get the anchor position
				Point anchor = cpart.anchor(m);

				// compute the distance transform
				distanceTransform(score_in, cpart.defw(m), anchor, score_dt, Ix_dt, Iy_dt);
				scoresp.push_back(score_dt);
				Ixp.push_back(Ix_dt);
				Iyp.push_back(Iy_dt);
				//cout << score_dt(Range(0,10), Range(0,10)) << endl;

				// calculate a valid region of interest for the scores
				/*
				int X = score_in.cols;
				int Y = score_in.rows;
				int xmin = std::max(std::min(anchor.x, X), 0);
				int ymin = std::max(std::min(anchor.y, Y), 0);
				int xmax = std::min(std::max(anchor.x+X, 0), X);
				int ymax = std::min(std::max(anchor.y+Y, 0), Y);
				int xoff = std::max(-anchor.x,    0);
				int yoff = std::max(-anchor.y,    0);

				// shift the score by the Part's offset from its parent
				Mat scorem = -numeric_limits<T>::infinity() * Mat::ones(score_dt.size(), score_dt.type());
				Mat Ixm    = Mat::zeros(Ix_dt.size(), Ix_dt.type());
				Mat Iym    = Mat::zeros(Iy_dt.size(), Iy_dt.type());
				if (xoff < X && yoff < Y && (ymax - ymin) > 0 && (xmax - xmin) > 0) {
					Mat score_dt_range 	= score_dt(Range(ymin, ymax),         Range(xmin, xmax));
					Mat score_range    	= scorem(Range(yoff, yoff+ymax-ymin), Range(xoff, xoff+xmax-xmin));
					Mat Ix_dt_range 	= Ix_dt(Range(ymin, ymax),            Range(xmin, xmax));
					Mat Ixm_range 		= Ixm(Range(yoff, yoff+ymax-ymin),    Range(xoff, xoff+xmax-xmin));
					Mat Iy_dt_range 	= Iy_dt(Range(ymin, ymax),            Range(xmin, xmax));
					Mat Iym_range 		= Iym(Range(yoff, yoff+ymax-ymin),    Range(xoff, xoff+xmax-xmin));
					score_dt_range.copyTo(score_range);
					Ix_dt_range.copyTo(Ixm_range);
					Iy_dt_range.copyTo(Iym_range);
				}

				// push the scores onto the intermediate vectors
				scoresp.push_back(scorem);
				Ixp.push_back(Ixm);
				Iyp.push_back(Iym);
				*/
			}

			nmixtures = cpart.parent().nmixtures();
			for (int m = 0; m < nmixtures; ++m) {
				vectorMat weighted;
				// weight each of the child scores
				// TODO: More elegant way of handling bias
				for (int mm = 0; mm < cpart.nmixtures(); ++mm) {
					weighted.push_back(scoresp[mm] + cpart.bias(mm)[m]);
				}
				// compute the max over the mixtures
				Mat maxv, maxi;
				reduceMax(weighted, maxv, maxi);

				// choose the best indices
				Mat Ixm, Iym;
				reducePickIndex<int>(Ixp, maxi, Ixm);
				reducePickIndex<int>(Iyp, maxi, Iym);
				Ix[n][c][p][m] = Ixm;
				Iy[n][c][p][m] = Iym;
				Ik[n][c][p][m] = maxi;

				// update the parent's score
				ComponentPart parent = cpart.parent();
				if (parent.score(ncscores,m).empty()) parent.score(scores[n],m).copyTo(parent.score(ncscores,m));
				parent.score(ncscores,m) += maxv;
				//cout << parent.score(ncscores,m)(Range(0,10),Range(0,10)) << endl << endl;
				if (parent.self() == 0) {
					ComponentPart root = parts.component(c);
					//cout << root.score(ncscores,m)(Range(0,10),Range(0,10)) << endl << endl;
				}
				//cout <<parent.self() << endl;
			}
		}
		// add bias to the root score and find the best mixture
		ComponentPart root = parts.component(c);
		//cout << root.self() << endl;
		Mat rncscore = root.score(ncscores,0);
		//cout << rncscore(Range(1,10),Range(1,10)) << endl;
		T bias = root.bias(0)[0];
		vectorMat weighted;
		// weight each of the child scores
		for (int m = 0; m < root.nmixtures(); ++m) {
			weighted.push_back(root.score(ncscores,m) + bias);
		}
		reduceMax(weighted, rootv[n][c], rooti[n][c]);
	}
}
void DynamicProgram<T>::argmin(Parts& parts, const vector2DMat& rootv, const vector2DMat& rooti, const vectorf scales, const vector4DMat& Ix, const vector4DMat& Iy, const vector4DMat& Ik, vectorCandidate& candidates) {

	// for each scale, and each component, traverse back down the tree to retrieve the part positions
	int nscales = scales.size();
	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	for (int n = 0; n < nscales; ++n) {
		T scale = scales[n];
		for (int c = 0; c < parts.ncomponents(); ++c) {

			// get the scores and indices for this tree of parts
			const vector2DMat& Iknc = Ik[n][c];
			const vector2DMat& Ixnc = Ix[n][c];
			const vector2DMat& Iync = Iy[n][c];
			int nparts = parts.nparts(c);

			// threshold the root score
			Mat over_thresh = rootv[n][c] > thresh_;
			Mat rootmix     = rooti[n][c];
			vectorPoint inds;
			find(over_thresh, inds);

			for (int i = 0; i < inds.size(); ++i) {
				Candidate candidate;
				vectori     xv(nparts);
				vectori     yv(nparts);
				vectori     mv(nparts);
				for (int p = 0; p < nparts; ++p) {
					ComponentPart part = parts.component(c, p);
					// calculate the child's points from the parent's points
					int x, y, m;
					if (part.isRoot()) {
						x = xv[0] = inds[i].x;
						y = yv[0] = inds[i].y;
						m = mv[0] = rootmix.at<int>(inds[i]);
					} else {
						int idx = part.parent().self();
						x = xv[idx];
						y = yv[idx];
						m = mv[idx];
						xv[p] = Ixnc[p][m].at<int>(y,x);
						yv[p] = Iync[p][m].at<int>(y,x);
						mv[p] = Iknc[p][m].at<int>(y,x);
					}

					// calculate the bounding rectangle and add it to the Candidate
					Point ptwo = Point(2,2);
					Point pone = Point(1,1);
					Point xy1 = (Point(xv[p],yv[p])-ptwo)*scale;
					Point xy2 = xy1 + Point(part.xsize(m), part.ysize(m))*scale - pone;
					if (part.isRoot()) candidate.addPart(Rect(xy1, xy2), rootv[n][c].at<T>(inds[i]));
					else candidate.addPart(Rect(xy1, xy2), 0.0);
				}
				#ifdef _OPENMP
				#pragma omp critical(addcandidate)
				#endif
				{
					candidates.push_back(candidate);
				}
			}
		}
	}
}
void DynamicProgram<T>::min(Parts& parts, vector2DMat& scores, vector4DMat& Ix, vector4DMat& Iy, vector4DMat& Ik, vector2DMat& rootv, vector2DMat& rooti) {

	// initialize the outputs, preallocate vectors to make them thread safe
	// TODO: better initialisation of Ix, Iy, Ik
	const unsigned int nscales = scores.size();
	const unsigned int ncomponents = parts.ncomponents();
	Ix.resize(nscales, vector3DMat(ncomponents));
	Iy.resize(nscales, vector3DMat(ncomponents));
	Ik.resize(nscales, vector3DMat(ncomponents));
	rootv.resize(nscales, vectorMat(ncomponents));
	rooti.resize(nscales, vectorMat(ncomponents));

	// for each scale, and each component, update the scores through message passing
	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	for (unsigned int nc = 0; nc < nscales*ncomponents; ++nc) {

		// calculate the inner loop variables from the dual variables
		const unsigned int n = floor(nc / ncomponents);
		const unsigned int c = nc % ncomponents;

		// allocate the inner loop variables
		Ix[n][c].resize(parts.nparts(c));
		Iy[n][c].resize(parts.nparts(c));
		Ik[n][c].resize(parts.nparts(c));
		vectorMat ncscores(scores[n].size());

		for (int p = parts.nparts(c)-1; p > 0; --p) {

			// get the component part (which may have multiple mixtures associated with it)
			ComponentPart cpart = parts.component(c, p);
			const unsigned int nmixtures  = cpart.nmixtures();
			const unsigned int pnmixtures = cpart.parent().nmixtures();
			Ix[n][c][p].resize(pnmixtures);
			Iy[n][c][p].resize(pnmixtures);
			Ik[n][c][p].resize(pnmixtures);

			// intermediate results for mixtures of this part
			vectorMat scoresp;
			vectorMat Ixp;
			vectorMat Iyp;

			for (unsigned int m = 0; m < nmixtures; ++m) {

				// raw score outputs
				Mat_<T> score_in, score_dt;
				Mat_<int> Ix_dt, Iy_dt;
				if (cpart.score(ncscores, m).empty()) {
					score_in = cpart.score(scores[n], m);
				} else {
					score_in = cpart.score(ncscores, m);
				}

				// get the anchor position
				Point anchor = cpart.anchor(m);

				// compute the distance transform
				vectorf w = cpart.defw(m);
				Quadratic fx(-w[0], -w[1]);
				Quadratic fy(-w[2], -w[3]);
				dt_.compute(score_in, fx, fy, anchor, score_dt, Ix_dt, Iy_dt);
				scoresp.push_back(score_dt);
				Ixp.push_back(Ix_dt);
				Iyp.push_back(Iy_dt);
			}

			for (unsigned int m = 0; m < pnmixtures; ++m) {
				vectorMat weighted;
				// weight each of the child scores
				// TODO: More elegant way of handling bias
				for (unsigned int mm = 0; mm < nmixtures; ++mm) {
					weighted.push_back(scoresp[mm] + cpart.bias(mm)[m]);
				}
				// compute the max over the mixtures
				Mat maxv, maxi;
				Math::reduceMax<T>(weighted, maxv, maxi);

				// choose the best indices
				Mat Ixm, Iym;
				Math::reducePickIndex<int>(Ixp, maxi, Ixm);
				Math::reducePickIndex<int>(Iyp, maxi, Iym);
				Ix[n][c][p][m] = Ixm;
				Iy[n][c][p][m] = Iym;
				Ik[n][c][p][m] = maxi;

				// update the parent's score
				ComponentPart parent = cpart.parent();
				if (parent.score(ncscores,m).empty()) parent.score(scores[n],m).copyTo(parent.score(ncscores,m));
				parent.score(ncscores,m) += maxv;
				if (parent.self() == 0) {
					ComponentPart root = parts.component(c);
				}
			}
		}
		// add bias to the root score and find the best mixture
		ComponentPart root = parts.component(c);
		Mat rncscore = root.score(ncscores,0);
		T bias = root.bias(0)[0];
		vectorMat weighted;
		// weight each of the child scores
		for (unsigned int m = 0; m < root.nmixtures(); ++m) {
			weighted.push_back(root.score(ncscores,m) + bias);
		}
		Math::reduceMax<T>(weighted, rootv[n][c], rooti[n][c]);
	}
}