// Assembles the known-term column vector  d = { f ; -b }  of the system:
// the first n_q rows hold the 'fb' blocks of the active variables, the
// following n_c rows hold the 'b_i' terms of the active constraints with
// flipped sign. Returns the number of rows written (n_q + n_c).
int ChLcpSystemDescriptor::BuildDiVector(
								ChMatrix<>& Dvector	///< matrix which will contain the entire vector {f; -b}
						)
{
	// Refresh the cached counts of active scalar unknowns.
	n_q = CountActiveVariables();
	n_c = CountActiveConstraints();
	const int total_rows = n_q + n_c;

	// Single column, one row per active unknown.
	// Fast: Reset() does not reallocate if the size does not change.
	Dvector.Reset(total_rows, 1);

	// Upper part: paste the 'f' block of each active variable at its own offset.
	const int num_variables = static_cast<int>(vvariables.size());
	#pragma omp parallel for num_threads(this->num_threads)
	for (int v = 0; v < num_variables; v++)
	{
		if (!vvariables[v]->IsActive())
			continue;

		Dvector.PasteMatrix(&vvariables[v]->Get_fb(), vvariables[v]->GetOffset(), 0);
	}

	// Lower part: store '-b_i' (note the flipped sign!) for each active
	// constraint; constraint rows start right after the n_q variable rows.
	const int num_constraints = static_cast<int>(vconstraints.size());
	#pragma omp parallel for num_threads(this->num_threads)
	for (int c = 0; c < num_constraints; c++)
	{
		if (!vconstraints[c]->IsActive())
			continue;

		Dvector(vconstraints[c]->GetOffset() + n_q) = - vconstraints[c]->Get_b_i();
	}

	return total_rows;
}
// Assembles the column vector 'f' by pasting the 'fb' block of every active
// variable at the offset reported by that variable.
// Returns the number of rows of the vector (n_q).
int ChLcpSystemDescriptor::BuildFbVector(
								ChMatrix<>& Fvector	///< matrix which will contain the entire vector of 'f'
						)
{
	// Refresh the cached count of active variable unknowns and size the output.
	// Fast: Reset() does not reallocate if the size does not change.
	n_q = CountActiveVariables();
	Fvector.Reset(n_q, 1);

	// Each active variable writes its own 'f' block.
	const int num_variables = static_cast<int>(vvariables.size());
	#pragma omp parallel for num_threads(this->num_threads)
	for (int v = 0; v < num_variables; v++)
	{
		if (!vvariables[v]->IsActive())
			continue;

		Fvector.PasteMatrix(&vvariables[v]->Get_fb(), vvariables[v]->GetOffset(), 0);
	}

	return n_q;
}
// Assembles a column vector with the diagonal terms of the system matrix:
// the first n_q rows receive the contributions of the stiffness blocks and of
// the active variables ('M' part), the following n_c rows receive the 'E'
// diagonal of the active constraints. Returns the number of rows (n_q + n_c).
int  ChLcpSystemDescriptor::BuildDiagonalVector(
								ChMatrix<>& Diagonal_vect  	///< matrix which will contain the entire vector of terms on M and E diagonal
							)
{
	// Refresh the cached counts of active scalar unknowns.
	n_q = CountActiveVariables();
	n_c = CountActiveConstraints();
	const int total_rows = n_q + n_c;

	// Fast: Reset() does not reallocate if the size does not change.
	Diagonal_vect.Reset(total_rows, 1);

	// Stiffness blocks first, if any. Kept serial on purpose: this cannot be
	// easily parallelized because of possible write concurrency.
	const int num_stiffness = static_cast<int>(vstiffness.size());
	for (int k = 0; k < num_stiffness; k++)
		vstiffness[k]->DiagonalAdd(Diagonal_vect);

	// Then accumulate the 'M' diagonal terms of the active variables.
	const int num_variables = static_cast<int>(vvariables.size());
	#pragma omp parallel for num_threads(this->num_threads)
	for (int v = 0; v < num_variables; v++)
	{
		if (!vvariables[v]->IsActive())
			continue;

		vvariables[v]->DiagonalAdd(Diagonal_vect);
	}

	// Finally the 'E' diagonal terms of the active constraints, stored after
	// the n_q variable rows (note the sign: E_i = -cfm_i).
	const int num_constraints = static_cast<int>(vconstraints.size());
	#pragma omp parallel for num_threads(this->num_threads)
	for (int c = 0; c < num_constraints; c++)
	{
		if (!vconstraints[c]->IsActive())
			continue;

		Diagonal_vect(vconstraints[c]->GetOffset() + n_q) = - vconstraints[c]->Get_cfm_i();
	}

	return total_rows;
}
// Computes the product of the whole system matrix by a vector:
//
//    result = | M+K   Cq' | * | x.q |
//             | Cq    E   |   | x.l |
//
// where the first n_q rows refer to the active variables and the following
// n_c rows to the active constraints. If 'x' is null, the vector is built
// from the current unknowns via FromUnknownsToVector().
void ChLcpSystemDescriptor::SystemProduct(	
								ChMatrix<>&	result,			///< matrix which contains the result of matrix by x 
								ChMatrix<>* x		        ///< optional matrix with the vector to be multiplied (if null, use current l_i and q)
								// std::vector<bool>* enabled=0 ///< optional: vector of enable flags, one per scalar constraint. true=enable, false=disable (skip)
								)
{
	n_q = this->CountActiveVariables();
	n_c = this->CountActiveConstraints();

	// Temporary vector, allocated only when the caller did not provide 'x'.
	ChMatrix<>* x_ql = 0;

	// Vector that is actually multiplied: either the caller's 'x' or the temporary.
	ChMatrix<>* vect;

	if (x)
	{
		#ifdef CH_DEBUG
			assert(x->GetRows()   == n_q+n_c);
			assert(x->GetColumns()== 1);
		#endif
		vect = x;
	}
	else
	{
		x_ql = new ChMatrixDynamic<double>(n_q+n_c,1);
		vect = x_ql;
		this->FromUnknownsToVector(*vect);
	}

	// NOTE: every product below must go through 'xv' and never through '*x',
	// because 'x' is null when the temporary vector is in use.
	ChMatrix<>& xv = *vect;

	result.Reset(n_q+n_c,1); // fast! Reset() method does not realloc if size doesn't change

	// 1) First row: result.q part =  [M + K]*x.q + [Cq']*x.l

	// 1.1)  do  M*x.q
	#pragma omp parallel for num_threads(this->num_threads)
	for (int iv = 0; iv< (int)vvariables.size(); iv++)
		if (vvariables[iv]->IsActive())
		{
			vvariables[iv]->MultiplyAndAdd(result, xv);
		}

	// 1.2)  add also K*x.q  (NON straight parallelizable - risk of concurrency in writing)
	for (int ik = 0; ik< (int)vstiffness.size(); ik++)
	{
		vstiffness[ik]->MultiplyAndAdd(result, xv);
	}

	// 1.3)  add also [Cq]'*x.l  (NON straight parallelizable - risk of concurrency in writing)
	for (int ic = 0; ic < (int)vconstraints.size(); ic++)
	{	
		if (vconstraints[ic]->IsActive())
		{
			vconstraints[ic]->MultiplyTandAdd(result,  xv(vconstraints[ic]->GetOffset()+n_q));
		}
	}

	// 2) Second row: result.l part =  [C_q]*x.q + [E]*x.l
	#pragma omp parallel for num_threads(this->num_threads)
	for (int ic = 0; ic < (int)vconstraints.size(); ic++)
	{	
		if (vconstraints[ic]->IsActive())
		{
			int s_c = vconstraints[ic]->GetOffset() + n_q;
			vconstraints[ic]->MultiplyAndAdd(result(s_c), xv);     // result.l_i += [C_q_i]*x.q
			result(s_c) -= vconstraints[ic]->Get_cfm_i()* xv(s_c); // result.l_i += [E]*x.l_i  NOTE:  cfm = -E
		}
	}		

	// if a temp vector has been created because x was not provided, then delete it
	if (x_ql)
		delete x_ql;
}
// Computes the sparse product with the Schur complement
//
//    result = [N]*l = [ [Cq][M^(-1)][Cq'] - [E] ] * l
//
// As a side effect, the 'qb' data of the active variables is overwritten
// (zeroed in phase 1, then incremented in phase 2).
// The case with stiffness blocks (ChLcpKblock items) is not supported.
void ChLcpSystemDescriptor::ShurComplementProduct(	
								ChMatrix<>&	result,		///< matrix which will contain the result, one row per active constraint
								ChMatrix<>* lvector,	///< optional vector of multipliers 'l' (if null, the l_i stored in each constraint is used)
								std::vector<bool>* enabled  ///< optional enable flags, indexed by constraint offset; rows of disabled constraints are set to 0
								)
{
	// Count the active constraints in every build configuration: the size of
	// 'result' must not depend on whether CH_DEBUG is defined (as done by the
	// other functions of this class, the cached member n_c is refreshed).
	n_c = this->CountActiveConstraints();

	#ifdef CH_DEBUG
		assert(this->vstiffness.size() == 0); // currently, the case with ChLcpKblock items is not supported (only diagonal M is supported, no K)
		// Both 'lvector' and 'enabled' are optional: check them only if provided.
		assert(!lvector || lvector->GetRows()    == n_c);
		assert(!lvector || lvector->GetColumns() == 1);
		assert(!enabled || (int)enabled->size()  == n_c);
	#endif

	result.Reset(n_c,1);  // fast! Reset() method does not realloc if size doesn't change


	// Performs the sparse product    result = [N]*l = [ [Cq][M^(-1)][Cq'] - [E] ] *l
	// in different phases:

	// 1 - set the qb vector (aka speeds, in each ChLcpVariable sparse data) as zero

	#pragma omp parallel for num_threads(this->num_threads)
	for (int iv = 0; iv< (int)vvariables.size(); iv++)
	{
		if (vvariables[iv]->IsActive())
			vvariables[iv]->Get_qb().FillElem(0);
	}

	// 2 - performs    qb=[M^(-1)][Cq']*l  by
	//     iterating over all constraints (when implemented in parallel this
	//     could be non-trivial because race conditions might occur -> reduction buffer etc.)
	//     Also, begin to add the cfm term ( -[E]*l ) to the result.

	//#pragma omp parallel for num_threads(this->num_threads)  ***NOT POSSIBLE!!! concurrent write to same q may happen
	for (int ic = 0; ic < (int)vconstraints.size(); ic++)
	{	
		if (vconstraints[ic]->IsActive())
		{
			int s_c = vconstraints[ic]->GetOffset();

			bool process=true;
			if (enabled)
				if ((*enabled)[s_c]==false)
					process = false;

			if (process) 
			{
				// Multiplier of this constraint: from 'lvector' if given,
				// otherwise the value stored in the constraint itself.
				double li;
				if (lvector)
					li = (*lvector)(s_c,0);
				else
					li = vconstraints[ic]->Get_l_i();

				// Compute qb += [M^(-1)][Cq']*l_i
				//  NOTE! parallel update to same q data, risk of collision if parallel!!
				vconstraints[ic]->Increment_q(li);	// <----!!!  fpu intensive

				// Add constraint force mixing term  result = cfm * l_i = -[E]*l_i
				result(s_c,0) =  vconstraints[ic]->Get_cfm_i() * li;

			}

		}
	}

	// 3 - performs    result += [Cq]*qb    by
	//     iterating over all constraints (each one writes only its own row,
	//     so this phase can run in parallel)

	#pragma omp parallel for num_threads(this->num_threads)
	for (int ic = 0; ic < (int)vconstraints.size(); ic++)
	{	
		if (vconstraints[ic]->IsActive())
		{
			bool process=true;
			if (enabled)
				if ((*enabled)[vconstraints[ic]->GetOffset()]==false)
					process = false;
			
			if (process) 
				result(vconstraints[ic]->GetOffset(),0)+= vconstraints[ic]->Compute_Cq_q();	// <----!!!  fpu intensive
			else
				result(vconstraints[ic]->GetOffset(),0)= 0; // not enabled constraints, just set to 0 result 
		}
	}


}