cplx CombinedRepresentation<Rank>::InnerProduct(const Wavefunction<Rank>& w1, const Wavefunction<Rank>& w2)
{
	/*
	 * Computes the inner product of the wavefunctions w1 and w2 in this
	 * combined representation, using the implementation selected by
	 * `Algorithm`:
	 *
	 *   1, 2 : delegate to InnerProductImpl_Algo1 / InnerProductImpl_Algo2
	 *   3    : apply integration weights (orthogonal ranks) or the overlap
	 *          matrix (non-orthogonal ranks) to w2's data rank by rank in
	 *          temporary buffers, then call VectorInnerProduct(d1, result)
	 *   4    : copy w2 into a work buffer, call MultiplyIntegrationWeights
	 *          on it, then call VectorInnerProduct
	 *
	 * Algorithms 3 and 4 borrow (and if necessary allocate) data buffers on
	 * the wavefunctions, which is why constness is cast away. The active
	 * data of w1 and w2 is not written to by this function itself.
	 *
	 * Returns the value computed by the selected algorithm.
	 *
	 * Throws std::runtime_error if `Algorithm` is not one of 1..4, or if
	 * Algorithm 3 meets a rank that is both distributed and non-orthogonal.
	 * Buffers locked / activated by this function are released again before
	 * any exception leaves the function.
	 */
	blitz::Array<cplx, Rank> d1(w1.GetData());
	blitz::Array<cplx, Rank> d2(w2.GetData());

	/*
	 * Algorithm1 is faster for orthogonal bases
	 * Algorithm2 is faster for nonorthogonal bases
	 * Algorithm3 is the only one working for parallel problems, but requires
	 *    more memory
	 *
	 * For a combination of orthogonal and non-orthogonal
	 * bases, 1 and 2 are most likely almost equally fast 
	 *
	 * Conclusion: Algo 3 is default
	 */

	if (Algorithm == 1)
	{
		return InnerProductImpl_Algo1(d1, d2);
	}
	else if (Algorithm == 2)
	{
		return InnerProductImpl_Algo2(d1, d2);
	}
	else if (Algorithm == 3)
	{
		blitz::TinyVector<int, Rank> shape = d1.shape();
		
		blitz::Array<cplx, Rank> temp1;
		blitz::Array<cplx, Rank> temp2;
		
		//tempName[i]    : name of the i'th temporary buffer (-1 = none yet)
		//tempNamePsi[i] : index into psiList of the wavefunction owning it
		//tempLocked[i]  : true once this function has locked that buffer
		int tempName[2];
		int tempNamePsi[2];
		bool tempLocked[2];
		Wavefunction<Rank>* psiList[2];
		for (int i=0; i<2; i++)
		{
			tempName[i] = -1;
			tempNamePsi[i] = -1;
			tempLocked[i] = false;
		}
		psiList[0] = const_cast<Wavefunction<Rank>*>(&w1);
		psiList[1] = const_cast<Wavefunction<Rank>*>(&w2);

		cplx innerProduct(0.0, 0.0);

		//Everything between locking and unlocking the temporary buffers is
		//guarded, so that an exception (including the runtime_error thrown
		//below) does not leave buffers locked on the wavefunctions.
		try
		{
			//Find any available buffers of correct size on any of the wavefunctions
			for (int i=0; i<2; i++)
			{
				//See if there is an available buffer in psi j
				for (int j=0; j<2; j++)
				{
					int name = psiList[j]->GetAvailableDataBufferName(shape);
					if (name != -1)
					{
						tempName[i] = name;
						tempNamePsi[i] = j;
						//Lock immediately so the next search does not
						//hand out the same buffer again
						psiList[j]->LockBuffer(name);
						tempLocked[i] = true;
						break;
					}
				}
			}

			//If we didnt find two available buffers, we must allocate
			//We'll allocate on w2
			for (int i=0; i<2; i++)
			{
				if (tempName[i] == -1)
				{
					tempName[i] = psiList[1]->AllocateData(shape);
					tempNamePsi[i] = 1;
					psiList[1]->LockBuffer(tempName[i]);
					tempLocked[i] = true;
				}
			}

			//Get the actual data buffers
			temp1.reference(psiList[tempNamePsi[0]]->GetData(tempName[0]));
			temp2.reference(psiList[tempNamePsi[1]]->GetData(tempName[1]));

			//Perform MatrixVector multiplication rank by rank.
			//After each rank, temp1 refers to the current intermediate result.
			for (int i=0; i<Rank; i++)
			{
				if (this->GetDistributedModel()->IsDistributedRank(i) && !this->IsOrthogonalBasis(i))
				{
					throw std::runtime_error("This inner product only supports parallelization for orthogonal ranks");
				}

				if (this->IsOrthogonalBasis(i))
				{
					//The first rank starts from w2's data; later ranks
					//continue in place on temp1
					if (i == 0)
					{
						temp1 = d2;
					}
					//TODO: Make this faster by moving it to TensorMultiply
					blitz::Array<double, 1> weights = this->GetLocalWeights(i);
					//presumably a rank-3 view of temp1 with rank i as the
					//middle index -- confirm against MapToRank3
					blitz::Array<cplx, 3> temp3d = MapToRank3(temp1, i, 1);
					//Scale by the weight belonging to the middle index; the
					//"+ 0*k" term presumably only makes the expression rank 3
					temp3d *= weights(blitz::tensor::j) + 0*blitz::tensor::k;
				}
				else
				{
					blitz::Array<cplx, 2> overlapMatrix = this->GetGlobalOverlapMatrix(i)->GetOverlapHermitianLower();
					//Reshape the overlap matrix into a N-d array suitable for TensorPotentialMultiply
					//(a non-owning view on overlapMatrix's storage: extent 1
					//in every rank except rank i)
					blitz::TinyVector<int, Rank> overlapShape = 1;
					overlapShape(i) = overlapMatrix.size();
					blitz::TinyVector<int, Rank> overlapStride = 1;
					for (int j=0; j<i; j++)
					{
						overlapStride(j) = overlapMatrix.size();
					}
					blitz::Array<cplx, Rank> overlapTensor(overlapMatrix.data(), overlapShape, overlapStride, blitz::neverDeleteData);
			
					if (i==0)
					{
						//First rank: source is w2's data, destination is temp1
						temp1 = 0;
						TensorPotentialMultiply_Rank1_Band(i, overlapTensor, 1.0, d2, temp1);
					}
					else
					{
						//Later ranks: multiply temp1 into temp2, then swap so
						//that temp1 again refers to the newest result
						temp2 = 0;
						TensorPotentialMultiply_Rank1_Band(i, overlapTensor, 1.0, temp1, temp2);
						blitz::swap(temp1, temp2);
					}
				}
			}

			//Calculate inner product by overlap of the vectors
			innerProduct = VectorInnerProduct(d1, temp1);
		}
		catch (...)
		{
			//Release whatever we managed to lock, then pass the error on
			for (int i=0; i<2; i++)
			{
				if (tempLocked[i])
				{
					psiList[tempNamePsi[i]]->UnLockBuffer(tempName[i]);
				}
			}
			throw;
		}

		for (int i=0; i<2; i++)
		{
			psiList[tempNamePsi[i]]->UnLockBuffer(tempName[i]);
		}
	
		return innerProduct;
	}
	else if (Algorithm == 4) //Using DistributedOverlapMatrix / Trilinos
	{
		
		Wavefunction<Rank>* psiLeft = const_cast<Wavefunction<Rank>*>(&w1);
		Wavefunction<Rank>* psiRight = const_cast<Wavefunction<Rank>*>(&w2);

		//Names of the originally active buffers; -1 until the active buffer
		//has actually been switched (-1 is also the "no buffer" value
		//returned by GetAvailableDataBufferName)
		int oldNameLeft = -1;
		int oldNameRight = -1;
		bool leftLocked = false;
		bool rightLocked = false;

		cplx innerProduct(0.0, 0.0);

		//Guarded so that an exception does not leave the wavefunctions with
		//a work buffer active and their original data locked
		try
		{
			//Get name of available databuffer for "left" wavefunction
			int nameLeft = psiLeft->GetAvailableDataBufferName(psiLeft->GetData().shape());
			
			//Is a buffer available? If not, create one
			if (nameLeft == -1)
			{
				nameLeft = psiLeft->AllocateData(psiLeft->GetData().shape());
			}

			oldNameLeft = psiLeft->SetActiveBuffer(nameLeft);

			//Lock old buffer
			psiLeft->LockBuffer(oldNameLeft);
			leftLocked = true;

			//Copy data from old buffer to work buffer
			//NOTE(review): the product below reads the *old* left buffer, so
			//this copy looks unused -- confirm before removing
			psiLeft->GetData()(blitz::Range::all()) = psiLeft->GetData(oldNameLeft)(blitz::Range::all());

			//Similar procedure for "right" wavefunction
			int nameRight = psiRight->GetAvailableDataBufferName(psiRight->GetData().shape());
			if (nameRight == -1)
			{
				nameRight = psiRight->AllocateData(psiRight->GetData().shape());
			}

			oldNameRight = psiRight->SetActiveBuffer(nameRight);
			
			//Lock original data buffer
			psiRight->LockBuffer(oldNameRight);
			rightLocked = true;
			
			//Copy data from old to new (work) buffer
			psiRight->GetData(nameRight)(blitz::Range::all()) = psiRight->GetData(oldNameRight)(blitz::Range::all());

			//Multiply integration weights
			//(psiRight's active buffer is the work buffer at this point)
			MultiplyIntegrationWeights(*psiRight);

			//Calculate inner product by overlap of the vectors
			innerProduct = VectorInnerProduct(psiLeft->GetData(oldNameLeft), psiRight->GetData(nameRight));
		}
		catch (...)
		{
			//Undo only what was done, in the same order as the normal path
			if (oldNameRight != -1)
			{
				if (rightLocked)
				{
					psiRight->UnLockBuffer(oldNameRight);
				}
				psiRight->SetActiveBuffer(oldNameRight);
			}
			if (oldNameLeft != -1)
			{
				if (leftLocked)
				{
					psiLeft->UnLockBuffer(oldNameLeft);
				}
				psiLeft->SetActiveBuffer(oldNameLeft);
			}
			throw;
		}
		
		//Unlock and restore original buffers
		psiRight->UnLockBuffer(oldNameRight);
		psiRight->SetActiveBuffer(oldNameRight);
		psiLeft->UnLockBuffer(oldNameLeft);
		psiLeft->SetActiveBuffer(oldNameLeft);

		return innerProduct;
	}
	else
	{
		cout << "Unknown InnerProduct algorithm " << Algorithm << endl;
		throw std::runtime_error("Unknown InnerProduct algorithm");
	}

}