Exemplo n.º 1
0
	/**
	 * Generates the VHDL of the LNSMul operator.
	 *
	 * The two inputs nA and nB and the output nR are wE+wF+3 bits wide.
	 * The low wE+wF bits of each input are sign-extended by one bit and added
	 * with an IntAdder sub-component; bit wE+wF of the inputs is XOR-ed; the
	 * two top bits of the result are selected from the two top bits of both
	 * inputs and from the two top bits of the sum.
	 * (The top two bits are presumably exception flags and bit wE+wF the
	 * sign -- TODO confirm against the LNS format documentation.)
	 *
	 * @param target  target passed to Operator and to the IntAdder
	 * @param wE      width of the integral part of the exponent
	 * @param wF      width of the fractional part of the exponent
	 */
	LNSMul::LNSMul(Target * target, int wE, int wF) :
		Operator(target), wE(wE), wF(wF)
	{
		// Widths that recur throughout the constructor
		const int logWidth  = wE + wF;        // integral + fractional exponent bits
		const int sumWidth  = logWidth + 1;   // one extra bit for the sign-extended sum
		const int wordWidth = logWidth + 3;   // full width of nA, nB and nR

		// Operator name: LNSMul_<wE>_<wF>
		ostringstream opName;
		opName << "LNSMul_" << wE << "_" << wF;
		setName(opName.str());
		setCopyrightString("Jérémie Detrey, Florent de Dinechin (2003-2004), Sylvain Collange (2008)");

		// Interface
		addInput ("nA", wordWidth);
		addInput ("nB", wordWidth);
		addOutput("nR", wordWidth);

		// wE and wF are referenced by name inside the VHDL text emitted below
		addConstant("wE", "positive", wE);
		addConstant("wF", "positive", wF);

		// eRn = sign-extended nA(wE+wF-1 downto 0) + sign-extended nB(wE+wF-1 downto 0),
		// computed by a dedicated IntAdder instance
		IntAdder *expAdder = new IntAdder(target, sumWidth);
		oplist.push_back(expAdder);
		vhdl << tab << declare("X", sumWidth) << "<= nA(wE+wF-1) & nA(wE+wF-1 downto 0);\n";
		vhdl << tab << declare("Y", sumWidth) << "<= nB(wE+wF-1) & nB(wE+wF-1 downto 0);\n";
		inPortMap   (expAdder, "X", "X");
		inPortMap   (expAdder, "Y", "Y");
		inPortMapCst(expAdder, "Cin", "'0'");
		outPortMap  (expAdder, "R", "eRn");
		vhdl << instance(expAdder, "my_add");

		// XOR of bit wE+wF of both operands
		vhdl << tab << declare("sRn") << " <= nA(wE+wF) xor nB(wE+wF);\n";

		// Two-bit code derived from the two top bits of the sum
		vhdl << tab << declare("xRn", 2) << " <= \"00\" when eRn(wE+wF downto wE+wF-1) = \"10\" else\n";
		vhdl << tab << "	 \"10\" when eRn(wE+wF downto wE+wF-1) = \"01\" else\n";
		vhdl << tab << "	 \"01\";\n";

		// Candidate result word built from the three pieces above
		vhdl << tab << declare("nRn", wordWidth) << " <= xRn & sRn & eRn(wE+wF-1 downto 0);\n";

		// Top two bits of each operand, concatenated with the larger one first
		vhdl << tab << declare("xA", 2) << " <= nA(wE+wF+2 downto wE+wF+1);\n";
		vhdl << tab << declare("xB", 2) << " <= nB(wE+wF+2 downto wE+wF+1);\n";
		vhdl << tab << declare("xAB", 4) << " <= xA & xB when xA >= xB else\n";
		vhdl << tab << "	 xB & xA;\n";

		// Top two bits of the output are chosen by the ordered pair xAB;
		// only the "0101" case uses the code computed from the sum
		vhdl << tab << "with xAB select\n";
		vhdl << tab << tab << "nR(wE+wF+2 downto wE+wF+1) <= xRn  when \"0101\",\n";
		vhdl << tab << "	                                    \"00\" when \"0000\" | \"0100\",\n";
		vhdl << tab << "	                                    \"10\" when \"1001\" | \"1010\",\n";
		vhdl << tab << "	                                    \"11\" when others;\n";
		vhdl << tab << "\n";

		// Remaining output bits come straight from the candidate word
		vhdl << tab << "nR(wE+wF downto 0) <= nRn(wE+wF downto 0);\n";
	}
Exemplo n.º 2
0
	/**
	 * Generates the VHDL of a wIn-bit adder R = X0 + X1 + Cin built from
	 * independent sub-adders ("chunks") and a chain of carry-select multiplexers.
	 *
	 * Every chunk except the first is added twice, once with carry-in '0'
	 * ("_Zero") and once with carry-in '1' ("_One"); the first chunk is added
	 * with the real Cin. The carry-outs of the two variants are gathered in
	 * carryStringZero / carryStringOne, a multiplexer chain derives the real
	 * carry c<i> entering each chunk, and each result chunk res<i> is selected
	 * accordingly.
	 *
	 * Chunk sizes:
	 *  - regular > 0 : every chunk is `regular` bits wide, the last one takes
	 *    whatever remains;
	 *  - otherwise   : sizes are derived from the target timing model: the
	 *    first chunk has l0 bits, the second 1 bit, and each following chunk is
	 *    one bit wider than the previous one until the remainder fits.
	 *
	 * Changes with respect to the previous revision:
	 *  - the carry-multiplexer timing figures are no longer read uninitialised
	 *    on targets other than Virtex4/5/6;
	 *  - the chunk-size table is sized from wIn instead of a fixed 100 entries;
	 *  - no zero-width or out-of-range chunk is produced for short operands;
	 *  - the carry strings are only emitted when at least three chunks exist;
	 *  - the permanently disabled (`if (false)`) legacy chunk splitter was removed.
	 *
	 * @param target       target providing the timing model
	 * @param wIn          operand width in bits
	 * @param inputDelays  input delays; only their maximum is used here
	 * @param regular      when > 0, the fixed chunk size to use
	 */
	LongIntAdderMuxNetwork::LongIntAdderMuxNetwork(Target* target, int wIn, map<string, double> inputDelays, int regular):
		Operator(target), wIn_(wIn), inputDelays_(inputDelays) 
	{
		srcFileName="LongIntAdderMuxNetwork";
		setName(join("LongIntAdderMuxNetwork_", wIn_));

		// Set up the IO signals
		for (int i=0; i<2; i++)
			addInput ( join("X",i) , wIn_, true);
		addInput("Cin");
		addOutput("R"  , wIn_, true, 1);

		//compute the maximum input delay
		maxInputDelay = getMaxInputDelays(inputDelays);

		// Select-to-output delay of the carry-chain multiplexer and the fan-out
		// weight of the carry bits, per target family.
		// NOTE(review): the defaults below are the Virtex4 figures (the most
		// pessimistic of the three known sets). They are only a deterministic
		// fallback for targets without data; confirm or supply real values.
		double muxcystooOut = 0.524e-9;
		int fanOutWeight = 60;

		if (target->getID()=="Virtex5"){
			// fdcq = 0.396e-9; 
			// xordelay = 0.300e-9;
			// dcarry = 0.023e-9;
			// muxcystoo = 0.305e-9;
			muxcystooOut = 0.504e-9; 
			fanOutWeight = 45;
		}else if (target->getID()=="Virtex6"){
			// fdcq = 0.280e-9;
			// xordelay = 0.180e-9;
			// dcarry = 0.015e-9;
			// muxcystoo =	0.219e-9;
			muxcystooOut = 0.373e-9;
			fanOutWeight = 51;
		}else if (target->getID()=="Virtex4"){
			// fdcq = 0.272e-9;
			// xordelay = 0.273e-9;
			// dcarry = 0.034e-9;
			// muxcystoo = 0.278e-9;
			muxcystooOut = 0.524e-9;
			fanOutWeight = 60;
		}else{
			REPORT(INFO, "No carry-multiplexer timing data for target " << target->getID() << ", using the Virtex4 figures");
		}

		// Sub-adder sizes suggested by the target for the last chunk (lkm1),
		// for the middle chunks (ll) and for the first chunk (l0)
		int ll = 0, l0 = 0;
		int lkm1 = 0;

		double iDelay = getMaxInputDelays(inputDelays);

#ifdef MAXSIZE
		// Exploration mode: sweep the frequency, print the maximum adder size and exit
		for (int aa=25; aa<=500; aa+=5){
			target->setFrequency(double(aa)*1000000.0);

#endif
		bool nogo = false;
		double t = 1.0 / target->frequency();

		if (!target->suggestSlackSubaddSize(lkm1, wIn, iDelay /*fdcq + target->localWireDelay()*/ + target->localWireDelay() + target->lutDelay())){
			nogo = true;
		}

		double z =	iDelay +
					/*fdcq + target->localWireDelay() +*/
					target->lutDelay() + //xordelay +
					muxcystooOut + // the select to output line of the carry chain multiplexer. 
					               // usually this delay for the 1-bit addition which is not overlapping   
					target->localWireDelay() + 
					target->localWireDelay(fanOutWeight) + //final multiplexer delay. Fan-out of the CGC bits is accounted for
					target->lutDelay();
#ifdef DEBUGN
		cerr << "lut             delay = " << target->lutDelay() << endl;
		cerr << "muxcystooOut    delay = " << muxcystooOut << endl;
		cerr << "localWireDelay  delay = " << target->localWireDelay() << endl;
		cerr << "localWireDelay2 delay = " << target->localWireDelay(fanOutWeight) << endl;
		cerr << "z slack = " << z << endl;
#endif
		// The call must happen even when nogo is already set: it computes ll
		if (!target->suggestSlackSubaddSize(ll, wIn, z))
			nogo = true;
#ifdef DEBUGN
		cerr << "ll is = "<<ll << endl;
#endif
		// The status of this call is deliberately not folded into nogo (as before)
		target->suggestSlackSubaddSize(l0, wIn, t - (2*target->lutDelay()+ muxcystooOut/* xordelay*/));

		REPORT(INFO, "l0="<<l0);

		int maxAdderSize = lkm1 + ll*(ll+1)/2 + l0;
		if (nogo) 
			maxAdderSize = -1;
		REPORT(INFO, "ll="<<ll);
		REPORT(INFO, "max adder size is="<< maxAdderSize);

#ifdef MAXSIZE
		cout << " f="<<aa<<" s="<<maxAdderSize<<endl;
		}
		exit(1);
#endif

		// At most one chunk per operand bit can be produced below (plus a little
		// slack); never allocate less than the 100 entries used previously.
		const int chunkCapacity = (wIn_ + 4 > 100) ? (wIn_ + 4) : 100;
		cSize = new int[chunkCapacity];

		if (regular>0) {
			// Fixed-size chunks, the last one takes the remainder
			int c = regular;
			REPORT(INFO, "c="<<c);
			int s = wIn_;
			int j=0;
			while (s>0){
				if (s-c>0){
					cSize[j]=c;
					s-=c;
				}else{
					cSize[j]=s;
					s=0;
				}
				j++;
			}
			nbOfChunks = j;
		}else if (l0 + 1 > wIn){
			// The whole operand fits in the first chunk: a {l0, 1} split would
			// address bits beyond the operand width, so use a single chunk.
			cSize[0] = wIn;
			nbOfChunks = 1;
		}else{
			// Timing-driven chunks: l0, 1, 2, 3, ... until the remainder fits
			int td = wIn;
			cSize[0] = l0;
			cSize[1] = 1;
			td -= (l0+1);
			nbOfChunks = 2;
			while (td > 0){
				int nc = cSize[nbOfChunks-1] + 1;
				int nnc = lkm1;

				REPORT(INFO,"nc="<<nc);
				REPORT(INFO,"nnc="<<nnc);

				if (nc + nnc >= td){
					REPORT(INFO, "Finish");
					//we can finish it now;
					if (nc>=td)
						nc = td-1;
					// nc is 0 when a single bit remains: skip it instead of
					// emitting a zero-width chunk
					if (nc > 0){
						cSize[nbOfChunks] = nc;
						nbOfChunks++;
						td-=nc;
					}
					cSize[nbOfChunks] = td;
					nbOfChunks++;
					td=0;
				}else{
					REPORT(INFO, "run");
					//not possible to finish chunk splitting now
					cSize[nbOfChunks] = nc;
					nbOfChunks++;
					td-=nc;
				}
			}
		}

		for (int i=0; i<nbOfChunks; i++)
			REPORT(INFO, "cSize["<<i<<"]="<<cSize[i]);

//#define test512
#ifdef test512
		nbOfChunks = 16;
		for (int i=1;i<=16;i++)
			cSize[i-1]=32;

#endif
		//=================================================
		//split the inputs ( this should be reusable )
		vhdl << tab << "--split the inputs into chunks of bits depending on the frequency" << endl;
		for (int i=0;i<2;i++) {
			int low = 0; // index of the lowest bit of the current chunk
			for (int j=0; j<nbOfChunks; j++){
				ostringstream name;
				//the naming standard: sX j _ i _ l
				//j=the chunk index i is the input index and l is the current level
				name << "sX"<<j<<"_"<<i<<"_l"<<0;
				const int high = low + cSize[j];
				vhdl << tab << declare (name.str(),cSize[j],true) << " <=  X"<<i<<range(high-1,low)<<";"<<endl;
				low = high;
			}
		}

		int l=1;
		for (int j=0; j<nbOfChunks; j++){
			//code for adder instantiation to stop ise from "optimizing"
			IntAdderSpecific *adder = new IntAdderSpecific(target, cSize[j]);
			oplist.push_back(adder);

			if (j>0){ //for all chunks greater than zero we perform this additions
				inPortMap(adder, "X", join("sX",j,"_0_l",l-1) );
				inPortMap(adder, "Y", join("sX",j,"_1_l",l-1) );
				inPortMapCst(adder, "Cin", "'0'");
				outPortMap(adder, "R",    join("sX",j,"_0_l",l,"_Zero") );
				outPortMap(adder, "Cout", join("coutX",j,"_0_l",l,"_Zero") );
				vhdl << instance(adder, join("adderZ",j) );

				// Second instance of the same adder: only Cin and the outputs are re-mapped
				inPortMapCst(adder, "Cin", "'1'");
				outPortMap(adder, "R", join("sX",j,"_0_l",l,"_One"));
				outPortMap(adder, "Cout", join("coutX",j,"_0_l",l,"_One"));
				vhdl << instance( adder, join("adderO",j) );
			}else{
				vhdl << tab << "-- the carry resulting from the addition of the chunk + Cin is obtained directly" << endl;
				inPortMap(adder, "X", join("sX",j,"_0_l",l-1) );
				inPortMap(adder, "Y", join("sX",j,"_1_l",l-1) );
				inPortMapCst(adder, "Cin", "Cin");
				outPortMap(adder, "R",    join("sX",j,"_0_l",l,"_Cin") );
				outPortMap(adder, "Cout", join("coutX",j,"_0_l",l,"_Cin") );
				vhdl << instance(adder, join("adderCin",j) );
			}
		}

		// The carry strings hold the carry-outs of chunks 1 .. nbOfChunks-2. With
		// fewer than three chunks there is nothing to put in them, and emitting
		// them would produce a zero-width signal with an empty assignment.
		if (nbOfChunks > 2) {
			vhdl << tab <<"--form the two carry string"<<endl;
			vhdl << tab << declare("carryStringZero",nbOfChunks-2) << " <= "; 
			for (int i=nbOfChunks-3; i>=0; i--) {
				vhdl << "coutX"<<i+1<<"_0_l"<<l<<"_Zero"<< (i>0?" & ":";") ;
			} vhdl << endl;

			vhdl << tab << declare("carryStringOne",  nbOfChunks-2) << "  <= "; 
			for (int i=nbOfChunks-3; i>=0; i--) {
				vhdl << "coutX"<<i+1<<"_0_l"<<l<<"_One" << " " << (i>0?" & ":";");
			} vhdl << endl;

			//multiplexer network: c<i+1> is the carry-out of chunk i+1 given the
			//carry that actually enters it
			for (int i=0; i<=nbOfChunks-3; i++){
				if (i==0)	
					vhdl << tab << declare( join("c",i+1) ) << " <= carryStringOne"<<of(i)<<" when Cin='1' else carryStringZero"<<of(i)<<";"<<endl; 
				else
					vhdl << tab << declare( join("c",i+1) ) << " <= carryStringOne"<<of(i)<<" when "<<join("c",i)<<"='1' else carryStringZero"<<of(i)<<";"<<endl;
			}
		}

		// Select, for each chunk, the sum matching the carry entering it.
		// (An earlier variant, formerly kept here as commented-out code, computed
		// the carries with a CarryGenerationCircuit on Xilinx or a short
		// IntAdderSpecific on Altera instead of the multiplexer chain above.)
		for (int i=0; i< nbOfChunks; i++){
			if (i==0)
				vhdl << tab << declare( join("res",i), cSize[i],true) << " <= " << join("sX",i,"_0_l",1,"_Cin") << ";" << endl;
			else if (i==1)
				vhdl << tab << declare( join("res",i), cSize[i],true) << " <= " << join("sX",i,"_0_l",1,"_Zero") << " when "<<join("coutX",0,"_0_l",1,"_Cin")<<"='0' else "<< join("sX",i,"_0_l",1,"_One") << ";" << endl;
			else
				vhdl << tab << declare( join("res",i), cSize[i],true) << " <= " << join("sX",i,"_0_l",1,"_Zero") << " when "<<join("c",i-1)<<"='0' else "<< join("sX",i,"_0_l",1,"_One") << ";" << endl;	
		}

		// Concatenate the result chunks, most significant first
		vhdl << tab << "R <= ";
		for (int i=nbOfChunks-1; i>=0; i--){
			vhdl << join("res",i);
			if (i > 0) vhdl << " & ";
		}
		vhdl << ";" <<endl;
	}
Exemplo n.º 3
0
	/**
	 * Generates the VHDL of an operator with three floating-point inputs
	 * X, Y, Z and one floating-point output R computing X*X + Y*Y + Z*Z.
	 *
	 * Two architectures are available:
	 *  - optimize == 0: three instances of an FPMult followed by two instances
	 *    of an FPAddSinglePath; the operator name gets an "_FP" suffix;
	 *  - optimize != 0: a dedicated datapath that sorts the operands by
	 *    exponent, squares the significands, aligns the two smaller squares
	 *    with right shifters, adds the three terms with an IntMultiAdder, then
	 *    normalises and rounds. As noted in the body, this version ignores
	 *    overflows, infinities and zeroes.
	 *
	 * The order of the statements matters: the manageCriticalPath /
	 * setCriticalPath / syncCycleFromSignal / setCycle calls are interleaved
	 * with the VHDL emission they apply to.
	 *
	 * @param target    target passed to Operator and to every sub-component
	 * @param wE        exponent width of the inputs and of the output
	 * @param wF        fraction width of the inputs and of the output
	 * @param optimize  selects the architecture, see above
	 */
	FPSumOf3Squares::FPSumOf3Squares(Target* target, int wE, int wF, int optimize)
		: Operator(target), wE(wE), wF(wF)
	{
		setCopyrightString("F. de Dinechin, Bogdan Pasca (2011)");
		srcFileName="FPSumOf3Squares";
		ostringstream o;
		o << "FPSumOf3Squares_" << wE << "_" << wF;
		if(!optimize)
			o << "_FP";
		setName(o.str());

		addFPInput("X", wE, wF);
		addFPInput("Y", wE, wF);
		addFPInput("Z", wE, wF);
		addFPOutput("R", wE, wF, 2); // This 2 means: we will allow two possible inputs (faithful rounding)

		if(!optimize) {
			//////////////////////////////////////////////////////////////////:
			//            A version that assembles FP operators             //
			//////////////////////////////////////////////////////////////////:

			// One multiplier and one adder object, each instantiated several times below
			FPMult* mult = new FPMult(target, wE, wF, wE, wF, wE, wF, 1);
			oplist.push_back(mult);
			FPAddSinglePath* add =  new FPAddSinglePath(target, wE, wF, wE, wF, wE, wF);
			oplist.push_back(add);
		
			// X2 = X*X
			inPortMap (mult, "X", "X");
			inPortMap (mult, "Y", "X");
			outPortMap(mult, "R", "X2");
			vhdl << instance(mult, "multx");
		
			// Y2 = Y*Y
			inPortMap (mult, "X", "Y");
			inPortMap (mult, "Y", "Y");
			outPortMap(mult, "R", "Y2");
			vhdl << instance(mult, "multy");
		
			// Z2 = Z*Z
			inPortMap (mult, "X", "Z");
			inPortMap (mult, "Y", "Z");
			outPortMap(mult, "R", "Z2");
			vhdl << instance(mult, "multz");
		
			syncCycleFromSignal("Z2", false);
			nextCycle(); 
		
			// X2PY2 = X2 + Y2
			inPortMap (add, "X", "X2");
			inPortMap (add, "Y", "Y2");
			outPortMap(add, "R", "X2PY2");
			vhdl << instance(add, "add1");
		
			syncCycleFromSignal("X2PY2", false);
			nextCycle(); 
		
			// X2PY2PZ2 = X2PY2 + Z2
			inPortMap (add, "X", "X2PY2");
			inPortMap (add, "Y", "Z2");
			outPortMap(add, "R", "X2PY2PZ2");
			vhdl << instance(add, "add2");
		
			syncCycleFromSignal("X2PY2PZ2", false);
			setCriticalPath(add->getOutputDelay("R"));
			vhdl << tab << "R <= X2PY2PZ2;"<<endl;
			outDelayMap["R"]=getCriticalPath();
		}
		else { ////////////////// here comes the FloPoCo version	//////////////////////////:
			// Error analysis
			// 3 ulps(wF+g) in the multiplier truncation
			// Again 2 ulps(wF+g) in the shifter output truncation
			// Normalisation truncation: either 0 (total 5), or 1 ulp(wF+g) but dividing the previous by 2 (total 3.5)
			// Total max 5 ulps, we're safe with 3 guard bits

			// guard bits for a faithful result
			int g=3; 

			// The exponent datapath

			// setCriticalPath( getMaxInputDelays(inputDelays) + target->localWireDelay());
			setCriticalPath(0);

			manageCriticalPath(  target->adderDelay(wE+1) // subtractions 
													 + target->localWireDelay(wE) // fanout of XltY etc
													 + target->lutDelay()         // & and mux
													 );

			//---------------------------------------------------------------------
			// extract the three biased exponents. 
			vhdl << tab << declare("EX", wE) << " <=  X" << range(wE+wF-1, wF) << ";" << endl;
			vhdl << tab << declare("EY", wE) << " <=  Y" << range(wE+wF-1, wF) << ";" << endl;
			vhdl << tab << declare("EZ", wE) << " <=  Z" << range(wE+wF-1, wF) << ";" << endl;
		
			// determine the max of the exponents
			// (the top bit of each (wE+1)-bit difference is used as the "less than" flag)
			vhdl << tab << declare("DEXY", wE+1) << " <=   ('0' & EX) - ('0' & EY);" << endl;
			vhdl << tab << declare("DEYZ", wE+1) << " <=   ('0' & EY) - ('0' & EZ);" << endl;
			vhdl << tab << declare("DEXZ", wE+1) << " <=   ('0' & EX) - ('0' & EZ);" << endl;
			vhdl << tab << declare("XltY") << " <=   DEXY("<< wE<<");" << endl;
			vhdl << tab << declare("YltZ") << " <=   DEYZ("<< wE<<");" << endl;
			vhdl << tab << declare("XltZ") << " <=   DEXZ("<< wE<<");" << endl;
		
			// rename the exponents  to A,B,C with A>=(B,C)
			vhdl << tab << declare("EA", wE)  << " <= " << endl
				  << tab << tab << "EZ when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "EY when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "EX; " << endl;
			vhdl << tab << declare("EB", wE)  << " <= " << endl
				  << tab << tab << "EX when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "EZ when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "EY; " << endl;
			vhdl << tab << declare("EC", wE)  << " <= " << endl
				  << tab << tab << "EY when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "EX when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "EZ; " << endl;
		
			//---------------------------------------------------------------------
			// Now recompute our two shift values -- they were already computed at cycle 0 but it is cheaper this way, otherwise we have to register, negate and mux them.
			manageCriticalPath(  target->adderDelay(wE-1) );

			// The exponent difference is multiplied by two (the appended '0'):
			// the shifted operands are squares.
			vhdl << tab << declare("fullShiftValB", wE) << " <=  (EA" << range(wE-2,0) << " - EB" << range(wE-2,0) << ") & '0' ; -- positive result, no overflow " << endl;
			vhdl << tab << declare("fullShiftValC", wE) << " <=  (EA" << range(wE-2,0) << " - EC" << range(wE-2,0) << ") & '0' ; -- positive result, no overflow " << endl;
	
			// Remember the timing at this point; it is restored for the C path below
			double cpfullShiftValC = getCriticalPath();
			//---------------------------------------------------------------------
			// This shifter is only built to query the width of its shift-amount input.
			// NOTE(review): it is neither added to oplist nor deleted in this
			// constructor, so it looks like it is leaked -- confirm and release it if so.
			Shifter* rightShifterDummy = new Shifter(target,wF+g+2, wF+g+2, Shifter::Right);
			int sizeRightShift = rightShifterDummy->getShiftInWidth(); 

			//-- Manage the shift value of the mantissa of B --------
			manageCriticalPath( target->localWireDelay() + target->lutDelay());
			// shiftedOutB is the OR of the shift-value bits that do not fit in the shifter input
			vhdl<<tab<<declare("shiftedOutB") << " <= "; 
			if (wE>sizeRightShift){
				for (int i=wE-1;i>=sizeRightShift;i--) {
					vhdl<< "fullShiftValB("<<i<<")";
					if (i>sizeRightShift)
						vhdl<< " or ";
				}
				vhdl<<";"<<endl;
			}
			else
				vhdl<<tab<<"'0';"<<endl; 
			
			// Adapt the shift value to the width expected by the shifter:
			// saturate to wF+g+1 when bits were shifted out, copy, or zero-extend
			if (wE>sizeRightShift) {
				manageCriticalPath( target->localWireDelay() + target->lutDelay());
				vhdl<<tab<<declare("shiftValB",sizeRightShift) << " <=  fullShiftValB("<< sizeRightShift-1<<" downto 0)"
					  << " when shiftedOutB='0'"<<endl
					  <<tab << tab << "    else CONV_STD_LOGIC_VECTOR("<<wF+g+1<<","<<sizeRightShift<<") ;" << endl; 
			}else if (wE==sizeRightShift) {
				vhdl<<tab<<declare("shiftValB",sizeRightShift) << " <= fullShiftValB;" << endl ;
			}else { //  wE< sizeRightShift
				vhdl<<tab<<declare("shiftValB",sizeRightShift) << " <= CONV_STD_LOGIC_VECTOR(0,"<<sizeRightShift-wE <<") & fullShiftValB;" <<	endl;
			}
			// Timing of shiftValB, used later to synchronise with the significand datapath
			double cpshiftValB = getCriticalPath();
			
			//-- Manage the shift value of the mantissa of C --------
			manageCriticalPath( target->localWireDelay() + target->lutDelay()); 
			//FIXME possible fixme needed when or does not fit on lut
			vhdl<<tab<<declare("shiftedOutC") << " <= "; 
			if (wE>sizeRightShift){
				for (int i=wE-1;i>=sizeRightShift;i--) {
					vhdl<< "fullShiftValC("<<i<<")";
					if (i>sizeRightShift)
						vhdl<< " or ";
				}
				vhdl<<";"<<endl;
			}
			else
				vhdl<<tab<<"'0';"<<endl; 
		
			// NOTE(review): this appears to rewind the timing to the point where
			// fullShiftValC was produced, presumably so that the C path is timed in
			// parallel with the B path rather than after it -- confirm.
			setCycleFromSignal("fullShiftValC",cpfullShiftValC);
			if (wE>sizeRightShift) {
				manageCriticalPath( target->localWireDelay() + target->lutDelay());//the mux delay
				vhdl<<tab<<declare("shiftValC",sizeRightShift) << " <= fullShiftValC("<< sizeRightShift-1<<" downto 0)"
					  << " when shiftedOutC='0'"<<endl
					  <<tab << tab << "    else CONV_STD_LOGIC_VECTOR("<<wF+g+1<<","<<sizeRightShift<<") ;" << endl; 
			} else if (wE==sizeRightShift) {
				vhdl<<tab<<declare("shiftValC",sizeRightShift) << " <= fullShiftValC;" << endl ;
			} else 	{ //  wE< sizeRightShift
				vhdl<<tab<<declare("shiftValC",sizeRightShift) << " <= CONV_STD_LOGIC_VECTOR(0,"<<sizeRightShift-wE <<") & fullShiftValC;" <<	endl;
			}

			// Back to cycle 0 for the significand datapath
			setCycle(0);
			//FIXME add inDelayMap for use within hierarchies of components 
			// Square the significands 
			// NOTE(review): this macro is defined inside the function body and is
			// never #undef'd in this block, so it stays visible in the rest of the
			// translation unit.
#define USE_SQUARER 1
#if  USE_SQUARER
			IntSquarer* mult = new IntSquarer(target,  1+ wF);
#else
			IntMultiplier* mult = new IntMultiplier(target, 1+ wF, 1+ wF);
#endif
			oplist.push_back(mult);
		
			// Significand of X with its implicit leading 1
			vhdl << tab << declare("mX", wF+1)  << " <= '1' & X" << range(wF-1, 0) << "; " << endl;
		
			inPortMap (mult, "X", "mX");
#if  !USE_SQUARER
			inPortMap (mult, "Y", "mX");
#endif
			outPortMap(mult, "R", "mX2");
			vhdl << instance(mult, "multx");
	
			// Significand of Y with its implicit leading 1
			vhdl << tab << declare("mY", wF+1)  << " <= '1' & Y" << range(wF-1, 0) << "; " << endl;

			inPortMap (mult, "X", "mY");
#if  !USE_SQUARER	
			inPortMap (mult, "Y", "mY");
#endif
			outPortMap(mult, "R", "mY2");
			vhdl << instance(mult, "multy");
		
			// Significand of Z with its implicit leading 1
			vhdl << tab << declare("mZ", wF+1)  << " <= '1' & Z" << range(wF-1, 0) << "; " << endl;
		
			inPortMap (mult, "X", "mZ");
#if  !USE_SQUARER	
			inPortMap (mult, "Y", "mZ");
#endif
			outPortMap(mult, "R", "mZ2");
			vhdl << instance(mult, "multz");

			syncCycleFromSignal("mZ2", false);
			setCriticalPath(mult->getOutputDelay("R"));
			// truncate the three results to wF+g+2
			int prodsize = 2+2*wF;
			vhdl << tab << declare("X2t", wF+g+2)  << " <= mX2" << range(prodsize-1, prodsize - wF-g-2) << "; " << endl;
			vhdl << tab << declare("Y2t", wF+g+2)  << " <= mY2" << range(prodsize-1, prodsize - wF-g-2) << "; " << endl;
			vhdl << tab << declare("Z2t", wF+g+2)  << " <= mZ2" << range(prodsize-1, prodsize - wF-g-2) << "; " << endl;
	
			// Now we have our three FP squares, we rename them to A,B,C with A>=(B,C) 
			// only 3 3-muxes
			// (same selection conditions as for EA, EB, EC above)
			manageCriticalPath(target->localWireDelay(wF) + target->lutDelay());  
			vhdl << tab << declare("MA", wF+g+2)  << " <= " << endl
				  << tab << tab << "Z2t when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "Y2t when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "X2t; " << endl;
			vhdl << tab << declare("MB", wF+g+2)  << " <= " << endl
				  << tab << tab << "X2t when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "Z2t when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "Y2t; " << endl;
			vhdl << tab << declare("MC", wF+g+2)  << " <= " << endl
				  << tab << tab << "Y2t when (XltZ='1') and (YltZ='1')  else " << endl
				  << tab << tab << "X2t when (XltY='1') and (YltZ='0')  else " << endl
				  << tab << tab << "Z2t; " << endl;
			
			//Synchronize exponent and significand datapath
			syncCycleFromSignal("shiftValB", cpshiftValB, false);

			// B and C right shifters are the same
			Shifter* rightShifter = new Shifter(target,wF+g+2, wF+g+2, Shifter::Right, inDelayMap("X",target->localWireDelay()+getCriticalPath())); 
			oplist.push_back(rightShifter);

			inPortMap  (rightShifter, "X", "MB");
			inPortMap  (rightShifter, "S", "shiftValB");
			outPortMap (rightShifter, "R","shiftedB");
			vhdl << instance(rightShifter, "ShifterForB");

			inPortMap  (rightShifter, "X", "MC");
			inPortMap  (rightShifter, "S", "shiftValC");
			outPortMap (rightShifter, "R","shiftedC");
			vhdl << instance(rightShifter, "ShifterForC");
		
			// superbly ignore the bits that are shifted out
			syncCycleFromSignal("shiftedB", false);
			setCriticalPath( rightShifter->getOutputDelay("R"));
			
			// Keep only the wF+g+2 most significant bits of each shifter output
			int shiftedB_size = getSignalByName("shiftedB")->width();
			vhdl << tab << declare("alignedB", wF+g+2)  << " <= shiftedB" << range(shiftedB_size-1, shiftedB_size -(wF+g+2)) << "; " << endl;
			vhdl << tab << declare("alignedC", wF+g+2)  << " <= shiftedC" << range(shiftedB_size-1, shiftedB_size -(wF+g+2)) << "; " << endl;
		
			// Two extra leading bits so that the sum of three terms cannot overflow
			vhdl << tab << declare("paddedA", wF+g+4)  << " <= \"00\" & MA; " << endl;
			vhdl << tab << declare("paddedB", wF+g+4)  << " <= \"00\" & alignedB; " << endl;
			vhdl << tab << declare("paddedC", wF+g+4)  << " <= \"00\" & alignedC; " << endl;
		
			IntMultiAdder* adder = new IntMultiAdder(target,wF+g+4, 3, inDelayMap("X0", target->localWireDelay() + getCriticalPath() ));
			oplist.push_back(adder);

			inPortMap   (adder, "X0", "paddedA");
			inPortMap   (adder, "X1", "paddedB");
			inPortMap   (adder, "X2", "paddedC");
			inPortMapCst(adder, "Cin", "'0'"); // a 1 would compensate the two truncations in the worst case -- to explore
			outPortMap  (adder, "R","sum");
			vhdl << instance(adder, "adder1");

			syncCycleFromSignal("sum", false);
			setCriticalPath(adder->getOutputDelay("R"));

			manageCriticalPath(target->localWireDelay() + target->lutDelay());
			// Possible 3-bit normalisation, with a truncation
			// (the window of wF+g bits is chosen from the position of the leading 1
			// among the three top bits of sum)
			vhdl << tab << declare("finalFraction", wF+g)  << " <= " << endl
				  << tab << tab << "sum" << range(wF+g+2,3) << "   when sum(" << wF+g+3 << ")='1'    else " << endl
				  << tab << tab << "sum" << range(wF+g+1, 2) <<  "   when (sum" << range(wF+g+3, wF+g+2) << "=\"01\")     else " << endl
				  << tab << tab << "sum" << range(wF+g, 1) <<  "   when (sum" << range(wF+g+3, wF+g+1) << "=\"001\")     else " << endl
				  << tab << tab << "sum" << range(wF+g-1, 0) << "; " << endl;

			// Exponent datapath. We have to compute 2*EA - bias + an update corresponding to the normalisation
			// since (1.m)*(1.m) = xx.xxxxxx sum is xxxx.xxxxxx
			// All the following ignores overflows, infinities, zeroes, etc for the sake of simplicity.
			manageCriticalPath(target->localWireDelay() + target->lutDelay());
			int bias = (1<<(wE-1))-1;
			// Value subtracted from 2*EA: the bias minus the normalisation shift (3, 2, 1 or 0)
			vhdl << tab << declare("exponentUpdate", wE+1)  << " <= " << endl
				  << tab << tab << "CONV_STD_LOGIC_VECTOR(" << bias-3 << ", "<< wE+1 <<")  when sum(" << wF+g+3 << ")='1'    else " << endl
				  << tab << tab << "CONV_STD_LOGIC_VECTOR(" << bias-2 << ", "<< wE+1 <<")  when (sum" << range(wF+g+3, wF+g+2) << "=\"01\")     else " << endl
				  << tab << tab << "CONV_STD_LOGIC_VECTOR(" << bias-1 << ", "<< wE+1 <<")  when (sum" << range(wF+g+3, wF+g+1) << "=\"001\")     else " << endl
				  << tab << tab << "CONV_STD_LOGIC_VECTOR(" << bias   << ", "<< wE+1 <<")  ; " << endl;
		
			manageCriticalPath( target->localWireDelay() + target->adderDelay(wE+1));
			// (EA & '0') is 2*EA on wE+1 bits
			vhdl << tab << declare("finalExp", wE+1)  << " <= (EA & '0') - exponentUpdate ; " << endl;
			
			// Rounding: exponent and fraction (guard bits dropped) are concatenated
			// and incremented through the adder's carry-in, Y being all zeroes
			IntAdder *roundingAdder = new IntAdder(target, wE +1 + wF);
			oplist.push_back(roundingAdder);
			
			vhdl << tab << declare("roundingOp",wE+1 + wF) << "<= finalExp & finalFraction"<<range(wF+g-1,g)<<";"<<endl;
			
			inPortMap     ( roundingAdder, "X", "roundingOp");
			inPortMapCst  ( roundingAdder, "Y", zg(wE+1+wF));
			inPortMapCst   ( roundingAdder, "Cin", "'1'");
			outPortMap    ( roundingAdder, "R", "expFrac");
			vhdl << tab << instance( roundingAdder, "RoundingAdder"); 
			syncCycleFromSignal("expFrac");
			setCriticalPath( roundingAdder->getOutputDelay("R"));
				
			//TODO 		
			// rExc is "10" when the top bit of expFrac is set and "01" otherwise;
			// the bit between rExc and expFrac in R is hard-wired to '0'
			vhdl << tab << declare("rExc",2) << " <= \"01\" when  expFrac"<<of(wE+wF)<<"='0' else \"10\";"<<endl;
			vhdl << tab << "R <= rExc & '0' & expFrac"<<range(wE+1 + wF-2,0)<<";"<<endl;
		}
	
	}