Пример #1
0
/*
 * Tag array wordline delay (see section 6.3 of tech report).
 *
 * Models two cascaded stages: the inverter feeding the wordline driver and
 * the driver charging the tag wordline. Returns the sum of both stage delays
 * and stores the rise time handed to the following stage in *outrisetime.
 *
 *   C, A          cache size and associativity, used to derive the tag width
 *   Ntspd, Ntwl   tag array organisation parameters scaling the wordline load
 *   inrisetime    rise time at the input of the first stage
 */
double SIM_wordline_tag_delay(int C, int A, int Ntspd, int Ntwl, double inrisetime, double *outrisetime)
{
	/* switching threshold shared by both horowitz() evaluations */
	const double vs = PARM(VSINV);

	/* tag width: address bits + 2 - log2(C) + log2(A) */
	const int tag_width = PARM(ADDRESS_BITS)+2-(int)logtwo((double)C)+(int)logtwo((double)A);

	/* drain capacitances of the decoder inverter appear in both stage loads */
	const double nmos_drain = SIM_draincap(Wdecinvn,NCH,1);
	const double pmos_drain = SIM_draincap(Wdecinvp,PCH,1);

	double load, wire_res, pass_gate, tau;
	double stage1, stage2, stage2_input;

	/* stage 1: inverter loaded by its own drains plus the next gate */
	load = nmos_drain + pmos_drain +
		SIM_gatecap(Wdecinvn+Wdecinvp,20.0);
	tau = SIM_transreson(Wdecinvn,NCH,1)*load;
	stage1 = SIM_horowitz(inrisetime,tau,vs,vs,RISE);
	stage2_input = stage1/(1.0-vs);

	/* stage 2: distributed wordline (two pass gates + metal per column) */
	pass_gate = SIM_gatecappass(Wmemcella,(BitWidth-2*Wmemcella)/2.0);
	load = (pass_gate+
			pass_gate+
			Cwordmetal)*tag_width*A*Ntspd/Ntwl+
		nmos_drain + pmos_drain;
	wire_res = Rwordmetal*tag_width*A*Ntspd/(2*Ntwl);
	tau = (SIM_transreson(Wdecinvp,PCH,1)+wire_res)*load;
	stage2 = SIM_horowitz(stage2_input,tau,vs,vs,FALL);

	*outrisetime = stage2/vs;
	return(stage1+stage2);
}
Пример #2
0
void tsim_cache_power_model::calculate_decoder_power(){
  // based on Wattch's model
  double total_cap = 0.0, temp_cap = 0.0;
  int tot_ports = 0, decode_bits = 0;

  // total ports
  tot_ports = num_read_ports + num_write_ports;

  // number of decode bits needed
  decode_bits = (int)ceil(logtwo(num_rows));

  //TODO: Adjust this equation after figuring what
  //WATTCH is doing : add both drain and gate caps here
  //setting it to some random value for now
  temp_cap = 200e-15;

  total_cap += tot_ports * decode_bits * temp_cap;

  // TODO: ignoring the NOR/NAND part for the first cut
  // add this model later
  tot_dec_cap = total_cap;
  
  decoder_power = get_act_factor() * total_cap * get_power_factor(); 

}
Пример #3
0
/* Print `prompt` on stdout and read one decimal integer from stdin into
 * *value. Returns 1 when a value was read, 0 on EOF or malformed input. */
static int prompt_for_int(const char *prompt, int *value)
{
  printf("%s", prompt);
  return scanf("%d", value) == 1;
}

/* Print `header`, then ask for the entry count, the width and the read/write
 * port counts of a simple array. Returns 1 only if all four values were read. */
static int read_array_params(const char *header, const char *entries_prompt,
                             const char *width_prompt, int *entries,
                             int *width, int *rports, int *wports)
{
  printf("%s", header);
  return prompt_for_int(entries_prompt, entries)
      && prompt_for_int(width_prompt, width)
      && prompt_for_int("Number of Read Ports: ", rports)
      && prompt_for_int("Number of Write Ports: ", wports);
}

/* Report unusable input on stderr; returns the process exit status to use. */
static int report_bad_input(void)
{
  fprintf(stderr, "Invalid or missing integer input\n");
  return 1;
}

/*
 * Interactive front end for the power models.
 *
 * Shows a menu, reads the parameters of the selected structure from stdin
 * and prints the maximum (full-switching) power figures. Returns 0 on
 * success and 1 when any value cannot be read as an integer. Previously the
 * scanf() results were ignored, so bad input left the variables
 * uninitialised.
 */
int main(void)
{
  int data_width;
  int rports, wports;
  int switch_arg;

  printf("1. Simple Register File\n");
  printf("2. Simple Cache Structure\n");
  printf("3. Simple CAM Structure\n");
  printf("4. Complex Cache (Auto-Sized)\n");

  if (scanf("%d",&switch_arg) != 1)
    return report_bad_input();

  printf("note these are MAX powers (assuming full switching)\n");

  switch(switch_arg){
    case 1:
      {
        /* register file: last model argument (cache flag) is 0 */
        if (!read_array_params("Enter Reg File Params:\n",
                               "Number of Registers: ", "Data Width: ",
                               &num_regs, &data_width, &rports, &wports))
          return report_bad_input();

        printf("%d-entryx%d-width,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,simple_array_power(num_regs,data_width,rports,wports,0));
        printf(" decode_power (W): %f\n",simple_array_decoder_power(num_regs,data_width,rports,wports,0));
        printf(" wordline_power (W): %f\n",simple_array_wordline_power(num_regs,data_width,rports,wports,0));
        printf(" bitline_power (W): %f\n",simple_array_bitline_power(num_regs,data_width,rports,wports,0));
        break;
      }

    case 2:
      {
        /* simple cache: same array model with the cache flag set to 1 */
        if (!read_array_params("Enter Cache Params:\n",
                               "Size of cache: ", "Data Width: ",
                               &num_regs, &data_width, &rports, &wports))
          return report_bad_input();

        printf("%d-entryx%d-width,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,simple_array_power(num_regs,data_width,rports,wports,1));
        printf(" decode_power (W): %f\n",simple_array_decoder_power(num_regs,data_width,rports,wports,1));
        printf(" wordline_power (W): %f\n",simple_array_wordline_power(num_regs,data_width,rports,wports,1));
        printf(" bitline_power (W): %f\n",simple_array_bitline_power(num_regs,data_width,rports,wports,1));
        break;
      }

    case 3:
      {
        if (!read_array_params("Enter CAM Params:\n",
                               "Entries in CAM: ", "Tag Width: ",
                               &num_regs, &data_width, &rports, &wports))
          return report_bad_input();

        printf("%d-entryx%d-tagwidth,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,cam_array(num_regs,data_width,rports,wports));
        printf(" tagdrive_power (W): %f\n",cam_tagdrive(num_regs,data_width,rports,wports));
        printf(" tagmatch_power (W): %f\n",cam_tagmatch(num_regs,data_width,rports,wports));
        break;
      }

    case 4:
      {
        int nsets, bsize, assoc,res_memport, tagsize;
        int ndwl, ndbl, nspd, ntwl, ntbl, ntspd, c,b,a,cache, rowsb, colsb, trowsb, tcolsb;
        double predeclength, wordlinelength, bitlinelength;
        double cache_decoder, cache_wordline, cache_bitline,
          cache_senseamp, cache_tagarray, total_cache_power;
        time_result_type time_result;
        time_parameter_type time_parameters;
        int va_size = 48;

        printf("Enter Cache Params:\n");
        if (!prompt_for_int("Number of Sets in cache: ", &nsets)
            || !prompt_for_int("Block Size (bytes): ", &bsize)
            || !prompt_for_int("Associativity: ", &assoc)
            || !prompt_for_int("Number of Memory Ports: ", &res_memport))
          return report_bad_input();

        printf("note tagarray size is estimated based on assuming 48-bit virtual addresses\n");

        cache = 1;

        time_parameters.cache_size = nsets * bsize * assoc; /* C */
        time_parameters.block_size = bsize; /* B */
        time_parameters.associativity = assoc; /* A */
        time_parameters.number_of_sets = nsets; /* C/(B*A) */

        /* let the timing model pick the array partitioning */
        calculate_time(&time_result,&time_parameters);
        output_data(&time_result,&time_parameters);

        ndwl=time_result.best_Ndwl;
        ndbl=time_result.best_Ndbl;
        nspd=time_result.best_Nspd;
        ntwl=time_result.best_Ntwl;
        ntbl=time_result.best_Ntbl;
        ntspd=time_result.best_Ntspd;
        c = time_parameters.cache_size;
        b = time_parameters.block_size;
        a = time_parameters.associativity;

        /* geometry of one data sub-array */
        rowsb = c/(8*b*a*ndbl*nspd);
        colsb = 8*b*a*nspd/ndwl;

        /* geometry of one tag sub-array; tag width derived from va_size */
        tagsize = va_size - ((int)logtwo(nsets) + (int)logtwo(bsize));
        trowsb = c/(8*b*a*ntbl*ntspd);
        tcolsb = a * (tagsize + 1 + 6) * ntspd/ntwl;

        predeclength = rowsb * (RegCellHeight + WordlineSpacing);
        wordlinelength = colsb *  (RegCellWidth + BitlineSpacing);
        bitlinelength = rowsb * (RegCellHeight + WordlineSpacing);

        cache_decoder = res_memport*ndwl*ndbl*array_decoder_power(rowsb,colsb,predeclength,1,1,cache);
        cache_wordline = res_memport*ndwl*ndbl*array_wordline_power(rowsb,colsb,wordlinelength,1,1,cache);
        cache_bitline = res_memport*ndwl*ndbl*array_bitline_power(rowsb,colsb,bitlinelength,1,1,cache);
        cache_senseamp = res_memport*ndwl*ndbl*senseamp_power(colsb);
        cache_tagarray = res_memport*ntwl*ntbl*(simple_array_power(trowsb,tcolsb,1,1,cache));

        total_cache_power = cache_decoder + cache_wordline + cache_bitline + cache_senseamp + cache_tagarray;

        /* NOTE(review): c is nsets*bsize*assoc with bsize in bytes, so the
         * value printed under the "KB" label is a byte count. The text is
         * left unchanged in case the output is parsed elsewhere - confirm. */
        fprintf(stderr,"%d KB %d-way cache (%d-byte block size):\n",c,a,b);
        fprintf(stderr,"ndwl == %d, ndbl == %d, nspd == %d\n",ndwl,ndbl,nspd);
        fprintf(stderr,"%d sets of %d rows x %d cols\n",ndwl*ndbl,rowsb,colsb);
        fprintf(stderr,"tagsize == %d\n",tagsize);
        fprintf(stderr,"\nntwl == %d, ntbl == %d, ntspd == %d\n",ntwl,ntbl,ntspd);
        fprintf(stderr,"%d sets of %d rows x %d cols\n",ntwl*ntbl,trowsb,tcolsb);

        printf("Total Power (W): %f\n",total_cache_power);
        printf(" decode_power (W): %f\n",cache_decoder);
        printf(" wordline_power (W): %f\n",cache_wordline);
        printf(" bitline_power (W): %f\n",cache_bitline);
        printf(" senseamp_power (W): %f\n",cache_senseamp);
        printf(" tagarray_power (W): %f\n",cache_tagarray);
        break;
      }

    default:
      /* unknown menu entry: nothing to compute */
      break;
  }

  return 0;
}
Пример #4
0
/*
 * Library entry point: validates a cache description, runs the timing/area
 * model and returns the combined result.
 *
 *   cache_size, line_size, associativity   C, B and A of the cache (bytes)
 *   rw_ports, excl_read_ports,
 *   excl_write_ports,
 *   single_ended_read_ports                port counts (RWP, ERP, EWP, NSER)
 *   banks                                  number of subbanks, power of two
 *   tech_node                              feature size, 0 < tech <= 0.8
 *   output_width                           bits delivered per access
 *   specific_tag, tag_width                force an explicit tag width
 *   access_mode                            0 normal, 1 sequential, 2 fast
 *   pure_sram                              forwarded to parameters.pure_sram
 *
 * On any validation failure a message is printed and `endresult` is returned
 * with only result.subbanks (0.0) initialised.
 */
total_result_type
cacti_interface (int cache_size,
		 int line_size,
		 int associativity,
		 int rw_ports,
		 int excl_read_ports,
		 int excl_write_ports,
		 int single_ended_read_ports,
		 int banks,
		 double tech_node,
		 int output_width,
		 int specific_tag,
		 int tag_width, int access_mode, int pure_sram)
{
  int C, B, A, ERP, EWP, RWP, NSER;
  double tech;
  double logbanks;
  double logbanksfloor;
  int seq_access = 0;
  int fast_access = 0;
  int bits_output = output_width;
  double NSubbanks = (double) banks;

  double ratioofbankstoports;

  extern int force_tag, force_tag_size;

  total_result_type endresult;
  result_type result;
  arearesult_type arearesult;
  area_type arearesult_subbanked;
  parameter_type parameters;

  endresult.result.subbanks = 0.0;

  /*dt: make sure we're using some simple leakage reduction */
  dualVt = FALSE;

//#ifdef XCACTI
  //parameters.latchsa    = 0;
  //parameters.ignore_tag = 0;
//#endif
  force_tag = 0;
  parameters.force_tag = 0;

  if (specific_tag)
    {
      force_tag = 1;
      force_tag_size = tag_width;
      parameters.force_tag = 1;
      parameters.tag_size = tag_width;
      //parameters.ignore_tag = 1;
    }

  /* any other access_mode value leaves both flags at 0 */
  switch (access_mode)
    {
    case 0:
      seq_access = fast_access = FALSE;
      break;
    case 1:
      seq_access = TRUE;
      fast_access = FALSE;
      break;
    case 2:
      seq_access = FALSE;
      fast_access = TRUE;
      break;
    }

  C = cache_size;
  A = associativity;
  B = line_size;
  if ((B < 1))
    {
      printf ("Block size must >=1\n");
      return endresult;
    }

  if ((B * 8 < bits_output))
    {
      printf ("Block size must be at least %d\n", bits_output / 8);
      return endresult;
    }

  tech = tech_node;
  if ((tech <= 0))
    {
      printf ("Feature size must be > 0\n");
      return endresult;
    }
  if ((tech > 0.8))
    {
      printf ("Feature size must be <= 0.80 (um)\n");
      return endresult;
    }

  /* The command-line heritage code switched on an argument count that was
     hard-wired to 9 here; only that case was reachable, so the port counts
     are taken straight from the arguments. */
  RWP = rw_ports;
  ERP = excl_read_ports;
  EWP = excl_write_ports;
  NSER = single_ended_read_ports;

  if ((RWP < 0) || (EWP < 0) || (ERP < 0))
    {
      printf ("Ports must >=0\n");
      return endresult;
    }
  if (RWP > 2)
    {
      printf ("Maximum of 2 read/write ports\n");
      return endresult;
    }
  if ((RWP + ERP + EWP) < 1)
    {
      printf ("Must have at least one port\n");
      return endresult;
    }

  if (NSubbanks < 1)
    {
      printf
	("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      return endresult;
    }

  /* power-of-two test: log2 must be integral */
  logbanks = logtwo ((double) (NSubbanks));
  logbanksfloor = floor (logbanks);

  if (logbanks > logbanksfloor)
    {
      printf
	("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      return endresult;
    }

  /* a cache made of a single set is treated as fully associative */
  if (C == B * A)
    {
      parameters.fully_assoc = 1;
      A = C / B;
    }
  else
    {
      parameters.fully_assoc = 0;
    }

  /* from here on C is the capacity of one subbank */
  C = cache_size / ((int) (NSubbanks));
  if ((C < 64))
    {
      printf ("Cache size must >=64\n");
      return endresult;
    }

  /* A is used as a divisor below (C / (B * A)); reject zero or negative
     associativity instead of dividing by zero. */
  if (A < 1)
    {
      printf ("Associativity must >= 1\n");
      return endresult;
    }

  //A = C/B;
  if (A > 16)
    {
      parameters.fully_assoc = 1;
      A = 16;
    }

  /* Legacy associativity handling is intentionally disabled: it mapped
     associativity 0 (or A == C/B) to fully associative, required A to be a
     power of two no larger than 32, and fell back to fully associative when
     C/(B*A) <= 1. */

  printf ("\n########### Printing input for params for testing...###");
  printf ("\n C = %d, B = %d, A = %d", C, B, A);
  printf ("\n RWP = %d, ERP = %d, EWP = %d, NSER = %d", RWP, ERP, EWP, NSER);
  printf
    ("\n banks = %d, tech = %f, bits_output = %d, fast_access = %d, pure_sram = %d",
     banks, tech, bits_output, fast_access, pure_sram);
  printf ("\n force_tag = %d, force_tag_size = %d", force_tag,
	  force_tag_size);
  printf ("\n #################\n");

  parameters.cache_size = C;
  parameters.block_size = B;

  parameters.nr_bits_out = bits_output;
  /*dt: testing sequential access mode */
  if (seq_access)
    {
      parameters.tag_associativity = A;
      parameters.data_associativity = 1;
      parameters.sequential_access = 1;
    }
  else
    {
      parameters.tag_associativity = parameters.data_associativity = A;
      parameters.sequential_access = 0;
    }
  if (fast_access)
    {
      parameters.fast_access = 1;
    }
  else
    {
      parameters.fast_access = 0;
    }
  parameters.num_readwrite_ports = RWP;
  parameters.num_read_ports = ERP;
  parameters.num_write_ports = EWP;
  parameters.NSubbanks = banks;
  parameters.num_single_ended_read_ports = NSER;
  parameters.number_of_sets = C / (B * A);
  parameters.fudgefactor = .8 / tech;
  parameters.tech_size = (double) tech;
  parameters.pure_sram = pure_sram;
  //If multiple banks and multiple ports are specified, then if number of banks/total number
  //of ports > 1 then assume that the multiple ports are implemented via the multiple banks.
  //Also assume that each bank has only 1 RWP port. There are some problems with this logic that
  //will be fixed in v5.0
  ratioofbankstoports = NSubbanks / (RWP + ERP + EWP);
  if (ratioofbankstoports >= 1.0)
    {
      //We assume that each bank has 1 RWP port.
      parameters.num_readwrite_ports = 1;
      parameters.num_read_ports = 0;
      parameters.num_write_ports = 0;
      parameters.num_single_ended_read_ports = 0;
    }

  if (parameters.number_of_sets < 1)
    {
      printf ("Less than one set...\n");
      return endresult;
    }

  /* supply voltage derived from the scaling factor, clamped to [0.7, 5.0] */
  parameters.VddPow = 4.5 / (pow (parameters.fudgefactor, (2.0 / 3.0)));
  if (parameters.VddPow < 0.7)
    parameters.VddPow = 0.7;
  if (parameters.VddPow > 5.0)
    parameters.VddPow = 5.0;

  printf ("\n##### Printing parameters for testing...#####\n");
  output_params (&parameters);

  init_tech_params_default_process ();	//v4.1: First initialize all tech variables
  //to 0.8 micron values. init_tech_params function below then reinitializes tech variables to
  //given process values
  init_tech_params (parameters.tech_size);
  calculate_time (&result, &arearesult, &arearesult_subbanked, &parameters,
		  &NSubbanks);

  //v4.1: No longer using calculate_area function as area has already been
  //computed for the given tech node; each scaled area below is therefore
  //just height * width converted to mm^2.

  arearesult.dataarray_area.scaled_area =
    arearesult.dataarray_area.height * arearesult.dataarray_area.width *
    CONVERT_TO_MMSQUARE;
  arearesult.datapredecode_area.scaled_area =
    arearesult.datapredecode_area.height *
    arearesult.datapredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datacolmuxpredecode_area.scaled_area =
    arearesult.datacolmuxpredecode_area.height *
    arearesult.datacolmuxpredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datacolmuxpostdecode_area.scaled_area =
    arearesult.datacolmuxpostdecode_area.height *
    arearesult.datacolmuxpostdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datawritesig_area.scaled_area =
    (parameters.num_readwrite_ports + parameters.num_read_ports +
     parameters.num_write_ports) * arearesult.datawritesig_area.height *
    arearesult.datawritesig_area.width * CONVERT_TO_MMSQUARE;

  arearesult.tagarray_area.scaled_area =
    arearesult.tagarray_area.height * arearesult.tagarray_area.width *
    CONVERT_TO_MMSQUARE;
  arearesult.tagpredecode_area.scaled_area =
    arearesult.tagpredecode_area.height * arearesult.tagpredecode_area.width *
    CONVERT_TO_MMSQUARE;
  arearesult.tagcolmuxpredecode_area.scaled_area =
    arearesult.tagcolmuxpredecode_area.height *
    arearesult.tagcolmuxpredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagcolmuxpostdecode_area.scaled_area =
    arearesult.tagcolmuxpostdecode_area.height *
    arearesult.tagcolmuxpostdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagoutdrvdecode_area.scaled_area =
    arearesult.tagoutdrvdecode_area.height *
    arearesult.tagoutdrvdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagoutdrvsig_area.scaled_area =
    (parameters.num_readwrite_ports + parameters.num_read_ports +
     parameters.num_write_ports) * arearesult.tagoutdrvsig_area.height *
    arearesult.tagoutdrvsig_area.width * CONVERT_TO_MMSQUARE;

  /* percentages and efficiencies use totalarea before it is rescaled below */
  arearesult.perc_data =
    100 * area_all_dataramcells / (arearesult.totalarea *
				   CONVERT_TO_MMSQUARE);
  arearesult.perc_tag =
    100 * area_all_tagramcells / (arearesult.totalarea * CONVERT_TO_MMSQUARE);
  arearesult.perc_cont =
    100 * (arearesult.totalarea * CONVERT_TO_MMSQUARE -
	   area_all_dataramcells -
	   area_all_tagramcells) / (arearesult.totalarea *
				    CONVERT_TO_MMSQUARE);
  arearesult.sub_eff =
    (area_all_dataramcells +
     area_all_tagramcells) * 100 / (arearesult.totalarea / 100000000.0);
  arearesult.total_eff =
    (NSubbanks) * (area_all_dataramcells +
		   area_all_tagramcells) * 100 /
    (arearesult_subbanked.height * arearesult_subbanked.width *
     CONVERT_TO_MMSQUARE);
  arearesult.totalarea *= CONVERT_TO_MMSQUARE;
  arearesult.subbankarea =
    arearesult_subbanked.height * arearesult_subbanked.width *
    CONVERT_TO_MMSQUARE;

  /* Replace negative bitline delays with a tiny positive value. The original
     wrote `10 ^ -12` / `10 ^ -13`, which in C is a bitwise XOR (yielding -2
     and -3), not a power of ten. */
  if (result.bitline_delay_data < 0.0)
    {
      result.bitline_delay_data = 1e-12;
    }
  if (result.bitline_delay_tag < 0.0)
    {
      result.bitline_delay_tag = 1e-13;
    }
  endresult.result = result;
  endresult.result.subbanks = banks;
  endresult.area = arearesult;
  endresult.params = parameters;

  return endresult;
}
Пример #5
0
/* Exit with a message unless a block of B bytes holds bits_output bits. */
static void
input_data_check_block_width (int B, int bits_output)
{
  if ((B * 8 < bits_output))
    {
      printf ("Block size must be at least %d\n", bits_output / 8);
      exit (1);
    }
}

/* Exit with a message unless the feature size lies in (0, 0.8] um. */
static void
input_data_check_tech (double tech)
{
  if ((tech <= 0))
    {
      printf ("Feature size must be > 0\n");
      exit (1);
    }
  if ((tech > 0.8))
    {
      printf ("Feature size must be <= 0.80 (um)\n");
      exit (1);
    }
}

/* Exit with a message unless NSubbanks is a power of two >= 1. */
static void
input_data_check_subbanks (int NSubbanks)
{
  double logbanks, logbanksfloor;

  if (NSubbanks < 1)
    {
      printf
	("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      exit (1);
    }

  /* power-of-two test: log2 must be integral */
  logbanks = logtwo ((double) (NSubbanks));
  logbanksfloor = floor (logbanks);

  if (logbanks > logbanksfloor)
    {
      printf
	("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      exit (1);
    }
}

/*
 * Validate the command-line description of a cache.
 *
 * Accepted forms (argc counts the program name):
 *   argc == 6 : C B A TECH NSubbanks
 *   argc == 9 : C B A TECH RWP ERP EWP NSubbanks
 *   argc == 15: extended form; only argv[8] (NSubbanks) and argv[9] (TECH)
 *               are examined here in addition to C, B and A
 * A may be a number, "FA"/"0" (fully associative) or "DM" (direct mapped).
 *
 * Prints a message and calls exit(1) on the first invalid value; returns OK
 * otherwise. The parsed values are only checked, not stored.
 */
int
input_data (int argc, char *argv[])
{
  int C, B, A, NSubbanks, fully_assoc;
  int total_size;
  double tech;
  double assoc;
  double assocfloor;
  int bits_output = 64;

  if ((argc != 6) && (argc != 9) && (argc != 15))
    {
      printf ("Cmd-line parameters: C B A TECH NSubbanks\n");
      printf ("                 OR: C B A TECH RWP ERP EWP NSubbanks\n");
      exit (1);
    }

  B = atoi (argv[2]);
  if ((B < 1))
    {
      printf ("Block size must >=1\n");
      exit (1);
    }

  if (argc == 9)
    {
      int RWP, ERP, EWP;

      input_data_check_block_width (B, bits_output);

      tech = atof (argv[4]);
      input_data_check_tech (tech);

      RWP = atoi (argv[5]);
      ERP = atoi (argv[6]);
      EWP = atoi (argv[7]);

      if ((RWP < 0) || (EWP < 0) || (ERP < 0))
	{
	  printf ("Ports must >=0\n");
	  exit (1);
	}
      if (RWP > 2)
	{
	  printf ("Maximum of 2 read/write ports\n");
	  exit (1);
	}
      if ((RWP + ERP + EWP) < 1)
	{
	  printf ("Must have at least one port\n");
	  exit (1);
	}

      NSubbanks = atoi (argv[8]);
      input_data_check_subbanks (NSubbanks);
    }
  else if (argc == 6)
    {
      /* short form: a single read/write port is implied */
      input_data_check_block_width (B, bits_output);

      tech = atof (argv[4]);
      input_data_check_tech (tech);

      NSubbanks = atoi (argv[5]);
      input_data_check_subbanks (NSubbanks);
    }
  else
    {
      tech = atof (argv[9]);
      NSubbanks = atoi (argv[8]);
      input_data_check_tech (tech);

      /* NSubbanks is a divisor below; this form previously skipped the
         check and could divide by zero. */
      if (NSubbanks < 1)
	{
	  printf ("Number of subbanks should be greater than or equal to 1\n");
	  exit (1);
	}
    }

  /* the size limit applies to the whole cache, not to one subbank */
  total_size = atoi (argv[1]);
  if (total_size < 64)
    {
      printf ("Cache size must >=64\n");
      exit (1);
    }
  C = total_size / NSubbanks;

  if ((strcmp (argv[3], "FA") == 0) || (argv[3][0] == '0'))
    {
      A = C / B;
      fully_assoc = 1;
    }
  else
    {
      if (strcmp (argv[3], "DM") == 0)
	{
	  A = 1;
	  fully_assoc = 0;
	}
      else
	{
	  fully_assoc = 0;
	  A = atoi (argv[3]);
	  /* atoi() yields 0 for non-numeric text; 0 must be rejected here
	     because A is a divisor in the set-count check below. */
	  if ((A < 1) || (A > 16))
	    {
	      printf
		("Associativity must be  1,2,4,8,16 or 0(fully associative)\n");
	      exit (1);
	    }
	  assoc = logtwo ((double) (A));
	  assocfloor = floor (assoc);

	  if (assoc > assocfloor)
	    {
	      printf ("Associativity should be a power of 2\n");
	      exit (1);
	    }

	}
    }

  if (!fully_assoc && C / (B * A) < 1)
    {
      printf
	("Number of sets is less than 1:\n  Need to either increase cache size, or decrease associativity or block size\n  (or use fully associative cache)\n");
      exit (1);
    }

  return (OK);
}
Пример #6
0
/*
 * Version - 6.0
 *
 * Perform an exhaustive search across different bank organizations,
 * router configurations, grid organizations, and wire models and
 * find an optimal NUCA organization.
 * For each bank count value:
 * 1. The optimal bank organization is calculated (solve()).
 * 2. For each bank organization, different NUCA organizations are
 *    evaluated using various router configurations, grid organizations,
 *    and wire models.
 * 3. One candidate per (wire type, router type) pair is recorded in
 *    nuca_list, holding the grid shape with the lowest access latency.
 * Finally contention statistics are added and find_optimal_nuca()
 * selects the NUCA configuration, which is printed.
 *
 * Side effects visible in this function: g_ip->nuca_cache_sz and
 * g_ip->cache_sz are overwritten (cache_sz ends up as the size of one bank
 * of the chosen organization), MIN_BANKSIZE may be scaled, wt_min/wt_max and
 * the wire_vertical/wire_horizontal arrays are assigned.
 */
  void
Nuca::sim_nuca()
{
  /* temp variables */
  int it, ro, wr;
  int num_cyc;
  unsigned int i, j;
  unsigned int r, c;
  int l2_c;
  int bank_count = 0;
  uca_org_t ures;
  nuca_org_t *opt_n;
  mem_array tag, data;
  list<nuca_org_t *> nuca_list;
  /* Three candidate routers differing only in their first constructor
   * argument (64/128/256) - presumably the flit size, TODO confirm against
   * Router's constructor. They are deleted at the end of this function. */
  Router *router_s[ROUTER_TYPES];
  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
  router_s[0]->print_router();
  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
  router_s[1]->print_router();
  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
  router_s[2]->print_router();

  int core_in; // to store no. of cores

  /* to search diff grid organizations */
  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
         curr_acclat;
  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
         avg_leakage_power;

  /* best values found so far for the current (wire, router) pair */
  double opt_acclat = INF;
  //double opt_avg_lat = INF;
  //double opt_tot_lat = INF;
  int opt_rows = 0;
  int opt_columns = 0;
  //double opt_totno_hops = 0;
  double opt_avg_hop = 0;
  double opt_dyn_power = 0, opt_leakage_power = 0;
  min_values_t minval;

  int bank_start = 0;

  int flit_width = 0;

  /* vertical and horizontal hop latency values */
  int ver_hop_lat, hor_hop_lat; /* in cycles */


  /* no. of different bank sizes to consider */
  int iterations;


  /* remember the total capacity; cache_sz is repurposed as the bank size */
  g_ip->nuca_cache_sz = g_ip->cache_sz;
  /* the list always ends with a scratch entry that is filled in place */
  nuca_list.push_back(new nuca_org_t());

  /* l2_c and core_in are the first two indices into cont_stats */
  if (g_ip->cache_level == 0) l2_c = 1;
  else l2_c = 0;

  if (g_ip->cores <= 4) core_in = 2;
  else if (g_ip->cores <= 8) core_in = 3;
  else if (g_ip->cores <= 16) core_in = 4;
  /* NOTE(review): terminates the process with status 0 on an error path */
  else {cout << "Number of cores should be <= 16!\n"; exit(0);}


  // set the lower bound to an appropriate value. this depends on cache associativity
  // NOTE(review): the loop only terminates if assoc is a power of two (i
  // doubles from 2 until it equals assoc), and it modifies MIN_BANKSIZE,
  // which is not local to this function - confirm both are intended.
  if (g_ip->assoc > 2) {
    i = 2;
    while (i != g_ip->assoc) {
      MIN_BANKSIZE *= 2;
      i *= 2;
    }
  }

  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);

  /* choose the range of wire types explored by the wr loop below */
  if (g_ip->force_wiretype)
  {
    if (g_ip->wt == Low_swing) {
      wt_min = Low_swing;
      wt_max = Low_swing;
    }
    else {
      wt_min = Global;
      wt_max = Low_swing-1;
    }
  }
  else {
    wt_min = Global;
    wt_max = Low_swing;
  }
  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
    /* NOTE(review): an unsupported bank count is reported but execution
     * continues with that value. */
    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
    }
    /* restrict the outer loop to the single iteration for this bank count */
    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
    iterations = bank_start+1;
    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
  }
  cout << "Simulating various NUCA configurations\n";
  for (it=bank_start; it<iterations; it++) { /* different bank count values */
    ures.tag_array2 = &tag;
    ures.data_array2 = &data;
    /*
     * find the optimal bank organization
     */
    solve(&ures);
//    output_UCA(&ures);
    /* cache_sz is halved at the end of each iteration, so bank_count grows */
    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
    cout << "====" <<  g_ip->cache_sz << "\n";

    for (wr=wt_min; wr<=wt_max; wr++) {

      for (ro=0; ro<ROUTER_TYPES; ro++)
      {
        flit_width = (int) router_s[ro]->flit_size; //initialize router
        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;

        /* calculate router and wire parameters */

        double vlength = ures.cache_ht; /* length of the wire (u)*/
        double hlength = ures.cache_len; // u

        /* find delay, area, and power for wires */
        /* NOTE(review): a new Wire pair is allocated on every (wr, ro)
         * iteration and overwrites the pointer stored in the member arrays;
         * nothing in this function deletes them. Confirm who owns these
         * objects (the list entries keep pointers via h_wire/v_wire). */
        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);


        /* wire delay converted to cycles of the router clock */
        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));

        /*
         * assume a grid like topology and explore for optimal network
         * configuration using different row and column count values.
         */
        for (c=1; c<=(unsigned int)bank_count; c++) {
          /* skip to the next column count that divides bank_count evenly */
          while (bank_count%c != 0) c++;
          r = bank_count/c;

          /*
           * to find the avg access latency of a NUCA cache, uncontended
           * access time to each bank from the
           * cache controller is calculated.
           * avg latency =
           * (sum of the access latencies to individual banks)/bank
           * count value.
           */
          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
         // k = 1;
          for (i=0; i<r; i++) {
            for (j=0; j<c; j++) {
              /*
               * vertical hops including the
               * first hop from the cache controller
               */
              curr_hop = i + 1;
              curr_hop += j; /* horizontal hops */
              totno_hhops += j;
              totno_vhops += (i+1);
              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
                  j * hor_hop_lat);

              tot_lat += curr_acclat;
              totno_hops += curr_hop;
            }
          }
          avg_lat = tot_lat/bank_count;
          avg_hop = totno_hops/bank_count;
          avg_hhop = totno_hhops/bank_count;
          avg_vhop = totno_vhops/bank_count;

          /* net access latency: round trip over wires and routers plus the
           * bank access time in cycles */
          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
            calc_cycles(ures.access_time,
                1/(nuca_list.back()->nuca_pda.cycle_time*.001));

          /* average dynamic power of one access: routers, horizontal and
           * vertical wires, and the bank itself */
          avg_dyn_power =
            avg_hop *
            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
            (wire_horizontal[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + avg_vhop *
            (wire_vertical[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;

          /* NOTE(review): the vertical term multiplies by the horizontal
           * wire's delay and, unlike the horizontal term, is not scaled by
           * flit_width - looks like a copy/paste asymmetry; confirm. */
          avg_leakage_power =
            bank_count * router_s[ro]->power.readOp.leakage +
            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
                wire_horizontal[wr]->delay) * flit_width +
            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
                wire_horizontal[wr]->delay);

          /* keep the grid shape with the lowest latency for this pair */
          if (curr_acclat < opt_acclat) {
            opt_acclat = curr_acclat;
            //opt_tot_lat = tot_lat;
            //opt_avg_lat = avg_lat;
            //opt_totno_hops = totno_hops;
            opt_avg_hop = avg_hop;
            opt_rows = r;
            opt_columns = c;
            opt_dyn_power = avg_dyn_power;
            opt_leakage_power = avg_leakage_power;
          }
          totno_hops = 0;
          tot_lat = 0;
          totno_hhops = 0;
          totno_vhops = 0;
        }
        /* record the winning grid in the scratch entry at the list tail */
        nuca_list.back()->wire_pda.power.readOp.dynamic =
          opt_avg_hop * flit_width *
          (wire_horizontal[wr]->power.readOp.dynamic +
           wire_vertical[wr]->power.readOp.dynamic);
        nuca_list.back()->avg_hops = opt_avg_hop;
        /* network delay/power */
        nuca_list.back()->h_wire = wire_horizontal[wr];
        nuca_list.back()->v_wire = wire_vertical[wr];
        nuca_list.back()->router = router_s[ro];
        /* bank delay/power */

        nuca_list.back()->bank_pda.delay = ures.access_time;
        nuca_list.back()->bank_pda.power = ures.power;
        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
        nuca_list.back()->bank_pda.area.w = ures.cache_len;
        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;

        /* bank access time in cycles, rounded up to an even value and capped
         * at 16; num_cyc/2-1 is the last index into cont_stats.
         * NOTE(review): if calc_cycles() can return 0 the index becomes -1 -
         * confirm its minimum return value. */
        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
        if(num_cyc%2 != 0) num_cyc++;
        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles

        /* add the contention term; the bank-count index is clamped to 6 */
        if (it < 7) {
          nuca_list.back()->nuca_pda.delay = opt_acclat +
            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
          nuca_list.back()->contention =
            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
        }
        else {
          nuca_list.back()->nuca_pda.delay = opt_acclat +
            cont_stats[l2_c][core_in][ro][6][num_cyc/2-1];
          nuca_list.back()->contention =
            cont_stats[l2_c][core_in][ro][6][num_cyc/2-1];
        }
        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;

        /* array organization */
        nuca_list.back()->bank_count = bank_count;
        nuca_list.back()->rows = opt_rows;
        nuca_list.back()->columns = opt_columns;
        calculate_nuca_area (nuca_list.back());

        /* fold this candidate into the running minima, then start a fresh
         * scratch entry for the next (wire, router) pair */
        minval.update_min_values(nuca_list.back());
        nuca_list.push_back(new nuca_org_t());
        /* reset the search; note the initial value above was INF */
        opt_acclat = BIGNUM;

      }
    }
    /* halve the bank size for the next iteration */
    g_ip->cache_sz /= 2;
  }

  /* drop the unused scratch entry left at the tail */
  delete(nuca_list.back());
  nuca_list.pop_back();
  opt_n = find_optimal_nuca(&nuca_list, &minval);
  print_nuca(opt_n);
  /* leave cache_sz set to the bank size of the chosen organization;
   * opt_n points into nuca_list, so it must be read before the deletes */
  g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;

  list<nuca_org_t *>::iterator niter;
  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
  {
    delete *niter;
  }
  nuca_list.clear();

  for(int i=0; i < ROUTER_TYPES; i++)
  {
    delete router_s[i];
  }
  g_ip->display_ip();
  //  g_ip->force_cache_config = true;
  //  g_ip->ndwl = 8;
  //  g_ip->ndbl = 16;
  //  g_ip->nspd = 4;
  //  g_ip->ndcm = 1;
  //  g_ip->ndsam1 = 8;
  //  g_ip->ndsam2 = 32;

}
Пример #7
0
/* Comparator Delay (see section 6.6) */
/*
 * Estimates the delay of the tag comparator, modelled as three cascaded
 * inverters followed by a final stage (virtual ground driver) that
 * discharges the compare line up to the threshold of the mux driver.
 *
 * C, A        - used only to derive the number of tag bits via logtwo()
 *               (presumably cache size and associativity -- confirm with callers)
 * Ntbl, Ntspd - tag array organization parameters; together with the tag
 *               width they scale the metal capacitance on the compare line
 * inputtime   - input slope (rise time) seen by the first inverter
 * outputtime  - out: slope estimate handed to the stage that follows
 *
 * Returns the sum of the three inverter delays and the final-stage delay.
 *
 * Each inverter stage is driven with the output slope of the stage before
 * it, and that slope is derived from the stage's own delay.
 */
double SIM_compare_time(int C, int A, int Ntbl, int Ntspd, double inputtime, double *outputtime)
{
	double Req,Ceq,tf,st1del,st2del,st3del,nextinputtime,m;
	double c1,c2,r1,r2,tstep,a,b,c;
	double Tcomparatorni;
	int cols,tagbits;

	/* First Inverter */

	Ceq = SIM_gatecap(Wcompinvn2+Wcompinvp2,10.0) +
		SIM_draincap(Wcompinvp1,PCH,1) + SIM_draincap(Wcompinvn1,NCH,1);
	Req = SIM_transreson(Wcompinvp1,PCH,1);
	tf = Req*Ceq;
	st1del = SIM_horowitz(inputtime,tf,PARM(VTHCOMPINV),PARM(VTHCOMPINV),FALL);
	nextinputtime = st1del/PARM(VTHCOMPINV);

	/* Second Inverter: driven by the first inverter's output slope */

	Ceq = SIM_gatecap(Wcompinvn3+Wcompinvp3,10.0) +
		SIM_draincap(Wcompinvp2,PCH,1) + SIM_draincap(Wcompinvn2,NCH,1);
	Req = SIM_transreson(Wcompinvn2,NCH,1);
	tf = Req*Ceq;
	st2del = SIM_horowitz(nextinputtime,tf,PARM(VTHCOMPINV),PARM(VTHCOMPINV),RISE);
	nextinputtime = st2del/(1.0-PARM(VTHCOMPINV));

	/* Third Inverter: driven by the second inverter's output slope */

	Ceq = SIM_gatecap(Wevalinvn+Wevalinvp,10.0) +
		SIM_draincap(Wcompinvp3,PCH,1) + SIM_draincap(Wcompinvn3,NCH,1);
	Req = SIM_transreson(Wcompinvp3,PCH,1);
	tf = Req*Ceq;
	st3del = SIM_horowitz(nextinputtime,tf,PARM(VTHCOMPINV),PARM(VTHEVALINV),FALL);
	nextinputtime = st3del/(PARM(VTHEVALINV));

	/* Final Inverter (virtual ground driver) discharging compare part */

	tagbits = PARM(ADDRESS_BITS) - (int)logtwo((double)C) + (int)logtwo((double)A);
	cols = tagbits*Ntbl*Ntspd;

	/* two-node RC network: r2/c2 nearest the driver, r1/c1 beyond it */
	r1 = SIM_transreson(Wcompn,NCH,2);
	r2 = SIM_transresswitch(Wevalinvn,NCH,1);
	c2 = (tagbits)*(SIM_draincap(Wcompn,NCH,1)+SIM_draincap(Wcompn,NCH,2))+
		SIM_draincap(Wevalinvp,PCH,1) + SIM_draincap(Wevalinvn,NCH,1);
	c1 = (tagbits)*(SIM_draincap(Wcompn,NCH,1)+SIM_draincap(Wcompn,NCH,2))
		+SIM_draincap(Wcompp,PCH,1) + SIM_gatecap(Wmuxdrv12n+Wmuxdrv12p,20.0) +
		cols*Cwordmetal;

	/* time to go to threshold of mux driver */

	tstep = (r2*c2+(r1+r2)*c1)*log(1.0/PARM(VTHMUXDRV1));

	/* take into account non-zero input rise time */

	m = Vdd/nextinputtime;	/* slope of the input ramp */

	if ((tstep) <= (0.5*(Vdd-Vt)/m)) {
		/* step response is short relative to the input ramp:
		   take the positive root of a*T^2 + b*T + c = 0 */
		a = m;
		b = 2*((Vdd*PARM(VTHEVALINV))-Vt);
		c = -2*(tstep)*(Vdd-Vt)+1/m*((Vdd*PARM(VTHEVALINV))-Vt)*((Vdd*PARM(VTHEVALINV))-Vt);
		Tcomparatorni = (-b+sqrt(b*b-4*a*c))/(2*a);
	} else {
		Tcomparatorni = (tstep) + (Vdd+Vt)/(2*m) - (Vdd*PARM(VTHEVALINV))/m;
	}
	*outputtime = Tcomparatorni/(1.0-PARM(VTHMUXDRV1));

	return(Tcomparatorni+st1del+st2del+st3del);
}
Пример #8
0
/* Decoder delay in the tag array (see section 6.1 of tech report) */
/*
 * Computes the delay of the tag-array decoder as three stages: the decoder
 * driver, the NAND stage driving the NOR gates, and the NOR stage driving
 * the final inverter.
 *
 * C, B, A, Ntwl, Ntbl, Ntspd - cache / tag-array organization parameters
 *                              used to size the loads and wires
 * Ndwl, Ndbl, Nspd           - not referenced by this function
 * Tdecdrive   - out: delay of the driver stage
 * Tdecoder1   - out: delay of the NAND stage
 * Tdecoder2   - out: delay of the NOR stage
 * outrisetime - out: slope estimate for the stage that follows the decoder
 *
 * Returns the sum of the three stage delays.
 */
double SIM_decoder_tag_delay(int C, int B, int A, int Ndwl, int Ndbl, int Nspd, int Ntwl, int Ntbl, int Ntspd,
             double *Tdecdrive, double *Tdecoder1, double *Tdecoder2, double *outrisetime)
{
        double load_cap, drive_res, wire_res, time_const, slope_in;
        double driver_drain, fanout_gate, wire_cap;
        double nor_vth = 0;
        double rows;
        double row_ratio;
        int numstack;

        /* Input slope: propagate a step through two inverters */

        load_cap = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1) +
                   SIM_gatecap(Wdecdrivep+Wdecdriven,0.0);
        time_const = load_cap*SIM_transreson(Wdecdriven,NCH,1);
        slope_in = SIM_horowitz(0.0,time_const,PARM(VTHINV100x60),PARM(VTHINV100x60),FALL);
        slope_in = slope_in/(PARM(VTHINV100x60));

        load_cap = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1) +
                   SIM_gatecap(Wdecdrivep+Wdecdriven,0.0);
        time_const = load_cap*SIM_transreson(Wdecdriven,NCH,1);
        slope_in = SIM_horowitz(slope_in,time_const,PARM(VTHINV100x60),PARM(VTHINV100x60),RISE);
        slope_in = slope_in/(1.0-PARM(VTHINV100x60));

        /* Stage 1: the driver charges the 3-to-8 decoder inputs and its wire */

        rows = C/(8*B*A*Ntbl*Ntspd);   /* integer division, as in the model */

        driver_drain = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1);
        fanout_gate = 4*SIM_gatecap(Wdec3to8n+Wdec3to8p,10.0)*(Ntwl*Ntbl);
        wire_cap = Cwordmetal*0.25*8*B*A*Ntbl*Ntspd;
        load_cap = driver_drain + fanout_gate + wire_cap;

        wire_res = Rwordmetal*0.125*8*B*A*Ntbl*Ntspd;
        time_const = (wire_res + SIM_transreson(Wdecdrivep,PCH,1))*load_cap;

        *Tdecdrive = SIM_horowitz(slope_in,time_const,PARM(VTHINV100x60),PARM(VTHNAND60x90),
                                  FALL);
        slope_in = *Tdecdrive/PARM(VTHNAND60x90);

        /* Stage 2: a NAND drives a column of NOR gates */

        row_ratio = (double)C/(double)(B*A*Ntbl*Ntspd);
        numstack = (int)(ceil((1.0/3.0)*logtwo(row_ratio)));
        if (numstack==0) {
                numstack = 1;
        }
        if (numstack>5) {
                numstack = 5;
        }

        driver_drain = 3*SIM_draincap(Wdec3to8p,PCH,1) +SIM_draincap(Wdec3to8n,NCH,3);
        fanout_gate = SIM_gatecap(WdecNORn+WdecNORp,((numstack*40)+20.0))*rows;
        wire_cap = Cbitmetal*rows*8;
        load_cap = driver_drain + fanout_gate + wire_cap;

        wire_res = Rbitmetal*rows*8/2;
        time_const = load_cap*(wire_res+SIM_transreson(Wdec3to8n,NCH,3));

        /* The output only has to reach the NOR gate's switching threshold,
           and that threshold depends on how many inputs the NOR has. */

        if (numstack == 1) {
                nor_vth = PARM(VTHNOR12x4x1);
        } else if (numstack == 2) {
                nor_vth = PARM(VTHNOR12x4x2);
        } else if (numstack == 3) {
                nor_vth = PARM(VTHNOR12x4x3);
        } else if (numstack >= 4 && numstack <= 6) {
                /* stacks of 4, 5 and 6 all use the 4-input threshold */
                nor_vth = PARM(VTHNOR12x4x4);
        } else {
                printf("error:numstack=%d\n",numstack);
        }

        *Tdecoder1 = SIM_horowitz(slope_in,time_const,PARM(VTHNAND60x90),nor_vth,RISE);
        slope_in = *Tdecoder1/(1.0-nor_vth);

        /* Stage 3: the NOR drives the final inverter */

        drive_res = SIM_transreson(WdecNORp,PCH,numstack);
        load_cap = (SIM_gatecap(Wdecinvn+Wdecinvp,20.0)+
                    numstack*SIM_draincap(WdecNORn,NCH,1)+
                    SIM_draincap(WdecNORp,PCH,numstack));
        time_const = drive_res*load_cap;

        *Tdecoder2 = SIM_horowitz(slope_in,time_const,nor_vth,PARM(VSINV),FALL);
        *outrisetime = *Tdecoder2/(PARM(VSINV));

        return(*Tdecdrive+*Tdecoder1+*Tdecoder2);
}