/* Tag array wordline delay (see section 6.3 of tech report) */ double SIM_wordline_tag_delay(int C, int A, int Ntspd, int Ntwl, double inrisetime, double *outrisetime) { double tf,m,a,b,c; double Cline,Rline,Ceq,nextinputtime; int tagbits; double Tworddrivedel,Twordchargedel; /* number of tag bits */ tagbits = PARM(ADDRESS_BITS)+2-(int)logtwo((double)C)+(int)logtwo((double)A); /* first stage */ Ceq = SIM_draincap(Wdecinvn,NCH,1) + SIM_draincap(Wdecinvp,PCH,1) + SIM_gatecap(Wdecinvn+Wdecinvp,20.0); tf = SIM_transreson(Wdecinvn,NCH,1)*Ceq; Tworddrivedel = SIM_horowitz(inrisetime,tf,PARM(VSINV),PARM(VSINV),RISE); nextinputtime = Tworddrivedel/(1.0-PARM(VSINV)); /* second stage */ Cline = (SIM_gatecappass(Wmemcella,(BitWidth-2*Wmemcella)/2.0)+ SIM_gatecappass(Wmemcella,(BitWidth-2*Wmemcella)/2.0)+ Cwordmetal)*tagbits*A*Ntspd/Ntwl+ SIM_draincap(Wdecinvn,NCH,1) + SIM_draincap(Wdecinvp,PCH,1); Rline = Rwordmetal*tagbits*A*Ntspd/(2*Ntwl); tf = (SIM_transreson(Wdecinvp,PCH,1)+Rline)*Cline; Twordchargedel = SIM_horowitz(nextinputtime,tf,PARM(VSINV),PARM(VSINV),FALL); *outrisetime = Twordchargedel/PARM(VSINV); return(Tworddrivedel+Twordchargedel); }
void tsim_cache_power_model::calculate_decoder_power(){ // based on Wattch's model double total_cap = 0.0, temp_cap = 0.0; int tot_ports = 0, decode_bits = 0; // total ports tot_ports = num_read_ports + num_write_ports; // number of decode bits needed decode_bits = (int)ceil(logtwo(num_rows)); //TODO: Adjust this equation after figuring what //WATTCH is doing : add both drain and gate caps here //setting it to some random value for now temp_cap = 200e-15; total_cap += tot_ports * decode_bits * temp_cap; // TODO: ignoring the NOR/NAND part for the first cut // add this model later tot_dec_cap = total_cap; decoder_power = get_act_factor() * total_cap * get_power_factor(); }
main() { double predeclength, wordlinelength, bitlinelength; double regfile_power, regfile_decoder, regfile_wordline, regfile_wordline16, regfile_wordline33, regfile_bitline; int scale_factor; int data_width; int rports, wports; int switch_arg; printf("1. Simple Register File\n"); printf("2. Simple Cache Structure\n"); printf("3. Simple CAM Structure\n"); printf("4. Complex Cache (Auto-Sized)\n"); scanf("%d",&switch_arg); printf("note these are MAX powers (assuming full switching)\n"); switch(switch_arg){ case 1: { printf("Enter Reg File Params:\n"); printf("Number of Registers: "); scanf("%d",&num_regs); printf("Data Width: "); scanf("%d",&data_width); printf("Number of Read Ports: "); scanf("%d",&rports); printf("Number of Write Ports: "); scanf("%d",&wports); printf("%d-entryx%d-width,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,simple_array_power(num_regs,data_width,rports,wports,0)); printf(" decode_power (W): %f\n",simple_array_decoder_power(num_regs,data_width,rports,wports,0)); printf(" wordline_power (W): %f\n",simple_array_wordline_power(num_regs,data_width,rports,wports,0)); printf(" bitline_power (W): %f\n",simple_array_bitline_power(num_regs,data_width,rports,wports,0)); break; } case 2: { printf("Enter Cache Params:\n"); printf("Size of cache: "); scanf("%d",&num_regs); printf("Data Width: "); scanf("%d",&data_width); printf("Number of Read Ports: "); scanf("%d",&rports); printf("Number of Write Ports: "); scanf("%d",&wports); printf("%d-entryx%d-width,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,simple_array_power(num_regs,data_width,rports,wports,1)); printf(" decode_power (W): %f\n",simple_array_decoder_power(num_regs,data_width,rports,wports,1)); printf(" wordline_power (W): %f\n",simple_array_wordline_power(num_regs,data_width,rports,wports,1)); printf(" bitline_power (W): %f\n",simple_array_bitline_power(num_regs,data_width,rports,wports,1)); break; } case 3: { printf("Enter CAM Params:\n"); 
printf("Entries in CAM: "); scanf("%d",&num_regs); printf("Tag Width: "); scanf("%d",&data_width); printf("Number of Read Ports: "); scanf("%d",&rports); printf("Number of Write Ports: "); scanf("%d",&wports); printf("%d-entryx%d-tagwidth,%d-rdport,%d-wrport: %f (W)\n",num_regs,data_width,rports,wports,cam_array(num_regs,data_width,rports,wports)); printf(" tagdrive_power (W): %f\n",cam_tagdrive(num_regs,data_width,rports,wports)); printf(" tagmatch_power (W): %f\n",cam_tagmatch(num_regs,data_width,rports,wports)); break; } case 4: { int nsets, bsize, assoc,res_memport, tagsize; int ndwl, ndbl, nspd, ntwl, ntbl, ntspd, c,b,a,cache, rowsb, colsb, trowsb, tcolsb; double cache_decoder, cache_wordline, cache_bitline, cache_senseamp, cache_tagarray, total_cache_power; time_result_type time_result; time_parameter_type time_parameters; int va_size = 48; printf("Enter Cache Params:\n"); printf("Number of Sets in cache: "); scanf("%d",&nsets); printf("Block Size (bytes): "); scanf("%d",&bsize); printf("Associativity: "); scanf("%d",&assoc); printf("Number of Memory Ports: "); scanf("%d",&res_memport); printf("note tagarray size is estimated based on assuming 48-bit virtual addresses\n"); cache = 1; time_parameters.cache_size = nsets * bsize * assoc; /* C */ time_parameters.block_size = bsize; /* B */ time_parameters.associativity = assoc; /* A */ time_parameters.number_of_sets = nsets; /* C/(B*A) */ calculate_time(&time_result,&time_parameters); output_data(&time_result,&time_parameters); ndwl=time_result.best_Ndwl; ndbl=time_result.best_Ndbl; nspd=time_result.best_Nspd; ntwl=time_result.best_Ntwl; ntbl=time_result.best_Ntbl; ntspd=time_result.best_Ntspd; c = time_parameters.cache_size; b = time_parameters.block_size; a = time_parameters.associativity; rowsb = c/(8*b*a*ndbl*nspd); colsb = 8*b*a*nspd/ndwl; tagsize = va_size - ((int)logtwo(nsets) + (int)logtwo(bsize)); trowsb = c/(8*b*a*ntbl*ntspd); tcolsb = a * (tagsize + 1 + 6) * ntspd/ntwl; predeclength = rowsb * 
(RegCellHeight + WordlineSpacing); wordlinelength = colsb * (RegCellWidth + BitlineSpacing); bitlinelength = rowsb * (RegCellHeight + WordlineSpacing); cache_decoder = res_memport*ndwl*ndbl*array_decoder_power(rowsb,colsb,predeclength,1,1,cache); cache_wordline = res_memport*ndwl*ndbl*array_wordline_power(rowsb,colsb,wordlinelength,1,1,cache); cache_bitline = res_memport*ndwl*ndbl*array_bitline_power(rowsb,colsb,bitlinelength,1,1,cache); cache_senseamp = res_memport*ndwl*ndbl*senseamp_power(colsb); cache_tagarray = res_memport*ntwl*ntbl*(simple_array_power(trowsb,tcolsb,1,1,cache)); total_cache_power = cache_decoder + cache_wordline + cache_bitline + cache_senseamp + cache_tagarray; fprintf(stderr,"%d KB %d-way cache (%d-byte block size):\n",c,a,b); fprintf(stderr,"ndwl == %d, ndbl == %d, nspd == %d\n",ndwl,ndbl,nspd); fprintf(stderr,"%d sets of %d rows x %d cols\n",ndwl*ndbl,rowsb,colsb); fprintf(stderr,"tagsize == %d\n",tagsize); fprintf(stderr,"\nntwl == %d, ntbl == %d, ntspd == %d\n",ntwl,ntbl,ntspd); fprintf(stderr,"%d sets of %d rows x %d cols\n",ntwl*ntbl,trowsb,tcolsb); printf("Total Power (W): %f\n",total_cache_power); printf(" decode_power (W): %f\n",cache_decoder); printf(" wordline_power (W): %f\n",cache_wordline); printf(" bitline_power (W): %f\n",cache_bitline); printf(" senseamp_power (W): %f\n",cache_senseamp); printf(" tagarray_power (W): %f\n",cache_tagarray); break; } default: } }
/*
 * Programmatic entry point: validates the cache description, fills a
 * parameter_type, runs calculate_time() and returns the timing/area results
 * together with the parameters that were used.
 *
 *   cache_size, line_size, associativity - C, B and A
 *   rw_ports, excl_read_ports,
 *   excl_write_ports,
 *   single_ended_read_ports              - port counts (RWP, ERP, EWP, NSER)
 *   banks                                - number of subbanks; must be a power of 2 >= 1
 *   tech_node                            - feature size; must satisfy 0 < tech <= 0.8
 *   output_width                         - number of output bits
 *   specific_tag, tag_width              - when specific_tag is non-zero, force the tag size
 *   access_mode                          - 0 normal, 1 sequential, 2 fast
 *   pure_sram                            - copied into parameters.pure_sram
 *
 * On a validation failure a message is printed and endresult is returned
 * with only result.subbanks set (to 0); the other fields are not filled in.
 */
total_result_type cacti_interface (int cache_size, int line_size, int associativity, int rw_ports, int excl_read_ports, int excl_write_ports, int single_ended_read_ports, int banks, double tech_node, int output_width, int specific_tag, int tag_width, int access_mode, int pure_sram)
{
  int C, B, A, ERP, EWP, RWP, NSER;
  double tech;
  double logbanks;
  double logbanksfloor;
  int seq_access = 0;
  int fast_access = 0;
  int bits_output = output_width;
  double NSubbanks = (double) banks;
  double ratioofbankstoports;
  extern int force_tag, force_tag_size;

  total_result_type endresult;
  endresult.result.subbanks = 0.0;

  result_type result;
  arearesult_type arearesult;
  area_type arearesult_subbanked;
  parameter_type parameters;

  /* input parameters: C B A ERP EWP */

  /*dt: make sure we're using some simple leakage reduction */
  dualVt = FALSE;

  force_tag = 0;
  parameters.force_tag = 0;

  if (specific_tag)
    {
      force_tag = 1;
      force_tag_size = tag_width;
      parameters.force_tag = 1;
      parameters.tag_size = tag_width;
    }

  switch (access_mode)
    {
    case 0:
      seq_access = fast_access = FALSE;
      break;
    case 1:
      seq_access = TRUE;
      fast_access = FALSE;
      break;
    case 2:
      seq_access = FALSE;
      fast_access = TRUE;
      break;
    default:
      /* unknown mode: keep the initial values (neither sequential nor fast) */
      break;
    }

  C = cache_size;
  A = associativity;
  B = line_size;

  if ((B < 1))
    {
      printf ("Block size must >=1\n");
      return endresult;
    }

  if ((B * 8 < bits_output))
    {
      printf ("Block size must be at least %d\n", bits_output / 8);
      return endresult;
    }

  tech = tech_node;
  if ((tech <= 0))
    {
      printf ("Feature size must be > 0\n");
      return endresult;
    }
  if ((tech > 0.8))
    {
      printf ("Feature size must be <= 0.80 (um)\n");
      return endresult;
    }

  /* The former argument-count ladder was keyed on a local that was fixed at
     9, so only this assignment set was ever taken. */
  RWP = rw_ports;
  ERP = excl_read_ports;
  EWP = excl_write_ports;
  NSER = single_ended_read_ports;

  if ((RWP < 0) || (EWP < 0) || (ERP < 0))
    {
      printf ("Ports must >=0\n");
      return endresult;
    }
  if (RWP > 2)
    {
      printf ("Maximum of 2 read/write ports\n");
      return endresult;
    }
  if ((RWP + ERP + EWP) < 1)
    {
      printf ("Must have at least one port\n");
      return endresult;
    }

  if (NSubbanks < 1)
    {
      printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      return endresult;
    }

  /* a non-integral log2 means the bank count is not a power of two */
  logbanks = logtwo ((double) (NSubbanks));
  logbanksfloor = floor (logbanks);

  if (logbanks > logbanksfloor)
    {
      printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n");
      return endresult;
    }

  /* one set in total => treat as fully associative */
  if (C == B * A)
    {
      parameters.fully_assoc = 1;
      A = C / B;
    }
  else
    {
      parameters.fully_assoc = 0;
    }

  /* from here on C is the capacity of a single subbank */
  C = cache_size / ((int) (NSubbanks));
  if ((C < 64))
    {
      printf ("Cache size must >=64\n");
      return endresult;
    }

  /* associativities above 16 are modelled as fully associative with A = 16 */
  if (A > 16)
    {
      parameters.fully_assoc = 1;
      A = 16;
    }

  printf ("\n########### Printing input for params for testing...###");
  printf ("\n C = %d, B = %d, A = %d", C, B, A);
  printf ("\n RWP = %d, ERP = %d, EWP = %d, NSER = %d", RWP, ERP, EWP, NSER);
  printf ("\n banks = %d, tech = %f, bits_output = %d, fast_access = %d, pure_sram = %d", banks, tech, bits_output, fast_access, pure_sram);
  printf ("\n force_tag = %d, force_tag_size = %d", force_tag, force_tag_size);
  printf ("\n #################\n");

  parameters.cache_size = C;
  parameters.block_size = B;
  parameters.nr_bits_out = bits_output;

  /*dt: testing sequential access mode */
  if (seq_access)
    {
      parameters.tag_associativity = A;
      parameters.data_associativity = 1;
      parameters.sequential_access = 1;
    }
  else
    {
      parameters.tag_associativity = parameters.data_associativity = A;
      parameters.sequential_access = 0;
    }

  if (fast_access)
    {
      parameters.fast_access = 1;
    }
  else
    {
      parameters.fast_access = 0;
    }

  parameters.num_readwrite_ports = RWP;
  parameters.num_read_ports = ERP;
  parameters.num_write_ports = EWP;
  parameters.NSubbanks = banks;
  parameters.num_single_ended_read_ports = NSER;
  parameters.number_of_sets = C / (B * A);
  parameters.fudgefactor = .8 / tech;
  parameters.tech_size = (double) tech;
  parameters.pure_sram = pure_sram;

  //If multiple banks and multiple ports are specified, then if number of banks/total number
  //of ports > 1 then assume that the multiple ports are implemented via the multiple banks.
  //Also assume that each bank has only 1 RWP port. There are some problems with this logic that
  //will be fixed in v5.0
  ratioofbankstoports = NSubbanks / (RWP + ERP + EWP);
  if (ratioofbankstoports >= 1.0)
    {
      //We assume that each bank has 1 RWP port.
      parameters.num_readwrite_ports = 1;
      parameters.num_read_ports = 0;
      parameters.num_write_ports = 0;
      parameters.num_single_ended_read_ports = 0;
    }

  if (parameters.number_of_sets < 1)
    {
      printf ("Less than one set...\n");
      return endresult;
    }

  /* supply voltage derived from the scaling factor, clamped to [0.7, 5.0] */
  parameters.VddPow = 4.5 / (pow (parameters.fudgefactor, (2.0 / 3.0)));
  if (parameters.VddPow < 0.7)
    parameters.VddPow = 0.7;
  if (parameters.VddPow > 5.0)
    parameters.VddPow = 5.0;

  printf ("\n##### Printing parameters for testing...#####\n");
  output_params (&parameters);

  //v4.1: First initialize all tech variables to 0.8 micron values.
  //init_tech_params function below then reinitializes tech variables to
  //given process values
  init_tech_params_default_process ();
  init_tech_params (parameters.tech_size);

  calculate_time (&result, &arearesult, &arearesult_subbanked, &parameters, &NSubbanks);

  //v4.1: No longer using calculate_area function as area has already been
  //computed for the given tech node; scaled areas are height * width.
  arearesult.dataarray_area.scaled_area =
    arearesult.dataarray_area.height * arearesult.dataarray_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datapredecode_area.scaled_area =
    arearesult.datapredecode_area.height * arearesult.datapredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datacolmuxpredecode_area.scaled_area =
    arearesult.datacolmuxpredecode_area.height * arearesult.datacolmuxpredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datacolmuxpostdecode_area.scaled_area =
    arearesult.datacolmuxpostdecode_area.height * arearesult.datacolmuxpostdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.datawritesig_area.scaled_area =
    (parameters.num_readwrite_ports + parameters.num_read_ports + parameters.num_write_ports) *
    arearesult.datawritesig_area.height * arearesult.datawritesig_area.width * CONVERT_TO_MMSQUARE;

  arearesult.tagarray_area.scaled_area =
    arearesult.tagarray_area.height * arearesult.tagarray_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagpredecode_area.scaled_area =
    arearesult.tagpredecode_area.height * arearesult.tagpredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagcolmuxpredecode_area.scaled_area =
    arearesult.tagcolmuxpredecode_area.height * arearesult.tagcolmuxpredecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagcolmuxpostdecode_area.scaled_area =
    arearesult.tagcolmuxpostdecode_area.height * arearesult.tagcolmuxpostdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagoutdrvdecode_area.scaled_area =
    arearesult.tagoutdrvdecode_area.height * arearesult.tagoutdrvdecode_area.width * CONVERT_TO_MMSQUARE;
  arearesult.tagoutdrvsig_area.scaled_area =
    (parameters.num_readwrite_ports + parameters.num_read_ports + parameters.num_write_ports) *
    arearesult.tagoutdrvsig_area.height * arearesult.tagoutdrvsig_area.width * CONVERT_TO_MMSQUARE;

  arearesult.perc_data = 100 * area_all_dataramcells / (arearesult.totalarea * CONVERT_TO_MMSQUARE);
  arearesult.perc_tag = 100 * area_all_tagramcells / (arearesult.totalarea * CONVERT_TO_MMSQUARE);
  arearesult.perc_cont =
    100 * (arearesult.totalarea * CONVERT_TO_MMSQUARE - area_all_dataramcells - area_all_tagramcells) /
    (arearesult.totalarea * CONVERT_TO_MMSQUARE);
  arearesult.sub_eff =
    (area_all_dataramcells + area_all_tagramcells) * 100 / (arearesult.totalarea / 100000000.0);
  arearesult.total_eff =
    (NSubbanks) * (area_all_dataramcells + area_all_tagramcells) * 100 /
    (arearesult_subbanked.height * arearesult_subbanked.width * CONVERT_TO_MMSQUARE);
  arearesult.totalarea *= CONVERT_TO_MMSQUARE;
  arearesult.subbankarea =
    arearesult_subbanked.height * arearesult_subbanked.width * CONVERT_TO_MMSQUARE;

  /* Replace a negative bitline delay with a tiny positive one. These must be
     floating-point literals: in C/C++ '^' is bitwise XOR, so the previous
     `10 ^ -12` / `10 ^ -13` evaluated to -2 and -7 and left the delay negative. */
  if (result.bitline_delay_data < 0.0)
    {
      result.bitline_delay_data = 1e-12;
    }
  if (result.bitline_delay_tag < 0.0)
    {
      result.bitline_delay_tag = 1e-13;
    }

  endresult.result = result;
  endresult.result.subbanks = banks;
  endresult.area = arearesult;
  endresult.params = parameters;

  return endresult;
}
int input_data (int argc, char *argv[]) { int C, B, A, ERP, EWP, RWP, NSER, NSubbanks, fully_assoc; double tech; double logbanks, assoc; double logbanksfloor, assocfloor; int bits_output = 64; if ((argc != 6) && (argc != 9) && (argc != 15)) { printf ("Cmd-line parameters: C B A TECH NSubbanks\n"); printf (" OR: C B A TECH RWP ERP EWP NSubbanks\n"); exit (1); } B = atoi (argv[2]); if ((B < 1)) { printf ("Block size must >=1\n"); exit (1); } if (argc == 9) { if ((B * 8 < bits_output)) { printf ("Block size must be at least %d\n", bits_output / 8); exit (1); } tech = atof (argv[4]); if ((tech <= 0)) { printf ("Feature size must be > 0\n"); exit (1); } if ((tech > 0.8)) { printf ("Feature size must be <= 0.80 (um)\n"); exit (1); } RWP = atoi (argv[5]); ERP = atoi (argv[6]); EWP = atoi (argv[7]); NSER = 0; if ((RWP < 0) || (EWP < 0) || (ERP < 0)) { printf ("Ports must >=0\n"); exit (1); } if (RWP > 2) { printf ("Maximum of 2 read/write ports\n"); exit (1); } if ((RWP + ERP + EWP) < 1) { printf ("Must have at least one port\n"); exit (1); } NSubbanks = atoi (argv[8]); if (NSubbanks < 1) { printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n"); exit (1); } logbanks = logtwo ((double) (NSubbanks)); logbanksfloor = floor (logbanks); if (logbanks > logbanksfloor) { printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n"); exit (1); } } else if (argc == 6) { if ((B * 8 < bits_output)) { printf ("Block size must be at least %d\n", bits_output / 8); exit (1); } tech = atof (argv[4]); if ((tech <= 0)) { printf ("Feature size must be > 0\n"); exit (1); } if ((tech > 0.8)) { printf ("Feature size must be <= 0.80 (um)\n"); exit (1); } RWP = 1; ERP = 0; EWP = 0; NSER = 0; NSubbanks = atoi (argv[5]); if (NSubbanks < 1) { printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n"); exit (1); } logbanks = logtwo ((double) (NSubbanks)); logbanksfloor = floor (logbanks); if 
(logbanks > logbanksfloor) { printf ("Number of subbanks should be greater than or equal to 1 and should be a power of 2\n"); exit (1); } } else { tech = atof (argv[9]); NSubbanks = atoi (argv[8]); if ((tech <= 0)) { printf ("Feature size must be > 0\n"); exit (1); } if ((tech > 0.8)) { printf ("Feature size must be <= 0.80 (um)\n"); exit (1); } } C = atoi (argv[1]) / ((int) (NSubbanks)); if (atoi (argv[1]) < 64) { printf ("Cache size must be greater than 32!\n"); exit (1); } if ((strcmp (argv[3], "FA") == 0) || (argv[3][0] == '0')) { A = C / B; fully_assoc = 1; } else { if (strcmp (argv[3], "DM") == 0) { A = 1; fully_assoc = 0; } else { fully_assoc = 0; A = atoi (argv[3]); if ((A < 0) || (A > 16)) { printf ("Associativity must be 1,2,4,8,16 or 0(fully associative)\n"); exit (1); } assoc = logtwo ((double) (A)); assocfloor = floor (assoc); if (assoc > assocfloor) { printf ("Associativity should be a power of 2\n"); exit (1); } } } if (!fully_assoc && C / (B * A) < 1) { printf ("Number of sets is less than 1:\n Need to either increase cache size, or decrease associativity or block size\n (or use fully associative cache)\n"); exit (1); } return (OK); }
/* * Version - 6.0 * * Perform exhaustive search across different bank organizatons, * router configurations, grid organizations, and wire models and * find an optimal NUCA organization * For different bank count values * 1. Optimal bank organization is calculated * 2. For each bank organization, find different NUCA organizations * using various router configurations, grid organizations, * and wire models. * 3. NUCA model with the least cost is picked for * this particular bank count * Finally include contention statistics and find the optimal * NUCA configuration */ void Nuca::sim_nuca() { /* temp variables */ int it, ro, wr; int num_cyc; unsigned int i, j; unsigned int r, c; int l2_c; int bank_count = 0; uca_org_t ures; nuca_org_t *opt_n; mem_array tag, data; list<nuca_org_t *> nuca_list; Router *router_s[ROUTER_TYPES]; router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global)); router_s[0]->print_router(); router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global)); router_s[1]->print_router(); router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global)); router_s[2]->print_router(); int core_in; // to store no. of cores /* to search diff grid organizations */ double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, curr_acclat; double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, avg_leakage_power; double opt_acclat = INF; //double opt_avg_lat = INF; //double opt_tot_lat = INF; int opt_rows = 0; int opt_columns = 0; //double opt_totno_hops = 0; double opt_avg_hop = 0; double opt_dyn_power = 0, opt_leakage_power = 0; min_values_t minval; int bank_start = 0; int flit_width = 0; /* vertical and horizontal hop latency values */ int ver_hop_lat, hor_hop_lat; /* in cycles */ /* no. 
of different bank sizes to consider */ int iterations; g_ip->nuca_cache_sz = g_ip->cache_sz; nuca_list.push_back(new nuca_org_t()); if (g_ip->cache_level == 0) l2_c = 1; else l2_c = 0; if (g_ip->cores <= 4) core_in = 2; else if (g_ip->cores <= 8) core_in = 3; else if (g_ip->cores <= 16) core_in = 4; else {cout << "Number of cores should be <= 16!\n"; exit(0);} // set the lower bound to an appropriate value. this depends on cache associativity if (g_ip->assoc > 2) { i = 2; while (i != g_ip->assoc) { MIN_BANKSIZE *= 2; i *= 2; } } iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE); if (g_ip->force_wiretype) { if (g_ip->wt == Low_swing) { wt_min = Low_swing; wt_max = Low_swing; } else { wt_min = Global; wt_max = Low_swing-1; } } else { wt_min = Global; wt_max = Low_swing; } if (g_ip->nuca_bank_count != 0) { // simulate just one bank if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 && g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 && g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) { fprintf(stderr,"Incorrect bank count value! 
Please fix the value in cache.cfg\n"); } bank_start = (int)logtwo((double)g_ip->nuca_bank_count); iterations = bank_start+1; g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count; } cout << "Simulating various NUCA configurations\n"; for (it=bank_start; it<iterations; it++) { /* different bank count values */ ures.tag_array2 = &tag; ures.data_array2 = &data; /* * find the optimal bank organization */ solve(&ures); // output_UCA(&ures); bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz; cout << "====" << g_ip->cache_sz << "\n"; for (wr=wt_min; wr<=wt_max; wr++) { for (ro=0; ro<ROUTER_TYPES; ro++) { flit_width = (int) router_s[ro]->flit_size; //initialize router nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time; /* calculate router and wire parameters */ double vlength = ures.cache_ht; /* length of the wire (u)*/ double hlength = ures.cache_len; // u /* find delay, area, and power for wires */ wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength); wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength); hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay, 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); ver_hop_lat = calc_cycles(wire_vertical[wr]->delay, 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); /* * assume a grid like topology and explore for optimal network * configuration using different row and column count values. */ for (c=1; c<=(unsigned int)bank_count; c++) { while (bank_count%c != 0) c++; r = bank_count/c; /* * to find the avg access latency of a NUCA cache, uncontended * access time to each bank from the * cache controller is calculated. * avg latency = * sum of the access latencies to individual banks)/bank * count value. 
*/ totno_hops = totno_hhops = totno_vhops = tot_lat = 0; // k = 1; for (i=0; i<r; i++) { for (j=0; j<c; j++) { /* * vertical hops including the * first hop from the cache controller */ curr_hop = i + 1; curr_hop += j; /* horizontal hops */ totno_hhops += j; totno_vhops += (i+1); curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT + j * hor_hop_lat); tot_lat += curr_acclat; totno_hops += curr_hop; } } avg_lat = tot_lat/bank_count; avg_hop = totno_hops/bank_count; avg_hhop = totno_hhops/bank_count; avg_vhop = totno_vhops/bank_count; /* net access latency */ curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) + calc_cycles(ures.access_time, 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); /* avg access lat of nuca */ avg_dyn_power = avg_hop * (router_s[ro]->power.readOp.dynamic) + avg_hhop * (wire_horizontal[wr]->power.readOp.dynamic) * (g_ip->block_sz*8 + 64) + avg_vhop * (wire_vertical[wr]->power.readOp.dynamic) * (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic; avg_leakage_power = bank_count * router_s[ro]->power.readOp.leakage + avg_hhop * (wire_horizontal[wr]->power.readOp.leakage* wire_horizontal[wr]->delay) * flit_width + avg_vhop * (wire_vertical[wr]->power.readOp.leakage * wire_horizontal[wr]->delay); if (curr_acclat < opt_acclat) { opt_acclat = curr_acclat; //opt_tot_lat = tot_lat; //opt_avg_lat = avg_lat; //opt_totno_hops = totno_hops; opt_avg_hop = avg_hop; opt_rows = r; opt_columns = c; opt_dyn_power = avg_dyn_power; opt_leakage_power = avg_leakage_power; } totno_hops = 0; tot_lat = 0; totno_hhops = 0; totno_vhops = 0; } nuca_list.back()->wire_pda.power.readOp.dynamic = opt_avg_hop * flit_width * (wire_horizontal[wr]->power.readOp.dynamic + wire_vertical[wr]->power.readOp.dynamic); nuca_list.back()->avg_hops = opt_avg_hop; /* network delay/power */ nuca_list.back()->h_wire = wire_horizontal[wr]; nuca_list.back()->v_wire = wire_vertical[wr]; nuca_list.back()->router = router_s[ro]; /* bank delay/power */ nuca_list.back()->bank_pda.delay = 
ures.access_time; nuca_list.back()->bank_pda.power = ures.power; nuca_list.back()->bank_pda.area.h = ures.cache_ht; nuca_list.back()->bank_pda.area.w = ures.cache_len; nuca_list.back()->bank_pda.cycle_time = ures.cycle_time; num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/, 1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/)); if(num_cyc%2 != 0) num_cyc++; if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles if (it < 7) { nuca_list.back()->nuca_pda.delay = opt_acclat + cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; nuca_list.back()->contention = cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; } else { nuca_list.back()->nuca_pda.delay = opt_acclat + cont_stats[l2_c][core_in][ro][6][num_cyc/2-1]; nuca_list.back()->contention = cont_stats[l2_c][core_in][ro][6][num_cyc/2-1]; } nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power; nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power; /* array organization */ nuca_list.back()->bank_count = bank_count; nuca_list.back()->rows = opt_rows; nuca_list.back()->columns = opt_columns; calculate_nuca_area (nuca_list.back()); minval.update_min_values(nuca_list.back()); nuca_list.push_back(new nuca_org_t()); opt_acclat = BIGNUM; } } g_ip->cache_sz /= 2; } delete(nuca_list.back()); nuca_list.pop_back(); opt_n = find_optimal_nuca(&nuca_list, &minval); print_nuca(opt_n); g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count; list<nuca_org_t *>::iterator niter; for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter) { delete *niter; } nuca_list.clear(); for(int i=0; i < ROUTER_TYPES; i++) { delete router_s[i]; } g_ip->display_ip(); // g_ip->force_cache_config = true; // g_ip->ndwl = 8; // g_ip->ndbl = 16; // g_ip->nspd = 4; // g_ip->ndcm = 1; // g_ip->ndsam1 = 8; // g_ip->ndsam2 = 32; }
/* Comparator Delay (see section 6.6) */ double SIM_compare_time(int C, int A, int Ntbl, int Ntspd, double inputtime, double *outputtime) { double Req,Ceq,tf,st1del,st2del,st3del,nextinputtime,m; double c1,c2,r1,r2,tstep,a,b,c; double Tcomparatorni; int cols,tagbits; /* First Inverter */ Ceq = SIM_gatecap(Wcompinvn2+Wcompinvp2,10.0) + SIM_draincap(Wcompinvp1,PCH,1) + SIM_draincap(Wcompinvn1,NCH,1); Req = SIM_transreson(Wcompinvp1,PCH,1); tf = Req*Ceq; st1del = SIM_horowitz(inputtime,tf,PARM(VTHCOMPINV),PARM(VTHCOMPINV),FALL); nextinputtime = st1del/PARM(VTHCOMPINV); /* Second Inverter */ Ceq = SIM_gatecap(Wcompinvn3+Wcompinvp3,10.0) + SIM_draincap(Wcompinvp2,PCH,1) + SIM_draincap(Wcompinvn2,NCH,1); Req = SIM_transreson(Wcompinvn2,NCH,1); tf = Req*Ceq; st2del = SIM_horowitz(inputtime,tf,PARM(VTHCOMPINV),PARM(VTHCOMPINV),RISE); nextinputtime = st1del/(1.0-PARM(VTHCOMPINV)); /* Third Inverter */ Ceq = SIM_gatecap(Wevalinvn+Wevalinvp,10.0) + SIM_draincap(Wcompinvp3,PCH,1) + SIM_draincap(Wcompinvn3,NCH,1); Req = SIM_transreson(Wcompinvp3,PCH,1); tf = Req*Ceq; st3del = SIM_horowitz(nextinputtime,tf,PARM(VTHCOMPINV),PARM(VTHEVALINV),FALL); nextinputtime = st1del/(PARM(VTHEVALINV)); /* Final Inverter (virtual ground driver) discharging compare part */ tagbits = PARM(ADDRESS_BITS) - (int)logtwo((double)C) + (int)logtwo((double)A); cols = tagbits*Ntbl*Ntspd; r1 = SIM_transreson(Wcompn,NCH,2); r2 = SIM_transresswitch(Wevalinvn,NCH,1); c2 = (tagbits)*(SIM_draincap(Wcompn,NCH,1)+SIM_draincap(Wcompn,NCH,2))+ SIM_draincap(Wevalinvp,PCH,1) + SIM_draincap(Wevalinvn,NCH,1); c1 = (tagbits)*(SIM_draincap(Wcompn,NCH,1)+SIM_draincap(Wcompn,NCH,2)) +SIM_draincap(Wcompp,PCH,1) + SIM_gatecap(Wmuxdrv12n+Wmuxdrv12p,20.0) + cols*Cwordmetal; /* time to go to threshold of mux driver */ tstep = (r2*c2+(r1+r2)*c1)*log(1.0/PARM(VTHMUXDRV1)); /* take into account non-zero input rise time */ m = Vdd/nextinputtime; if ((tstep) <= (0.5*(Vdd-Vt)/m)) { a = m; b = 2*((Vdd*PARM(VTHEVALINV))-Vt); c = 
-2*(tstep)*(Vdd-Vt)+1/m*((Vdd*PARM(VTHEVALINV))-Vt)*((Vdd*PARM(VTHEVALINV))-Vt); Tcomparatorni = (-b+sqrt(b*b-4*a*c))/(2*a); } else { Tcomparatorni = (tstep) + (Vdd+Vt)/(2*m) - (Vdd*PARM(VTHEVALINV))/m; } *outputtime = Tcomparatorni/(1.0-PARM(VTHMUXDRV1)); return(Tcomparatorni+st1del+st2del+st3del); }
/* Decoder delay in the tag array (see section 6.1 of tech report) */ double SIM_decoder_tag_delay(int C, int B, int A, int Ndwl, int Ndbl, int Nspd, int Ntwl, int Ntbl, int Ntspd, double *Tdecdrive, double *Tdecoder1, double *Tdecoder2, double *outrisetime) { double Ceq,Req,Rwire,rows,tf,nextinputtime,vth = 0,tstep,m,a,b,c; int numstack; /* Calculate rise time. Consider two inverters */ Ceq = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1) + SIM_gatecap(Wdecdrivep+Wdecdriven,0.0); tf = Ceq*SIM_transreson(Wdecdriven,NCH,1); nextinputtime = SIM_horowitz(0.0,tf,PARM(VTHINV100x60),PARM(VTHINV100x60),FALL)/ (PARM(VTHINV100x60)); Ceq = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1) + SIM_gatecap(Wdecdrivep+Wdecdriven,0.0); tf = Ceq*SIM_transreson(Wdecdriven,NCH,1); nextinputtime = SIM_horowitz(nextinputtime,tf,PARM(VTHINV100x60),PARM(VTHINV100x60), RISE)/ (1.0-PARM(VTHINV100x60)); /* First stage: driving the decoders */ rows = C/(8*B*A*Ntbl*Ntspd); Ceq = SIM_draincap(Wdecdrivep,PCH,1)+SIM_draincap(Wdecdriven,NCH,1) + 4*SIM_gatecap(Wdec3to8n+Wdec3to8p,10.0)*(Ntwl*Ntbl)+ Cwordmetal*0.25*8*B*A*Ntbl*Ntspd; Rwire = Rwordmetal*0.125*8*B*A*Ntbl*Ntspd; tf = (Rwire + SIM_transreson(Wdecdrivep,PCH,1))*Ceq; *Tdecdrive = SIM_horowitz(nextinputtime,tf,PARM(VTHINV100x60),PARM(VTHNAND60x90), FALL); nextinputtime = *Tdecdrive/PARM(VTHNAND60x90); /* second stage: driving a bunch of nor gates with a nand */ numstack = (int)(ceil((1.0/3.0)*logtwo( (double)((double)C/(double)(B*A*Ntbl*Ntspd))))); if (numstack==0) numstack = 1; if (numstack>5) numstack = 5; Ceq = 3*SIM_draincap(Wdec3to8p,PCH,1) +SIM_draincap(Wdec3to8n,NCH,3) + SIM_gatecap(WdecNORn+WdecNORp,((numstack*40)+20.0))*rows + Cbitmetal*rows*8; Rwire = Rbitmetal*rows*8/2; tf = Ceq*(Rwire+SIM_transreson(Wdec3to8n,NCH,3)); /* we only want to charge the output to the threshold of the nor gate. But the threshold depends on the number of inputs to the nor. 
*/ switch(numstack) { case 1: vth = PARM(VTHNOR12x4x1); break; case 2: vth = PARM(VTHNOR12x4x2); break; case 3: vth = PARM(VTHNOR12x4x3); break; case 4: vth = PARM(VTHNOR12x4x4); break; case 5: vth = PARM(VTHNOR12x4x4); break; case 6: vth = PARM(VTHNOR12x4x4); break; default: printf("error:numstack=%d\n",numstack); } *Tdecoder1 = SIM_horowitz(nextinputtime,tf,PARM(VTHNAND60x90),vth,RISE); nextinputtime = *Tdecoder1/(1.0-vth); /* Final stage: driving an inverter with the nor */ Req = SIM_transreson(WdecNORp,PCH,numstack); Ceq = (SIM_gatecap(Wdecinvn+Wdecinvp,20.0)+ numstack*SIM_draincap(WdecNORn,NCH,1)+ SIM_draincap(WdecNORp,PCH,numstack)); tf = Req*Ceq; *Tdecoder2 = SIM_horowitz(nextinputtime,tf,vth,PARM(VSINV),FALL); *outrisetime = *Tdecoder2/(PARM(VSINV)); return(*Tdecdrive+*Tdecoder1+*Tdecoder2); }