void count_routing_transistors (int num_switch, float R_minW_nmos, float R_minW_pmos) { /* Counts how many transistors are needed to implement the FPGA routing * * resources. Call this only when an rr_graph exists. It does not count * * the transistors used in logic blocks, but it counts the transistors in * * the input connection block multiplexers and in the output pin drivers and * * pass transistors. NB: this routine assumes pass transistors always * * generate two edges (one forward, one backward) between two nodes. * * Physically, this is what happens -- make sure your rr_graph does it. * * * * I assume a minimum width transistor takes 1 unit of area. A double-width * * transistor takes the twice the diffusion width, but the same spacing, so * * I assume it takes 1.5x the area of a minimum-width transitor. I always * * design tri-state buffers as a buffer followed by a pass transistor. * * I make Rbuffer = Rpass_transitor = 1/2 Rtri-state_buffer. * * I make the pull-up and pull-down sides of the buffer the same strength -- * * i.e. I make the p transistor R_minW_pmos / R_minW_nmos wider than the n * * transistor. * * * * I generate two area numbers in this routine: ntrans_sharing and * * ntrans_no_sharing. ntrans_sharing exactly reflects what the timing * * analyzer, etc. works with -- each switch is a completely self contained * * pass transistor or tri-state buffer. In the case of tri-state buffers * * this is rather pessimisitic. The inverter chain part of the buffer (as * * opposed to the pass transistor + SRAM output part) can be shared by * * several switches in the same location. Obviously all the switches from * * an OPIN can share one buffer. Also, CHANX and CHANY switches at the same * * spot (i,j) on a single segment can share a buffer. For a more realistic * * area number I assume all buffered switches from a node that are at the * * *same (i,j) location* can share one buffer. Only the lowest resistance * * (largest) buffer is implemented. In practice, you might want to build * * something that is 1.5x or 2x the largest buffer, so this may be a bit * * optimistic (but I still think it's pretty reasonable). */ int *num_inputs_to_cblock; /* [0..num_rr_nodes-1], but all entries not */ /* corresponding to IPINs will be 0. */ boolean *cblock_counted; /* [0..max(nx,ny)] -- 0th element unused. */ float *shared_buffer_trans; /* [0..max_nx,ny)] */ float *unsharable_switch_trans, *sharable_switch_trans; /* [0..num_switch-1] */ t_rr_type from_rr_type, to_rr_type; int from_node, to_node, iedge, num_edges, maxlen; int iswitch, i, j, iseg, max_inputs_to_cblock; float input_cblock_trans, shared_opin_buffer_trans; const float trans_sram_bit = 6.; /* Two variables below are the accumulator variables that add up all the * * transistors in the routing. Make doubles so that they don't stop * * incrementing once adding a switch makes a change of less than 1 part in * * 10^7 to the total. If this still isn't good enough (adding 1 part in * * 10^15 will still be thrown away), compute the transistor count in * * "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing * * the partial sums together. */ double ntrans_sharing, ntrans_no_sharing; /* Buffers from the routing to the ipin cblock inputs, and from the ipin * * cblock outputs to the logic block, respectively. Assume minimum size n * * transistors, and ptransistors sized to make the pull-up R = pull-down R. */ float trans_track_to_cblock_buf; float trans_cblock_to_lblock_buf; ntrans_sharing = 0.; ntrans_no_sharing = 0.; max_inputs_to_cblock = 0; /* Assume the two buffers below are 4x minimum drive strength (enough to * * drive a fanout of up to 16 pretty nicely -- should cover a reasonable * * wiring C plus the fanout. */ trans_track_to_cblock_buf = trans_per_buf (R_minW_nmos/4., R_minW_nmos, R_minW_pmos); trans_cblock_to_lblock_buf = trans_per_buf (R_minW_nmos/4., R_minW_nmos, R_minW_pmos); /* trans_track_to_cblock_buf = 1. + trans_per_R (R_minW_nmos, R_minW_pmos); trans_cblock_to_lblock_buf = 1. + trans_per_R (R_minW_nmos, R_minW_pmos); */ num_inputs_to_cblock = (int *) my_calloc (num_rr_nodes, sizeof (int)); maxlen = max (nx, ny) + 1; cblock_counted = (boolean *) my_calloc (maxlen, sizeof (boolean)); shared_buffer_trans = (float *) my_calloc (maxlen, sizeof (float)); unsharable_switch_trans = alloc_and_load_unsharable_switch_trans (num_switch, trans_sram_bit, R_minW_nmos); sharable_switch_trans = alloc_and_load_sharable_switch_trans (num_switch, trans_sram_bit, R_minW_nmos, R_minW_pmos); for (from_node=0; from_node<num_rr_nodes; from_node++) { from_rr_type = rr_node[from_node].type; switch (from_rr_type) { case CHANX: case CHANY: num_edges = rr_node[from_node].num_edges; for (iedge=0; iedge<num_edges; iedge++) { to_node = rr_node[from_node].edges[iedge]; to_rr_type = rr_node[to_node].type; switch (to_rr_type) { case CHANX: case CHANY: iswitch = rr_node[from_node].switches[iedge]; if (switch_inf[iswitch].buffered) { iseg = seg_index_of_sblock (from_node, to_node); shared_buffer_trans[iseg] = max (shared_buffer_trans[iseg], sharable_switch_trans[iswitch]); ntrans_no_sharing += unsharable_switch_trans[iswitch] + sharable_switch_trans[iswitch]; ntrans_sharing += unsharable_switch_trans[iswitch]; } else if (from_node < to_node) { /* Pass transistor shared by two edges -- only count once. * * Also, no part of a pass transistor is sharable. */ ntrans_no_sharing += unsharable_switch_trans[iswitch]; ntrans_sharing += unsharable_switch_trans[iswitch]; } break; case IPIN: num_inputs_to_cblock[to_node]++; max_inputs_to_cblock = max (max_inputs_to_cblock, num_inputs_to_cblock[to_node]); iseg = seg_index_of_cblock (from_rr_type, to_node); if (cblock_counted[iseg] == FALSE) { cblock_counted[iseg] = TRUE; ntrans_sharing += trans_track_to_cblock_buf; ntrans_no_sharing += trans_track_to_cblock_buf; } break; default: printf ("Error in count_routing_transistors: Unexpected \n" "connection from node %d (type %d) to node %d (type %d).\n", from_node, from_rr_type, to_node, to_rr_type); exit (1); break; } /* End switch on to_rr_type. */ } /* End for each edge. */ /* Now add in the shared buffer transistors, and reset some flags. */ if (from_rr_type == CHANX) { for (i=rr_node[from_node].xlow-1; i<=rr_node[from_node].xhigh; i++) { ntrans_sharing += shared_buffer_trans[i]; shared_buffer_trans[i] = 0.; } for (i=rr_node[from_node].xlow; i<=rr_node[from_node].xhigh; i++) cblock_counted[i] = FALSE; } else { /* CHANY */ for (j=rr_node[from_node].ylow-1; j<=rr_node[from_node].yhigh; j++) { ntrans_sharing += shared_buffer_trans[j]; shared_buffer_trans[j] = 0.; } for (j=rr_node[from_node].ylow; j<=rr_node[from_node].yhigh; j++) cblock_counted[j] = FALSE; } break; case OPIN: num_edges = rr_node[from_node].num_edges; shared_opin_buffer_trans = 0.; for (iedge=0; iedge<num_edges; iedge++) { iswitch = rr_node[from_node].switches[iedge]; ntrans_no_sharing += unsharable_switch_trans[iswitch] + sharable_switch_trans[iswitch]; ntrans_sharing += unsharable_switch_trans[iswitch]; shared_opin_buffer_trans = max (shared_opin_buffer_trans, sharable_switch_trans[iswitch]); } ntrans_sharing += shared_opin_buffer_trans; break; default: break; } /* End switch on from_rr_type */ } /* End for all nodes */ free (cblock_counted); free (shared_buffer_trans); free (unsharable_switch_trans); free (sharable_switch_trans); /* Now add in the input connection block transistors. */ input_cblock_trans = get_cblock_trans (num_inputs_to_cblock, max_inputs_to_cblock, trans_cblock_to_lblock_buf, trans_sram_bit); free (num_inputs_to_cblock); ntrans_sharing += input_cblock_trans; ntrans_no_sharing += input_cblock_trans; printf ("\nRouting area (in minimum width transistor areas):\n"); printf ("Assuming no buffer sharing (pessimistic). Total: %#g Per clb: " "%#g\n", ntrans_no_sharing, ntrans_no_sharing / (float) (nx * ny)); printf ("Assuming buffer sharing (slightly optimistic). Total: %#g Per clb: " "%#g\n\n", ntrans_sharing, ntrans_sharing / (float) (nx * ny)); }
void add_rr_graph_C_from_switches(float C_ipin_cblock) { /* This routine finishes loading the C elements of the rr_graph. It assumes * * that when you call it the CHANX and CHANY nodes have had their C set to * * their metal capacitance, and everything else has C set to 0. The graph * * connectivity (edges, switch types etc.) must all be loaded too. This * * routine will add in the capacitance on the CHANX and CHANY nodes due to: * * * * 1) The output capacitance of the switches coming from OPINs; * * 2) The input and output capacitance of the switches between the various * * wiring (CHANX and CHANY) segments; and * * 3) The input capacitance of the buffers separating routing tracks from * * the connection block inputs. */ int inode, iedge, switch_index, to_node, maxlen; int icblock, isblock, iseg_low, iseg_high; float Cin, Cout; t_rr_type from_rr_type, to_rr_type; boolean * cblock_counted; /* [0..max(nx,ny)] -- 0th element unused. */ float *buffer_Cin; /* [0..max(nx,ny)] */ boolean buffered; float *Couts_to_add; /* UDSD */ maxlen = std::max(nx, ny) + 1; cblock_counted = (boolean *) my_calloc(maxlen, sizeof(boolean)); buffer_Cin = (float *) my_calloc(maxlen, sizeof(float)); for (inode = 0; inode < num_rr_nodes; inode++) { from_rr_type = rr_node[inode].type; if (from_rr_type == CHANX || from_rr_type == CHANY) { for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) { to_node = rr_node[inode].edges[iedge]; to_rr_type = rr_node[to_node].type; if (to_rr_type == CHANX || to_rr_type == CHANY) { switch_index = rr_node[inode].switches[iedge]; Cin = switch_inf[switch_index].Cin; Cout = switch_inf[switch_index].Cout; buffered = switch_inf[switch_index].buffered; /* If both the switch from inode to to_node and the switch from * * to_node back to inode use bidirectional switches (i.e. pass * * transistors), there will only be one physical switch for * * both edges. Hence, I only want to count the capacitance of * * that switch for one of the two edges. (Note: if there is * * a pass transistor edge from x to y, I always build the graph * * so that there is a corresponding edge using the same switch * * type from y to x.) So, I arbitrarily choose to add in the * * capacitance in that case of a pass transistor only when * * processing the the lower inode number. * * If an edge uses a buffer I always have to add in the output * * capacitance. I assume that buffers are shared at the same * * (i,j) location, so only one input capacitance needs to be * * added for all the buffered switches at that location. If * * the buffers at that location have different sizes, I use the * * input capacitance of the largest one. */ if (!buffered && inode < to_node) { /* Pass transistor. */ rr_node[inode].C += Cin; rr_node[to_node].C += Cout; } else if (buffered) { /* Prevent double counting of capacitance for UDSD */ if (rr_node[to_node].drivers != SINGLE) { /* For multiple-driver architectures the output capacitance can * be added now since each edge is actually a driver */ rr_node[to_node].C += Cout; } isblock = seg_index_of_sblock(inode, to_node); buffer_Cin[isblock] = std::max(buffer_Cin[isblock], Cin); } } /* End edge to CHANX or CHANY node. */ else if (to_rr_type == IPIN) { /* Code below implements sharing of the track to connection * * box buffer. I assume there is one such buffer at every * * segment of the wire at which at least one logic block input * * connects. */ icblock = seg_index_of_cblock(from_rr_type, to_node); if (cblock_counted[icblock] == FALSE) { rr_node[inode].C += C_ipin_cblock; cblock_counted[icblock] = TRUE; } } } /* End loop over all edges of a node. */ /* Reset the cblock_counted and buffer_Cin arrays, and add buf Cin. */ /* Method below would be faster for very unpopulated segments, but I * * think it would be slower overall for most FPGAs, so commented out. */ /* for (iedge=0;iedge<rr_node[inode].num_edges;iedge++) { * to_node = rr_node[inode].edges[iedge]; * if (rr_node[to_node].type == IPIN) { * icblock = seg_index_of_cblock (from_rr_type, to_node); * cblock_counted[icblock] = FALSE; * } * } */ if (from_rr_type == CHANX) { iseg_low = rr_node[inode].xlow; iseg_high = rr_node[inode].xhigh; } else { /* CHANY */ iseg_low = rr_node[inode].ylow; iseg_high = rr_node[inode].yhigh; } for (icblock = iseg_low; icblock <= iseg_high; icblock++) { cblock_counted[icblock] = FALSE; } for (isblock = iseg_low - 1; isblock <= iseg_high; isblock++) { rr_node[inode].C += buffer_Cin[isblock]; /* Biggest buf Cin at loc */ buffer_Cin[isblock] = 0.; } } /* End node is CHANX or CHANY */ else if (from_rr_type == OPIN) { for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) { switch_index = rr_node[inode].switches[iedge]; /* UDSD by ICK Start */ to_node = rr_node[inode].edges[iedge]; to_rr_type = rr_node[to_node].type; assert(to_rr_type == CHANX || to_rr_type == CHANY || to_rr_type == IPIN); if (rr_node[to_node].drivers != SINGLE) { Cout = switch_inf[switch_index].Cout; to_node = rr_node[inode].edges[iedge]; /* Will be CHANX or CHANY or IPIN */ rr_node[to_node].C += Cout; } } } /* End node is OPIN. */ } /* End for all nodes. */ /* Now we need to add any cout loads for nets that we previously didn't process * Current structures only keep switch information from a node to the next node and * not the reverse. Therefore I need to go through all the possible edges to figure * out what the Cout's should be */ Couts_to_add = (float *) my_calloc(num_rr_nodes, sizeof(float)); for (inode = 0; inode < num_rr_nodes; inode++) { for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) { switch_index = rr_node[inode].switches[iedge]; to_node = rr_node[inode].edges[iedge]; to_rr_type = rr_node[to_node].type; if (to_rr_type == CHANX || to_rr_type == CHANY) { if (rr_node[to_node].drivers == SINGLE) { /* Cout was not added in these cases */ if (Couts_to_add[to_node] != 0) { /* We've already found a Cout to add to this node * We could take the max of all possibilities but * instead I will fail if there are conflicting Couts */ if (Couts_to_add[to_node] != switch_inf[switch_index].Cout) { vpr_printf(TIO_MESSAGE_ERROR, "A single driver resource (%i) is driven by different Cout's (%e!=%e)\n", to_node, Couts_to_add[to_node], switch_inf[switch_index].Cout); exit(1); } } Couts_to_add[to_node] = switch_inf[switch_index].Cout; } } } } for (inode = 0; inode < num_rr_nodes; inode++) { rr_node[inode].C += Couts_to_add[inode]; } free(Couts_to_add); free(cblock_counted); free(buffer_Cin); }