Beispiel #1
0
void count_routing_transistors (int num_switch, float R_minW_nmos,
                                float R_minW_pmos) {

    /* Counts how many transistors are needed to implement the FPGA routing      *
     * resources.  Call this only when an rr_graph exists.  It does not count    *
     * the transistors used in logic blocks, but it counts the transistors in    *
     * the input connection block multiplexers and in the output pin drivers and *
     * pass transistors.  NB:  this routine assumes pass transistors always      *
     * generate two edges (one forward, one backward) between two nodes.         *
     * Physically, this is what happens -- make sure your rr_graph does it.      *
     *                                                                           *
     * I assume a minimum width transistor takes 1 unit of area.  A double-width *
     * transistor takes the twice the diffusion width, but the same spacing, so  *
     * I assume it takes 1.5x the area of a minimum-width transitor.  I always   *
     * design tri-state buffers as a buffer followed by a pass transistor.       *
     * I make Rbuffer = Rpass_transitor = 1/2 Rtri-state_buffer.                 *
     * I make the pull-up and pull-down sides of the buffer the same strength -- *
     * i.e. I make the p transistor R_minW_pmos / R_minW_nmos wider than the n   *
     * transistor.                                                               *
     *                                                                           *
     * I generate two area numbers in this routine:  ntrans_sharing and          *
     * ntrans_no_sharing.  ntrans_sharing exactly reflects what the timing       *
     * analyzer, etc. works with -- each switch is a completely self contained   *
     * pass transistor or tri-state buffer.  In the case of tri-state buffers    *
     * this is rather pessimisitic.  The inverter chain part of the buffer (as   *
     * opposed to the pass transistor + SRAM output part) can be shared by       *
     * several switches in the same location.  Obviously all the switches from   *
     * an OPIN can share one buffer.  Also, CHANX and CHANY switches at the same *
     * spot (i,j) on a single segment can share a buffer.  For a more realistic  *
     * area number I assume all buffered switches from a node that are at the    *
     * *same (i,j) location* can share one buffer.  Only the lowest resistance   *
     * (largest) buffer is implemented.  In practice, you might want to build    *
     * something that is 1.5x or 2x the largest buffer, so this may be a bit     *
     * optimistic (but I still think it's pretty reasonable).                    */


    int *num_inputs_to_cblock;  /* [0..num_rr_nodes-1], but all entries not    */
    /* corresponding to IPINs will be 0.           */

    boolean *cblock_counted;          /* [0..max(nx,ny)] -- 0th element unused. */
    float *shared_buffer_trans;       /* [0..max_nx,ny)] */
    float *unsharable_switch_trans, *sharable_switch_trans; /* [0..num_switch-1] */

    t_rr_type from_rr_type, to_rr_type;
    int from_node, to_node, iedge, num_edges, maxlen;
    int iswitch, i, j, iseg, max_inputs_to_cblock;
    float input_cblock_trans, shared_opin_buffer_trans;
    const float trans_sram_bit = 6.;

    /* Two variables below are the accumulator variables that add up all the    *
     * transistors in the routing.  Make doubles so that they don't stop        *
     * incrementing once adding a switch makes a change of less than 1 part in  *
     * 10^7 to the total.  If this still isn't good enough (adding 1 part in    *
     * 10^15 will still be thrown away), compute the transistor count in        *
     * "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing   *
     * the partial sums together.                                               */

    double ntrans_sharing, ntrans_no_sharing;


    /* Buffers from the routing to the ipin cblock inputs, and from the ipin    *
     * cblock outputs to the logic block, respectively.  Assume minimum size n  *
     * transistors, and ptransistors sized to make the pull-up R = pull-down R. */

    float trans_track_to_cblock_buf;
    float trans_cblock_to_lblock_buf;


    ntrans_sharing = 0.;
    ntrans_no_sharing = 0.;
    max_inputs_to_cblock = 0;

    /* Assume the two buffers below are 4x minimum drive strength (enough to *
     * drive a fanout of up to 16 pretty nicely -- should cover a reasonable *
     * wiring C plus the fanout.                                             */

    trans_track_to_cblock_buf = trans_per_buf (R_minW_nmos/4., R_minW_nmos,
                                R_minW_pmos);

    trans_cblock_to_lblock_buf = trans_per_buf (R_minW_nmos/4., R_minW_nmos,
                                 R_minW_pmos);

    /* trans_track_to_cblock_buf = 1. + trans_per_R (R_minW_nmos, R_minW_pmos);
     trans_cblock_to_lblock_buf = 1. + trans_per_R (R_minW_nmos, R_minW_pmos); */

    num_inputs_to_cblock = (int *) my_calloc (num_rr_nodes, sizeof (int));

    maxlen = max (nx, ny) + 1;
    cblock_counted = (boolean *) my_calloc (maxlen, sizeof (boolean));
    shared_buffer_trans = (float *) my_calloc (maxlen, sizeof (float));

    unsharable_switch_trans = alloc_and_load_unsharable_switch_trans (num_switch,
                              trans_sram_bit, R_minW_nmos);

    sharable_switch_trans = alloc_and_load_sharable_switch_trans (num_switch,
                            trans_sram_bit, R_minW_nmos, R_minW_pmos);

    for (from_node=0; from_node<num_rr_nodes; from_node++) {

        from_rr_type = rr_node[from_node].type;

        switch (from_rr_type) {

        case CHANX:
        case CHANY:
            num_edges = rr_node[from_node].num_edges;

            for (iedge=0; iedge<num_edges; iedge++) {

                to_node = rr_node[from_node].edges[iedge];
                to_rr_type = rr_node[to_node].type;

                switch (to_rr_type) {

                case CHANX:
                case CHANY:
                    iswitch = rr_node[from_node].switches[iedge];

                    if (switch_inf[iswitch].buffered) {
                        iseg = seg_index_of_sblock (from_node, to_node);
                        shared_buffer_trans[iseg] = max (shared_buffer_trans[iseg],
                                                         sharable_switch_trans[iswitch]);

                        ntrans_no_sharing += unsharable_switch_trans[iswitch] +
                                             sharable_switch_trans[iswitch];
                        ntrans_sharing += unsharable_switch_trans[iswitch];
                    }
                    else if (from_node < to_node) {

                        /* Pass transistor shared by two edges -- only count once.  *
                         * Also, no part of a pass transistor is sharable.          */

                        ntrans_no_sharing += unsharable_switch_trans[iswitch];
                        ntrans_sharing += unsharable_switch_trans[iswitch];
                    }
                    break;

                case IPIN:
                    num_inputs_to_cblock[to_node]++;
                    max_inputs_to_cblock = max (max_inputs_to_cblock,
                                                num_inputs_to_cblock[to_node]);

                    iseg = seg_index_of_cblock (from_rr_type, to_node);

                    if (cblock_counted[iseg] == FALSE) {
                        cblock_counted[iseg] = TRUE;
                        ntrans_sharing += trans_track_to_cblock_buf;
                        ntrans_no_sharing += trans_track_to_cblock_buf;
                    }
                    break;

                default:
                    printf ("Error in count_routing_transistors:  Unexpected \n"
                            "connection from node %d (type %d) to node %d (type %d).\n",
                            from_node, from_rr_type, to_node, to_rr_type);
                    exit (1);
                    break;

                }   /* End switch on to_rr_type. */

            }   /* End for each edge. */

            /* Now add in the shared buffer transistors, and reset some flags. */

            if (from_rr_type == CHANX) {
                for (i=rr_node[from_node].xlow-1; i<=rr_node[from_node].xhigh; i++) {
                    ntrans_sharing += shared_buffer_trans[i];
                    shared_buffer_trans[i] = 0.;
                }

                for (i=rr_node[from_node].xlow; i<=rr_node[from_node].xhigh; i++)
                    cblock_counted[i] = FALSE;

            }
            else {  /* CHANY */
                for (j=rr_node[from_node].ylow-1; j<=rr_node[from_node].yhigh; j++) {
                    ntrans_sharing += shared_buffer_trans[j];
                    shared_buffer_trans[j] = 0.;
                }

                for (j=rr_node[from_node].ylow; j<=rr_node[from_node].yhigh; j++)
                    cblock_counted[j] = FALSE;

            }
            break;

        case OPIN:
            num_edges = rr_node[from_node].num_edges;
            shared_opin_buffer_trans = 0.;

            for (iedge=0; iedge<num_edges; iedge++) {
                iswitch = rr_node[from_node].switches[iedge];
                ntrans_no_sharing += unsharable_switch_trans[iswitch] +
                                     sharable_switch_trans[iswitch];
                ntrans_sharing += unsharable_switch_trans[iswitch];

                shared_opin_buffer_trans = max (shared_opin_buffer_trans,
                                                sharable_switch_trans[iswitch]);
            }

            ntrans_sharing += shared_opin_buffer_trans;
            break;

        default:
            break;

        }  /* End switch on from_rr_type */
    }  /* End for all nodes */

    free (cblock_counted);
    free (shared_buffer_trans);
    free (unsharable_switch_trans);
    free (sharable_switch_trans);

    /* Now add in the input connection block transistors. */

    input_cblock_trans = get_cblock_trans (num_inputs_to_cblock,
                                           max_inputs_to_cblock, trans_cblock_to_lblock_buf, trans_sram_bit);

    free (num_inputs_to_cblock);

    ntrans_sharing += input_cblock_trans;
    ntrans_no_sharing += input_cblock_trans;

    printf ("\nRouting area (in minimum width transistor areas):\n");
    printf ("Assuming no buffer sharing (pessimistic). Total: %#g  Per clb: "
            "%#g\n", ntrans_no_sharing, ntrans_no_sharing / (float) (nx * ny));
    printf ("Assuming buffer sharing (slightly optimistic). Total: %#g  Per clb: "
            "%#g\n\n", ntrans_sharing, ntrans_sharing / (float) (nx * ny));
}
void add_rr_graph_C_from_switches(float C_ipin_cblock) {

	/* This routine finishes loading the C elements of the rr_graph. It assumes *
	 * that when you call it the CHANX and CHANY nodes have had their C set to  *
	 * their metal capacitance, and everything else has C set to 0.  The graph  *
	 * connectivity (edges, switch types etc.) must all be loaded too.  This    *
	 * routine will add in the capacitance on the CHANX and CHANY nodes due to: *
	 *                                                                          *
	 * 1) The output capacitance of the switches coming from OPINs;             *
	 * 2) The input and output capacitance of the switches between the various  *
	 *    wiring (CHANX and CHANY) segments; and                                *
	 * 3) The input capacitance of the buffers separating routing tracks from   *
	 *    the connection block inputs.                                          */

	int inode, iedge, switch_index, to_node, maxlen;
	int icblock, isblock, iseg_low, iseg_high;
	float Cin, Cout;
	t_rr_type from_rr_type, to_rr_type;
	boolean * cblock_counted; /* [0..max(nx,ny)] -- 0th element unused. */
	float *buffer_Cin; /* [0..max(nx,ny)] */
	boolean buffered;
	float *Couts_to_add; /* UDSD */

	maxlen = std::max(nx, ny) + 1;
	cblock_counted = (boolean *) my_calloc(maxlen, sizeof(boolean));
	buffer_Cin = (float *) my_calloc(maxlen, sizeof(float));

	for (inode = 0; inode < num_rr_nodes; inode++) {

		from_rr_type = rr_node[inode].type;

		if (from_rr_type == CHANX || from_rr_type == CHANY) {

			for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) {

				to_node = rr_node[inode].edges[iedge];
				to_rr_type = rr_node[to_node].type;

				if (to_rr_type == CHANX || to_rr_type == CHANY) {

					switch_index = rr_node[inode].switches[iedge];
					Cin = switch_inf[switch_index].Cin;
					Cout = switch_inf[switch_index].Cout;
					buffered = switch_inf[switch_index].buffered;

					/* If both the switch from inode to to_node and the switch from *
					 * to_node back to inode use bidirectional switches (i.e. pass  *
					 * transistors), there will only be one physical switch for     *
					 * both edges.  Hence, I only want to count the capacitance of  *
					 * that switch for one of the two edges.  (Note:  if there is   *
					 * a pass transistor edge from x to y, I always build the graph *
					 * so that there is a corresponding edge using the same switch  *
					 * type from y to x.) So, I arbitrarily choose to add in the    *
					 * capacitance in that case of a pass transistor only when      *
					 * processing the the lower inode number.                       *
					 * If an edge uses a buffer I always have to add in the output  *
					 * capacitance.  I assume that buffers are shared at the same   *
					 * (i,j) location, so only one input capacitance needs to be    *
					 * added for all the buffered switches at that location.  If    *
					 * the buffers at that location have different sizes, I use the *
					 * input capacitance of the largest one.                        */

					if (!buffered && inode < to_node) { /* Pass transistor. */
						rr_node[inode].C += Cin;
						rr_node[to_node].C += Cout;
					}

					else if (buffered) {
						/* Prevent double counting of capacitance for UDSD */
						if (rr_node[to_node].drivers != SINGLE) {
							/* For multiple-driver architectures the output capacitance can
							 * be added now since each edge is actually a driver */
							rr_node[to_node].C += Cout;
						}
						isblock = seg_index_of_sblock(inode, to_node);
						buffer_Cin[isblock] = std::max(buffer_Cin[isblock], Cin);
					}

				}
				/* End edge to CHANX or CHANY node. */
				else if (to_rr_type == IPIN) {

					/* Code below implements sharing of the track to connection     *
					 * box buffer.  I assume there is one such buffer at every      *
					 * segment of the wire at which at least one logic block input  *
					 * connects.                                                    */

					icblock = seg_index_of_cblock(from_rr_type, to_node);
					if (cblock_counted[icblock] == FALSE) {
						rr_node[inode].C += C_ipin_cblock;
						cblock_counted[icblock] = TRUE;
					}
				}
			} /* End loop over all edges of a node. */

			/* Reset the cblock_counted and buffer_Cin arrays, and add buf Cin. */

			/* Method below would be faster for very unpopulated segments, but I  *
			 * think it would be slower overall for most FPGAs, so commented out. */

			/*   for (iedge=0;iedge<rr_node[inode].num_edges;iedge++) {
			 * to_node = rr_node[inode].edges[iedge];
			 * if (rr_node[to_node].type == IPIN) {
			 * icblock = seg_index_of_cblock (from_rr_type, to_node);
			 * cblock_counted[icblock] = FALSE;
			 * }
			 * }     */

			if (from_rr_type == CHANX) {
				iseg_low = rr_node[inode].xlow;
				iseg_high = rr_node[inode].xhigh;
			} else { /* CHANY */
				iseg_low = rr_node[inode].ylow;
				iseg_high = rr_node[inode].yhigh;
			}

			for (icblock = iseg_low; icblock <= iseg_high; icblock++) {
				cblock_counted[icblock] = FALSE;
			}

			for (isblock = iseg_low - 1; isblock <= iseg_high; isblock++) {
				rr_node[inode].C += buffer_Cin[isblock]; /* Biggest buf Cin at loc */
				buffer_Cin[isblock] = 0.;
			}

		}
		/* End node is CHANX or CHANY */
		else if (from_rr_type == OPIN) {

			for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) {
				switch_index = rr_node[inode].switches[iedge];
				/* UDSD by ICK Start */
				to_node = rr_node[inode].edges[iedge];
				to_rr_type = rr_node[to_node].type;
				assert(to_rr_type == CHANX || to_rr_type == CHANY || to_rr_type == IPIN);
				if (rr_node[to_node].drivers != SINGLE) {
					Cout = switch_inf[switch_index].Cout;
					to_node = rr_node[inode].edges[iedge]; /* Will be CHANX or CHANY or IPIN */
					rr_node[to_node].C += Cout;
				}
			}
		}
		/* End node is OPIN. */
	} /* End for all nodes. */

	/* Now we need to add any cout loads for nets that we previously didn't process
	 * Current structures only keep switch information from a node to the next node and
	 * not the reverse.  Therefore I need to go through all the possible edges to figure 
	 * out what the Cout's should be */
	Couts_to_add = (float *) my_calloc(num_rr_nodes, sizeof(float));
	for (inode = 0; inode < num_rr_nodes; inode++) {
		for (iedge = 0; iedge < rr_node[inode].num_edges; iedge++) {
			switch_index = rr_node[inode].switches[iedge];
			to_node = rr_node[inode].edges[iedge];
			to_rr_type = rr_node[to_node].type;
			if (to_rr_type == CHANX || to_rr_type == CHANY) {
				if (rr_node[to_node].drivers == SINGLE) {
					/* Cout was not added in these cases */
					if (Couts_to_add[to_node] != 0) {
						/* We've already found a Cout to add to this node
						 * We could take the max of all possibilities but
						 * instead I will fail if there are conflicting Couts */
						if (Couts_to_add[to_node]
								!= switch_inf[switch_index].Cout) {
							vpr_printf(TIO_MESSAGE_ERROR, "A single driver resource (%i) is driven by different Cout's (%e!=%e)\n",
									to_node, Couts_to_add[to_node],
									switch_inf[switch_index].Cout);
							exit(1);
						}
					}
					Couts_to_add[to_node] = switch_inf[switch_index].Cout;

				}
			}
		}
	}
	for (inode = 0; inode < num_rr_nodes; inode++) {
		rr_node[inode].C += Couts_to_add[inode];
	}
	free(Couts_to_add);
	free(cblock_counted);
	free(buffer_Cin);
}