verilog-to-routing · WhiteNinjaZ · Jun 13, 2025 · Jun 13, 2025 · Jun 14, 2025 · Jun 16, 2025
diff --git a/parmys/parmys-plugin/core/multiplier.cc b/parmys/parmys-plugin/core/multiplier.cc
@@ -937,7 +937,7 @@ void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
  *-----------------------------------------------------------------------*/
 void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
 {
-    nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addbig;
+    nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addsmall2, *addbig;
     int size;
 
     /* Check for a legitimate split */
@@ -976,50 +976,153 @@ void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *
     init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
     mult_list = insert_in_vptr_list(mult_list, a1b0);
 
-    /* New node for the initial add */
-    addsmall = allocate_nnode(node->loc);
-    addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
-    strcpy(addsmall->name, node->name);
-    strcat(addsmall->name, "-add0");
-    // this addition will have a carry out in the worst case, add to input pins and connect then to gnd
-    init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);
-
-    /* New node for the BIG add */
-    addbig = allocate_nnode(node->loc);
-    addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
-    strcpy(addbig->name, node->name);
-    strcat(addbig->name, "-add1");
-    init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);
-
-    // connect inputs to port a of addsmall
-    for (int i = 0; i < a1b0->num_output_pins; i++)
-        connect_nodes(a1b0, i, addsmall, i);
-    add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
-    // connect inputs to port b of addsmall
-    for (int i = 0; i < a0b1->num_output_pins; i++)
-        connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
-    add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);
-
-    // connect inputs to port a of addbig
-    size = addsmall->num_output_pins;
-    for (int i = 0; i < size; i++)
-        connect_nodes(addsmall, i, addbig, i);
-
-    // connect inputs to port b of addbig
-    for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
-        connect_nodes(a0b0, i, addbig, i - b0 + size);
-    size = size + a0b0->output_port_sizes[0] - b0;
-    for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
-        connect_nodes(a1b1, i, addbig, i + size);
-
-    // remap the multiplier outputs coming directly from a0b0
-    for (int i = 0; i < b0; i++) {
-        remap_pin_to_new_node(node->output_pins[i], a0b0, i);
-    }
+    // using the balanced addition method only works if a0 and b0 are the same size
+    // (i.e. if the input ports on the hardware multiplier are equal)
+    if (b0 == a0) {
+        /* New node for the initial add */
+        addsmall = allocate_nnode(node->loc);
+        addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
+        strcpy(addsmall->name, node->name);
+        strcat(addsmall->name, "-add0");
+        // this addition will have a carry out in the worst case, add to input pins and connect them to gnd
+        init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);
+
+        // connect inputs to port a of addsmall
+        for (int i = 0; i < a1b0->num_output_pins; i++)
+            connect_nodes(a1b0, i, addsmall, i);
+
+        add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
+        // connect inputs to port b of addsmall
+        for (int i = 0; i < a0b1->num_output_pins; i++)
+            connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
+        add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);
+
+        /* New node for the BIG add */
+        addbig = allocate_nnode(node->loc);
+        addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
+        strcpy(addbig->name, node->name);
+        strcat(addbig->name, "-add1");
+        init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);
+
+        // connect inputs to port a of addbig
+        size = addsmall->num_output_pins;
+        for (int i = 0; i < size; i++)
+            connect_nodes(addsmall, i, addbig, i);
+
+        // connect inputs to port b of addbig
+        for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
+            connect_nodes(a0b0, i, addbig, i - b0 + size);
+        size = size + a0b0->output_port_sizes[0] - b0;
+        for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
+            connect_nodes(a1b1, i, addbig, i + size);
+
+        // remap the multiplier outputs coming directly from a0b0
+        for (int i = 0; i < b0; i++) {
+            remap_pin_to_new_node(node->output_pins[i], a0b0, i);
+        }
+
+        // remap the multiplier outputs coming from addbig
+        for (int i = 0; i < addbig->num_output_pins; i++) {
+            remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
+        }
+    } else {
+        /* Expounding upon the description for the method in this function.
+        If we have two numbers A and B and we have a hardware multiplier of size a0xb0,
+        we can split them into two parts:
+        A = (A1 << a0) + A0
+        B = (B1 << b0) + B0
+        where A1 and B1 are the high bits of A and B, and A0 and B0 are the low bits.
+        Note that len(A0) = a0 and len(B0) = b0 by definition.
+        The multiplication of A and B can be expressed as:
+        A * B = ((A1 << a0) + A0) * ((B1 << b0) + B0)
+              = {(A1 * B1) << (a0 + b0)} + {((A1 * B0) << a0) + ((A0 * B1) << b0)} + {A0 * B0}
+        We split the additions up like so:
+        addsmall = ((A1 * B0) << a0) + ((A0 * B1) << b0) // can have carry
+        addsmall2 = ((A1 * B1) << (a0 + b0)) + (A0 * B0) // will not have carry
+        addbig = addsmall + addsmall2
+        This is a slightly modified version of the Karatsuba algorithm.
+        */
+        /////////////// Addsmall /////////////////////
+        addsmall = allocate_nnode(node->loc);
+        addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
+        strcpy(addsmall->name, node->name);
+        strcat(addsmall->name, "-add0");
+        init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + a0 + 1, a0b1->num_output_pins + b0 + 1);
+
+        // The first a0 pins of addsmall input connecting to a1b0 are connected to zero
+        for (int i = 0; i < a0; i++) {
+            add_input_pin_to_node(addsmall, get_zero_pin(netlist), i);
+        }
+
+        // connect inputs to port a of addsmall
+        for (int i = 0; i < a1b0->num_output_pins; i++) {
+            connect_nodes(a1b0, i, addsmall, i + a0);
+        }
+
+        // add zero pin for carry
+        add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins + a0);
+
+        // The first b0 pins of addsmall input connecting to a0b1 are connected to zero
+        for (int i = 0; i < b0; i++) {
+            add_input_pin_to_node(addsmall, get_zero_pin(netlist), i + addsmall->input_port_sizes[0]);
+        }
+
+        // connect inputs to port b of addsmall
+        for (int i = 0; i < a0b1->num_output_pins; i++) {
+            connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0] + b0);
+        }
+
+        // add zero pin for carry
+        add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0] + b0);
+
+        /////////////// Addsmall2 /////////////////////
+        addsmall2 = allocate_nnode(node->loc);
+        addsmall2->name = (char *)vtr::malloc(strlen(node->name) + 6);
+        strcpy(addsmall2->name, node->name);
+        strcat(addsmall2->name, "-add1");
+        init_multiplier_adder(addsmall2, a1b1, a1b1->num_output_pins + a0 + b0, a0b0->num_output_pins);
 
-    // remap the multiplier outputs coming from addbig
-    for (int i = 0; i < addbig->num_output_pins; i++) {
-        remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
+        // connect first a0+ b0 pins of addsmall2 to zero
+        for (int i = 0; i < a0 + b0; i++) {
+            add_input_pin_to_node(addsmall2, get_zero_pin(netlist), i);
+        }
+
+        // connect inputs to port a of addsmall2
+        for (int i = 0; i < a1b1->num_output_pins; i++) {
+            connect_nodes(a1b1, i, addsmall2, i + a0 + b0);
+        }
+
+        // connect inputs to port b of addsmall2
+        for (int i = 0; i < a0b0->output_port_sizes[0]; i++) {
+            connect_nodes(a0b0, i, addsmall2, i + addsmall2->input_port_sizes[0]);
+        }
+
+        /////////////// Addbig /////////////////////
+        addbig = allocate_nnode(node->loc);
+        addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
+        strcpy(addbig->name, node->name);
+        strcat(addbig->name, "-add2");
+        init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, addsmall2->num_output_pins);
+        // Here the final addition can have a carry out in the worst case; however,
+        // our final product will always only be the length of the longest input port, so regardless of the carry, the
+        // final add's carry will always drop out.
+
+        // connect inputs to port a of addbig
+        for (int i = 0; i < addsmall->num_output_pins; i++) {
+            connect_nodes(addsmall, i, addbig, i);
+        }
+        // add_input_pin_to_node(addbig, get_zero_pin(netlist), addsmall->num_output_pins);
+
+        // connect inputs to port b of addbig
+        for (int i = 0; i < addsmall2->num_output_pins; i++) {
+            connect_nodes(addsmall2, i, addbig, i + addbig->input_port_sizes[0]);
+        }
+        // add_input_pin_to_node(addbig, get_zero_pin(netlist), addbig->input_port_sizes[0] + addsmall->num_output_pins);
+
+        // remap the multiplier outputs coming from addbig
+        for (int i = 0; i < addbig->num_output_pins; i++) {
+            remap_pin_to_new_node(node->output_pins[i], addbig, i);
+        }
     }
 
     // CLEAN UP
@@ -1060,7 +1163,6 @@ void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
     strcat(a0b->name, "-0");
     init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
     mult_list = insert_in_vptr_list(mult_list, a0b);
-
     /* New node for a1b multiply */
     a1b = allocate_nnode(node->loc);
     a1b->name = (char *)vtr::malloc(strlen(node->name) + 3);
@@ -1184,7 +1286,6 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
 
     oassert(node->type == MULTIPLY);
     oassert(hard_multipliers != NULL);
-
     sizea = node->input_port_sizes[0];
     sizeb = node->input_port_sizes[1];
     sizeout = node->output_port_sizes[0];
@@ -1199,6 +1300,13 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
     }
     diffa = ina - sizea;
     diffb = inb - sizeb;
+    // Handle a multiplier whose size falls in the middle range of a hard block with unequal port sizes (e.g. mul_size > 18 && mul_size < 25)
+    if (diffb < 0) {
+        std::swap(ina, inb);
+        diffa = ina - sizea;
+        diffb = inb - sizeb;
+    }
+
     diffout = hard_multipliers->outputs->size - sizeout;
 
     if (configuration.split_hard_multiplier == 1) {
@@ -1281,11 +1389,10 @@ void iterate_multipliers(netlist_t *netlist)
     int mula, mulb;
     int a0, a1, b0, b1;
     nnode_t *node;
-
     /* Can only perform the optimisation if hard multipliers exist! */
     if (hard_multipliers == NULL)
         return;
-
+    // std::cin.get();
     sizea = hard_multipliers->inputs->size;
     sizeb = hard_multipliers->inputs->next->size;
     if (sizea < sizeb) {
@@ -1313,7 +1420,6 @@ void iterate_multipliers(netlist_t *netlist)
             sizea = sizeb;
             sizeb = swap;
         }
-
         /* Do I need to split the multiplier on both inputs? */
         if ((mula > sizea) && (mulb > sizeb)) {
             a0 = sizea;
@@ -1890,4 +1996,4 @@ void free_multipliers()
 
         hard_multipliers->instances = NULL;
     }
-}
+}
diff --git a/parmys/parmys-plugin/netlist/netlist_utils.cc b/parmys/parmys-plugin/netlist/netlist_utils.cc
@@ -485,6 +485,7 @@ void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
  *-----------------------------------------------------------------------*/
 void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
 {
+    oassert(pin != NULL);
     if (pin->type == INPUT) {
         /* clean out the entry in the old net */
         pin->node->input_pins[pin->pin_node_idx] = NULL;

diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/vtr_xilinx_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/vtr_xilinx_qor/config/config.txt
@@ -12,6 +12,7 @@ circuits_dir=benchmarks/verilog
 arch_list_add=7series_BRAM_DSP_carry.xml
 
 # Add circuits to list to sweep
+circuit_list_add=mcml.v
 circuit_list_add=LU32PEEng.v
 circuit_list_add=LU8PEEng.v
 circuit_list_add=bgm.v