Skip to content

Generalize Parmys Mult_Split to Allow for Multipliers Whose Input Widths are not Equal #3143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 156 additions & 50 deletions parmys/parmys-plugin/core/multiplier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
*-----------------------------------------------------------------------*/
void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
{
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addbig;
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addsmall2, *addbig;
int size;

/* Check for a legitimate split */
Expand Down Expand Up @@ -976,50 +976,153 @@ void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *
init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
mult_list = insert_in_vptr_list(mult_list, a1b0);

/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// this addition will have a carry out in the worst case, add to input pins and connect then to gnd
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}
// using the balanced addition method only works if a0 and b0 are the same size
// (i.e. if the input ports on the hardware multiplier are equal)
if (b0 == a0) {
/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// this addition will have a carry out in the worst case, so add one extra input pin per port and connect them to gnd
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);

add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
}
} else {
/* Expounding upon the description for the method in this function.
If we have two numbers A and B and a hardware multiplier of size a0 x b0,
we can split each number into two parts:
A = (A1 << a0) + A0
B = (B1 << b0) + B0
where A1 and B1 are the high bits of A and B, and A0 and B0 are the low bits.
Note that len(A0) = a0 and len(B0) = b0 by definition.
The multiplication of A and B can be expressed as:
A * B = ((A1 << a0) + A0) * ((B1 << b0) + B0)
= {(A1 * B1) << (a0 + b0)} + {((A1 * B0) << a0) + ((A0 * B1) << b0)} + {A0 * B0}
We split the additions up like so:
addsmall = ((A1 * B0) << a0) + ((A0 * B1) << b0) // can have carry
addsmall2 = ((A1 * B1) << (a0 + b0)) + (A0 * B0) // will not have carry
addbig = addsmall + addsmall2
This is a slightly modified version of the Karatsuba algorithm.
*/
/////////////// Addsmall /////////////////////
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + a0 + 1, a0b1->num_output_pins + b0 + 1);

// The first a0 pins of addsmall input connecting to a1b0 are connected to zero
for (int i = 0; i < a0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++) {
connect_nodes(a1b0, i, addsmall, i + a0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins + a0);

// The first b0 pins of addsmall input connecting to a0b1 are connected to zero
for (int i = 0; i < b0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i + addsmall->input_port_sizes[0]);
}

// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++) {
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0] + b0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0] + b0);

/////////////// Addsmall2 /////////////////////
addsmall2 = allocate_nnode(node->loc);
addsmall2->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall2->name, node->name);
strcat(addsmall2->name, "-add1");
init_multiplier_adder(addsmall2, a1b1, a1b1->num_output_pins + a0 + b0, a0b0->num_output_pins);

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
// connect first a0+ b0 pins of addsmall2 to zero
for (int i = 0; i < a0 + b0; i++) {
add_input_pin_to_node(addsmall2, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall2
for (int i = 0; i < a1b1->num_output_pins; i++) {
connect_nodes(a1b1, i, addsmall2, i + a0 + b0);
}

// connect inputs to port b of addsmall2
for (int i = 0; i < a0b0->output_port_sizes[0]; i++) {
connect_nodes(a0b0, i, addsmall2, i + addsmall2->input_port_sizes[0]);
}

/////////////// Addbig /////////////////////
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add2");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, addsmall2->num_output_pins);
// Here the final addition can have a carry out in the worst case; however,
// our final product will always only be the length of the longest input port, so regardless of the carry the
// final add's carry will always drop out.

// connect inputs to port a of addbig
for (int i = 0; i < addsmall->num_output_pins; i++) {
connect_nodes(addsmall, i, addbig, i);
}
// add_input_pin_to_node(addbig, get_zero_pin(netlist), addsmall->num_output_pins);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented out code. Was this intended to be commented out?


// connect inputs to port b of addbig
for (int i = 0; i < addsmall2->num_output_pins; i++) {
connect_nodes(addsmall2, i, addbig, i + addbig->input_port_sizes[0]);
}
// add_input_pin_to_node(addbig, get_zero_pin(netlist), addbig->input_port_sizes[0] + addsmall->num_output_pins);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment. Just want to make sure that some zeroed inputs were not being forgotten.


// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i], addbig, i);
}
}

// CLEAN UP
Expand Down Expand Up @@ -1060,7 +1163,6 @@ void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
strcat(a0b->name, "-0");
init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
mult_list = insert_in_vptr_list(mult_list, a0b);

/* New node for a1b multiply */
a1b = allocate_nnode(node->loc);
a1b->name = (char *)vtr::malloc(strlen(node->name) + 3);
Expand Down Expand Up @@ -1184,7 +1286,6 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)

oassert(node->type == MULTIPLY);
oassert(hard_multipliers != NULL);

sizea = node->input_port_sizes[0];
sizeb = node->input_port_sizes[1];
sizeout = node->output_port_sizes[0];
Expand All @@ -1199,6 +1300,13 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
}
diffa = ina - sizea;
diffb = inb - sizeb;
// the input multiplier size is in the middle range of an unequal hard block size (e.g. mul_size > 18 && mul_size < 25)
if (diffb < 0) {
std::swap(ina, inb);
diffa = ina - sizea;
diffb = inb - sizeb;
}

diffout = hard_multipliers->outputs->size - sizeout;

if (configuration.split_hard_multiplier == 1) {
Expand Down Expand Up @@ -1281,11 +1389,10 @@ void iterate_multipliers(netlist_t *netlist)
int mula, mulb;
int a0, a1, b0, b1;
nnode_t *node;

/* Can only perform the optimisation if hard multipliers exist! */
if (hard_multipliers == NULL)
return;

// std::cin.get();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented out code.

sizea = hard_multipliers->inputs->size;
sizeb = hard_multipliers->inputs->next->size;
if (sizea < sizeb) {
Expand Down Expand Up @@ -1313,7 +1420,6 @@ void iterate_multipliers(netlist_t *netlist)
sizea = sizeb;
sizeb = swap;
}

/* Do I need to split the multiplier on both inputs? */
if ((mula > sizea) && (mulb > sizeb)) {
a0 = sizea;
Expand Down Expand Up @@ -1890,4 +1996,4 @@ void free_multipliers()

hard_multipliers->instances = NULL;
}
}
}
1 change: 1 addition & 0 deletions parmys/parmys-plugin/netlist/netlist_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
*-----------------------------------------------------------------------*/
void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
{
oassert(pin != NULL);
if (pin->type == INPUT) {
/* clean out the entry in the old net */
pin->node->input_pins[pin->pin_node_idx] = NULL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ circuits_dir=benchmarks/verilog
arch_list_add=7series_BRAM_DSP_carry.xml

# Add circuits to list to sweep
circuit_list_add=mcml.v
circuit_list_add=LU32PEEng.v
circuit_list_add=LU8PEEng.v
circuit_list_add=bgm.v
Expand Down
Loading
Loading