// Implementation of a MAC (multiply-accumulate) unit on an FPGA
`timescale 1ns/100ps
//VERILOG CODE FOR MAC UNIT:
// Multiply-accumulate unit: on every rising clock edge the accumulator z is
// replaced by (z + a*b), kept to 32 bits (the adder carry-out is not used,
// so the accumulation wraps modulo 2^32).
//   clk : clock; the accumulator register loads on its rising edge
//   rst : asynchronous, active-LOW reset (z is cleared while rst == 0)
//   a,b : 16-bit unsigned operands
//   z   : 32-bit accumulated result
module MAC_UNIT(clk,rst,a,b,z);
input clk,rst;
input [15:0] a,b;
output [31:0] z;
wire [31:0] w,x;   // w = a*b, x = w + z (next accumulator value)
wire ci,co;
// The accumulate adder needs no carry-in. Previously ci was left undriven,
// so it floated (Z) and was seen as X inside the adder, corrupting bit 0 of
// the sum and, through the feedback path, the whole accumulator.
assign ci = 1'b0;
vedic_16x16 U1(.a(a),.b(b),.result(w));
brent_kung_adder U2(.A(w),.B(z),.Ci(ci),.S(x),.Co(co));
pipo U3(.din(x), .clk(clk),.rst(rst),.dout(z));
endmodule
//VERILOG CODE FOR 16-BIT VEDIC MULTIPLIER:
// 16x16 unsigned multiplier assembled from four 8x8 multipliers.
// With a = {aH,aL} and b = {bH,bL} (8-bit halves):
//   a*b = aL*bL + ((aH*bL + aL*bH) << 8) + (aH*bH << 16)
module vedic_16x16(
    input  [15:0] a,
    input  [15:0] b,
    output wire [31:0] result
);
    wire [15:0] pp_ll;    // a[7:0]  * b[7:0]
    wire [15:0] pp_hl;    // a[15:8] * b[7:0]
    wire [15:0] pp_lh;    // a[7:0]  * b[15:8]
    wire [15:0] pp_hh;    // a[15:8] * b[15:8]
    wire [15:0] mid_lo;   // pp_hl + upper byte of pp_ll
    wire [23:0] mid_hi;   // pp_lh + (pp_hh << 8)
    wire [23:0] upper;    // bits [31:8] of the product

    vedic_8x8 V9 (.a(a[7:0]),  .b(b[7:0]),  .result(pp_ll));
    vedic_8x8 V10(.a(a[15:8]), .b(b[7:0]),  .result(pp_hl));
    vedic_8x8 V11(.a(a[7:0]),  .b(b[15:8]), .result(pp_lh));
    vedic_8x8 V12(.a(a[15:8]), .b(b[15:8]), .result(pp_hh));

    // Fits in 16 bits: at most 254 + 65025.
    adder16 A3(.a({8'b00000000, pp_ll[15:8]}), .b(pp_hl), .sum(mid_lo));
    // Fits in 24 bits: at most 65025 + 65025*256.
    adder24 A4(.a({8'b00000000, pp_lh}), .b({pp_hh, 8'b00000000}), .sum(mid_hi));
    adder24 A5(.a({8'b00000000, mid_lo}), .b(mid_hi), .sum(upper));

    // The low byte of the product comes straight from the low partial product.
    assign result = {upper, pp_ll[7:0]};
endmodule
// 8x8 unsigned multiplier assembled from four 4x4 multipliers.
// With a = {aH,aL} and b = {bH,bL} (4-bit halves):
//   a*b = aL*bL + ((aH*bL + aL*bH) << 4) + (aH*bH << 8)
module vedic_8x8(
    input  [7:0] a,
    input  [7:0] b,
    output wire [15:0] result
);
    wire [7:0]  pp_ll;    // a[3:0] * b[3:0]
    wire [7:0]  pp_hl;    // a[7:4] * b[3:0]
    wire [7:0]  pp_lh;    // a[3:0] * b[7:4]
    wire [7:0]  pp_hh;    // a[7:4] * b[7:4]
    wire [7:0]  mid_lo;   // pp_hl + upper nibble of pp_ll
    wire [11:0] mid_hi;   // pp_lh + (pp_hh << 4)
    wire [11:0] upper;    // bits [15:4] of the product

    vedic_4x4 V5(.a(a[3:0]), .b(b[3:0]), .result(pp_ll));
    vedic_4x4 V6(.a(a[7:4]), .b(b[3:0]), .result(pp_hl));
    vedic_4x4 V7(.a(a[3:0]), .b(b[7:4]), .result(pp_lh));
    vedic_4x4 V8(.a(a[7:4]), .b(b[7:4]), .result(pp_hh));

    // Fits in 8 bits: at most 14 + 225.
    adder8  A3(.a({4'b0000, pp_ll[7:4]}), .b(pp_hl), .sum(mid_lo));
    // Fits in 12 bits: at most 225 + 225*16.
    adder12 A4(.a({4'b0000, pp_lh}), .b({pp_hh, 4'b0000}), .sum(mid_hi));
    adder12 A5(.a({4'b0000, mid_lo}), .b(mid_hi), .sum(upper));

    // The low nibble of the product comes straight from the low partial product.
    assign result = {upper, pp_ll[3:0]};
endmodule
// 4x4 unsigned multiplier assembled from four 2x2 multipliers.
// With a = {aH,aL} and b = {bH,bL} (2-bit halves):
//   a*b = aL*bL + ((aH*bL + aL*bH) << 2) + (aH*bH << 4)
module vedic_4x4(
    input  [3:0] a,
    input  [3:0] b,
    output wire [7:0] result
);
    wire [3:0] pp_ll;    // a[1:0] * b[1:0]
    wire [3:0] pp_hl;    // a[3:2] * b[1:0]
    wire [3:0] pp_lh;    // a[1:0] * b[3:2]
    wire [3:0] pp_hh;    // a[3:2] * b[3:2]
    wire [3:0] mid_lo;   // pp_hl + upper two bits of pp_ll
    wire [5:0] mid_hi;   // pp_lh + (pp_hh << 2)
    wire [5:0] upper;    // bits [7:2] of the product

    vedic_2x2 V1(.a(a[1:0]), .b(b[1:0]), .result(pp_ll));
    vedic_2x2 V2(.a(a[3:2]), .b(b[1:0]), .result(pp_hl));
    vedic_2x2 V3(.a(a[1:0]), .b(b[3:2]), .result(pp_lh));
    vedic_2x2 V4(.a(a[3:2]), .b(b[3:2]), .result(pp_hh));

    // Fits in 4 bits: at most 9 + 2.
    adder4 A0(.a(pp_hl), .b({2'b00, pp_ll[3:2]}), .sum(mid_lo));
    // Fits in 6 bits: at most 9 + 9*4.
    adder6 A1(.a({2'b00, pp_lh}), .b({pp_hh, 2'b00}), .sum(mid_hi));
    adder6 A2(.a({2'b00, mid_lo}), .b(mid_hi), .sum(upper));

    // The two low product bits come straight from the low partial product.
    assign result = {upper, pp_ll[1:0]};
endmodule
// 2x2 unsigned multiplier: result = a * b (4 bits), built from four AND
// terms and two half adders.
module vedic_2x2 (
    input  [1:0] a,
    input  [1:0] b,
    output [3:0] result
);
    wire a1b0, a0b1, a1b1;   // cross and high partial-product bits
    wire mid_carry;          // carry out of the bit-1 column

    assign a1b0 = a[1] & b[0];
    assign a0b1 = a[0] & b[1];
    assign a1b1 = a[1] & b[1];

    // Bit 0 is a single AND term.
    assign result[0] = a[0] & b[0];

    // Bit 1 is the sum of the two cross terms.
    halfAdder H0(.a(a1b0), .b(a0b1), .sum(result[1]), .carry(mid_carry));
    // Bits 2 and 3 are the high term plus the carry from the bit-1 column.
    halfAdder H1(.a(a1b1), .b(mid_carry), .sum(result[2]), .carry(result[3]));
endmodule
// Half adder: sum is the XOR of the two input bits, carry is their AND.
module halfAdder(
    input  a,
    input  b,
    output sum,
    output carry
);
    xor (sum,   a, b);
    and (carry, a, b);
endmodule
// 4-bit adder: sum = a + b, carry-out discarded (wraps modulo 16).
module adder4(
    input  [3:0] a, b,
    output wire [3:0] sum
);
    assign sum = a + b;
endmodule
// 6-bit adder: sum = a + b, carry-out discarded (wraps modulo 64).
module adder6(
    input  [5:0] a, b,
    output wire [5:0] sum
);
    assign sum = a + b;
endmodule
// 8-bit adder: sum = a + b, carry-out discarded (wraps modulo 256).
module adder8(
    input  [7:0] a, b,
    output wire [7:0] sum
);
    assign sum = a + b;
endmodule
// 12-bit adder: sum = a + b, carry-out discarded (wraps modulo 4096).
module adder12(
    input  [11:0] a, b,
    output wire [11:0] sum
);
    assign sum = a + b;
endmodule
// 16-bit adder: sum = a + b, carry-out discarded (wraps modulo 65536).
module adder16(
    input  [15:0] a, b,
    output wire [15:0] sum
);
    assign sum = a + b;
endmodule
// 24-bit adder: sum = a + b, carry-out discarded (wraps modulo 2^24).
module adder24(
    input  [23:0] a, b,
    output wire [23:0] sum
);
    assign sum = a + b;
endmodule
//VERILOG CODE FOR 32-BIT BRENT KUNG ADDER:
// 32-bit parallel-prefix adder: {Co, S} = A + B + Ci.
//   A, B : 32-bit addends
//   Ci   : carry in
//   S    : 32-bit sum
//   Co   : carry out of bit 31
module brent_kung_adder(
    input  [31:0] A, B,
    input         Ci,
    output [31:0] S,
    output        Co
);
    // Level-1 signals are per bit; level-n signals (n = 2..6) describe
    // aligned groups of 2, 4, 8, 16 and 32 bits respectively.
    wire [31:0] P1, G1;
    wire [15:0] P2, G2;
    wire [7:0]  P3, G3;
    wire [3:0]  P4, G4;
    wire [1:0]  P5, G5;
    wire        P6, G6;
    // C[n] is the carry into bit n; C[32] is the carry out.
    wire [32:1] C;

    // ---- Per-bit propagate / generate ----
    assign G1 = A & B;
    assign P1 = A ^ B;

    // ---- Up-sweep: merge neighbouring groups pairwise at every level ----
    genvar k;
    generate
        for (k = 0; k < 16; k = k + 1) begin: second_stage
            assign P2[k] = P1[2*k] & P1[2*k+1];
            assign G2[k] = (G1[2*k] & P1[2*k+1]) | G1[2*k+1];
        end
        for (k = 0; k < 8; k = k + 1) begin: third_stage
            assign P3[k] = P2[2*k] & P2[2*k+1];
            assign G3[k] = (G2[2*k] & P2[2*k+1]) | G2[2*k+1];
        end
        for (k = 0; k < 4; k = k + 1) begin: fourth_stage
            assign P4[k] = P3[2*k] & P3[2*k+1];
            assign G4[k] = (G3[2*k] & P3[2*k+1]) | G3[2*k+1];
        end
        for (k = 0; k < 2; k = k + 1) begin: fifth_stage
            assign P5[k] = P4[2*k] & P4[2*k+1];
            assign G5[k] = (G4[2*k] & P4[2*k+1]) | G4[2*k+1];
        end
    endgenerate

    assign P6 = P5[0] & P5[1];
    assign G6 = (G5[0] & P5[1]) | G5[1];

    // ---- Carries that depend only on Ci and a group starting at bit 0 ----
    assign C[1]  = G1[0] | (P1[0] & Ci);
    assign C[2]  = G2[0] | (P2[0] & Ci);
    assign C[4]  = G3[0] | (P3[0] & Ci);
    assign C[8]  = G4[0] | (P4[0] & Ci);
    assign C[16] = G5[0] | (P5[0] & Ci);
    assign C[32] = G6    | (P6    & Ci);

    // ---- Down-sweep: remaining carries, widest groups first ----
    // 8-bit group on top of C[16].
    assign C[24] = G4[2] | (P4[2] & C[16]);
    // 4-bit groups on top of a multiple-of-8 carry.
    assign C[12] = G3[2] | (P3[2] & C[8]);
    assign C[20] = G3[4] | (P3[4] & C[16]);
    assign C[28] = G3[6] | (P3[6] & C[24]);

    generate
        // 2-bit groups on top of a multiple-of-4 carry: C[6], C[10], ... C[30].
        for (k = 6; k <= 30; k = k + 4) begin: pair_carry
            assign C[k] = G2[(k-2)/2] | (P2[(k-2)/2] & C[k-2]);
        end
        // Single bits on top of the preceding even carry: C[3], C[5], ... C[31].
        for (k = 3; k <= 31; k = k + 2) begin: odd_carry
            assign C[k] = G1[k-1] | (P1[k-1] & C[k-1]);
        end
    endgenerate

    // ---- Sum and carry out ----
    assign S  = P1 ^ {C[31:1], Ci};
    assign Co = C[32];
endmodule
//VERILOG CODE FOR 32-BIT PARALLEL IN PARALLEL OUT SHIFT REGISTER:
// 32-bit parallel-in / parallel-out register.
//   din  : value loaded on the rising edge of clk
//   clk  : clock
//   rst  : asynchronous, active-LOW reset (dout is cleared when rst goes low)
//   dout : registered output
module pipo(
    input  wire [31:0] din,
    input  wire        clk,
    input  wire        rst,
    output reg  [31:0] dout
);
    always @(posedge clk or negedge rst)
        if (!rst)
            dout <= 32'b0;   // reset takes priority over the clocked load
        else
            dout <= din;
endmodule
// RESULT:
//       Thus, the MAC unit is implemented on an FPGA kit using Verilog code.