--======================================================-- -- -- -- NORTHEASTERN UNIVERSITY -- -- DEPARTMENT OF ELECTRICAL AND COMPUTER ENGINEERING -- -- RAPID PROTOTYPING LABORATORY -- -- -- -- FILE | multimac.vhd -- -- -------------+------------------------------------ -- -- DESCRIPTION | Parameterized accumulator -- -- -------------+------------------------------------ -- -- AUTHOR | Al Conti -- -- -------------+------------------------------------ -- -- DATE | 20 Jan 2006 -- --======================================================-- --******************************************************************************-- -- -- -- Copyright (C) 2000 Albert Conti -- -- -- -- This program is free software; you can redistribute it and/or -- -- modify it under the terms of the GNU General Public License -- -- as published by the Free Software Foundation; either version 2 -- -- of the License, or (at your option) any later version. -- -- -- -- This program is distributed in the hope that it will be useful, -- -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- -- GNU General Public License for more details. -- -- -- -- You should have received a copy of the GNU General Public License -- -- along with this program; if not, write to the Free Software -- -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-- -- -- --******************************************************************************

library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
use ieee.std_logic_unsigned.all;

-- Component declaration for multimac so that client designs can instantiate
-- it with a single "use work.multimac_pkg.all;".
package multimac_pkg is

  component multimac
    generic (
      exponent         : natural := 8;
      mantissa         : natural := 23;
      mult_in_parallel : natural := 4
    );
    port (
      clk           : in  std_logic;
      ready         : in  std_logic;
      exception_in  : in  std_logic;
      op1           : in  std_logic_vector(mult_in_parallel*(exponent+mantissa+1)-1 downto 0);
      op2           : in  std_logic_vector(mult_in_parallel*(exponent+mantissa+1)-1 downto 0);
      result        : out std_logic_vector(exponent+mantissa downto 0);
      done          : out std_logic;
      exception_out : out std_logic
    );
  end component;

end multimac_pkg;

package body multimac_pkg is
end;

library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
use ieee.std_logic_unsigned.all;

library fp_lib;
use fp_lib.float_pkg.all;

-- multimac: multiplies mult_in_parallel operand pairs, sums the products
-- through a chain of adders, and adds that sum to a running accumulator.
--
-- Generics
--   exponent         : exponent field width of every operand word
--   mantissa         : mantissa field width of every operand word
--   mult_in_parallel : number of operand pairs packed into op1 / op2
-- Ports
--   op1, op2         : mult_in_parallel words of (exponent+mantissa+1) bits
--                      each, word 0 in the least significant bits
--   ready            : qualifies op1/op2; exception_in travels with it
--   result           : accumulator register
--   done             : bit 1 of the accumulator control register
--   exception_out    : bit 0 of the accumulator control register
entity multimac is
  generic (
    exponent         : natural := 8;
    mantissa         : natural := 23;
    mult_in_parallel : natural := 4
  );
  port (
    clk           : in  std_logic;
    ready         : in  std_logic;
    exception_in  : in  std_logic;
    op1           : in  std_logic_vector(mult_in_parallel*(exponent+mantissa+1)-1 downto 0);
    op2           : in  std_logic_vector(mult_in_parallel*(exponent+mantissa+1)-1 downto 0);
    result        : out std_logic_vector(exponent+mantissa downto 0);
    done          : out std_logic;
    exception_out : out std_logic
  );
end multimac;

architecture behavioral of multimac is

  component fp_mul is
    generic (
      exp_bits : integer := 8;
      man_bits : integer := 23
    );
    port (
      OP1           : in  std_logic_vector(exp_bits+man_bits downto 0);
      OP2           : in  std_logic_vector(exp_bits+man_bits downto 0);
      READY         : in  std_logic;
      EXCEPTION_IN  : in  std_logic;
      CLK           : in  std_logic;
      RESULT        : out std_logic_vector(exp_bits+(2*man_bits) downto 0);
      EXCEPTION_OUT : out std_logic;
      DONE          : out std_logic
    );
  end component;

  component rnd_norm_wrapper is
    generic (
      exp_bits     : integer := 8;
      man_bits_in  : integer := 23;
      man_bits_out : integer := 23
    );
    port (
      IN1           : in  std_logic_vector(exp_bits+man_bits_in downto 0);
      READY         : in  std_logic;
      CLK           : in  std_logic;
      ROUND         : in  std_logic;
      EXCEPTION_IN  : in  std_logic;
      OUT1          : out std_logic_vector(exp_bits+man_bits_out downto 0);
      DONE          : out std_logic;
      EXCEPTION_OUT : out std_logic
    );
  end component;

  component fp_add is
    generic (
      exp_bits : integer := 8;
      man_bits : integer := 23
    );
    port (
      OP1           : in  std_logic_vector(exp_bits+man_bits downto 0);
      OP2           : in  std_logic_vector(exp_bits+man_bits downto 0);
      READY         : in  std_logic;
      EXCEPTION_IN  : in  std_logic;
      CLK           : in  std_logic;
      RESULT        : out std_logic_vector(exp_bits+man_bits+1 downto 0);
      EXCEPTION_OUT : out std_logic;
      DONE          : out std_logic
    );
  end component;

  -- Only fp_add_latency is referenced below: it sets the depth of the delay
  -- lines that run beside each chain adder.
  -- NOTE(review): assumes fp_add really takes fp_add_latency clocks - confirm
  -- against fp_lib.
  constant fp_add_latency      : natural := 4;
  constant fp_mul_latency      : natural := 3;  -- not referenced in this architecture
  constant fp_rnd_norm_latency : natural := 2;  -- not referenced in this architecture

  -- Shorthand for index expressions that recur throughout the architecture.
  constant word_msb  : integer := exponent+mantissa;    -- MSB index of one operand word
  constant word_bits : integer := word_msb+1;           -- width of one operand word
  constant stride    : integer := fp_add_latency+1;     -- table rows consumed per chain adder
  constant last_lane : integer := mult_in_parallel-1;   -- highest lane index
  constant last_row  : integer := last_lane*stride;     -- data row holding the chain result

  subtype flag_pair is std_logic_vector(1 downto 0);    -- bit 1 = done/ready, bit 0 = exception

  type input_fp_array is array(0 to last_lane) of std_logic_vector(word_msb downto 0);
  type mult_fp_array  is array(0 to last_lane) of std_logic_vector(exponent+2*mantissa downto 0);
  type add_fp_array   is array(0 to last_lane) of std_logic_vector(word_msb+last_lane downto 0);
  type add_fp_table   is array(0 to last_row) of add_fp_array;
  type control_array  is array(0 to last_lane) of flag_pair;
  type control_table  is array(0 to last_row+1) of control_array;

  -- The tables are rectangular although only a staircase-shaped subset of
  -- their entries is ever driven. This keeps the indexing generic; the
  -- original author relies on synthesis to discard the unassigned entries.

  -- registered copies of the control inputs
  signal ready_d        : std_logic;
  signal exception_in_d : std_logic;

  -- registered operand words, one per lane
  signal sample : input_fp_array;
  signal weight : input_fp_array;

  -- per-lane data and control as they move through the multiplier / adder chain
  signal product        : mult_fp_array;
  signal intermediate   : add_fp_table;
  signal control        : control_table;
  signal control_joined : control_table;

  -- rounded chain result and its flags
  signal mac         : std_logic_vector(word_msb downto 0);
  signal mac_control : flag_pair;

  -- accumulator loop
  signal accum                : std_logic_vector(word_msb downto 0);
  signal accum_control        : flag_pair;
  signal accum_control_joined : flag_pair;
  signal sum                  : std_logic_vector(word_msb+1 downto 0);
  signal sum_control          : flag_pair;
  signal sum_norm             : std_logic_vector(word_msb downto 0);
  signal sum_norm_control     : flag_pair;

begin

  -- Register the control inputs alongside the operand words.
  input_flags : process(clk)
  begin
    if rising_edge(clk) then
      ready_d        <= ready;
      exception_in_d <= exception_in;
    end if;
  end process input_flags;

  -- One lane per operand pair: input register, multiplier, round/normalize.
  lanes : for n in 0 to last_lane generate
    constant lo : integer := n*word_bits;       -- LSB of word n inside op1/op2
    constant hi : integer := lo+word_bits-1;    -- MSB of word n inside op1/op2
  begin

    capture : process(clk)
    begin
      if rising_edge(clk) then
        sample(n) <= op1(hi downto lo);
        weight(n) <= op2(hi downto lo);
      end if;
    end process capture;

    multiplier : fp_mul
      generic map (
        exp_bits => exponent,
        man_bits => mantissa
      )
      port map (
        OP1           => sample(n),
        OP2           => weight(n),
        READY         => ready_d,
        EXCEPTION_IN  => exception_in_d,
        CLK           => clk,
        RESULT        => product(n),
        EXCEPTION_OUT => control(0)(n)(0),
        DONE          => control(0)(n)(1)
      );

    -- Lanes 0 and 1 feed the first adder, whose operands carry "mantissa"
    -- mantissa bits.
    first_pair : if n < 2 generate
      rounder : rnd_norm_wrapper
        generic map (
          exp_bits     => exponent,
          man_bits_in  => 2*mantissa,
          man_bits_out => mantissa
        )
        port map (
          IN1           => product(n),
          READY         => control(0)(n)(1),
          CLK           => clk,
          ROUND         => '1',
          EXCEPTION_IN  => control(0)(n)(0),
          OUT1          => intermediate(0)(n)(word_msb downto 0),
          DONE          => control(1)(n)(1),
          EXCEPTION_OUT => control(1)(n)(0)
        );
    end generate first_pair;

    -- Lane n >= 2 enters the chain at adder n-1, whose operands carry
    -- mantissa+n-1 mantissa bits, so it is rounded to that width.
    later_lane : if n >= 2 generate
      rounder : rnd_norm_wrapper
        generic map (
          exp_bits     => exponent,
          man_bits_in  => 2*mantissa,
          man_bits_out => mantissa+n-1
        )
        port map (
          IN1           => product(n),
          READY         => control(0)(n)(1),
          CLK           => clk,
          ROUND         => '1',
          EXCEPTION_IN  => control(0)(n)(0),
          OUT1          => intermediate(0)(n)(word_msb+n-1 downto 0),
          DONE          => control(1)(n)(1),
          EXCEPTION_OUT => control(1)(n)(0)
        );
    end generate later_lane;

  end generate lanes;

  -- Adder chain: adder s adds the running sum held in lane s to the product
  -- in lane s+1 and leaves the new running sum in lane s+1, "stride" data
  -- rows further down the table.
  chain : if mult_in_parallel > 1 generate
    stage : for s in 0 to mult_in_parallel-2 generate
      constant in_row  : integer := s*stride;       -- data row read by adder s
      constant in_ctl  : integer := in_row+1;       -- control row that matches in_row
      constant out_row : integer := (s+1)*stride;   -- data row written by adder s
      constant out_ctl : integer := out_row+1;      -- control row that matches out_row
    begin

      -- The adder starts when both operands are done; an exception on either
      -- operand is forwarded.
      control_joined(in_ctl)(s+1)(0) <= control(in_ctl)(s)(0) or  control(in_ctl)(s+1)(0);
      control_joined(in_ctl)(s+1)(1) <= control(in_ctl)(s)(1) and control(in_ctl)(s+1)(1);

      adder : fp_add
        generic map (
          exp_bits => exponent,
          man_bits => mantissa+s
        )
        port map (
          OP1           => intermediate(in_row)(s)(word_msb+s downto 0),
          OP2           => intermediate(in_row)(s+1)(word_msb+s downto 0),
          READY         => control_joined(in_ctl)(s+1)(1),
          EXCEPTION_IN  => control_joined(in_ctl)(s+1)(0),
          CLK           => clk,
          RESULT        => intermediate(out_row)(s+1)(word_msb+s+1 downto 0),
          EXCEPTION_OUT => control(out_ctl)(s+1)(0),
          DONE          => control(out_ctl)(s+1)(1)
        );

      -- Lanes not yet consumed by the chain are delayed by fp_add_latency
      -- registers so they stay aligned with the running sum. The range is
      -- empty for the last adder, so no separate guard is required.
      waiting : for w in s+2 to last_lane generate

        tap : for d in 0 to fp_add_latency-1 generate
          shift : process(clk)
          begin
            if rising_edge(clk) then
              intermediate(in_row+1+d)(w) <= intermediate(in_row+d)(w);
              control(in_ctl+1+d)(w)      <= control(in_ctl+d)(w);
            end if;
          end process shift;
        end generate tap;

        -- Unregistered copy into the row read by the next adder.
        intermediate(out_row)(w) <= intermediate(out_row-1)(w);
        control(out_ctl)(w)      <= control(out_ctl-1)(w);

      end generate waiting;

    end generate stage;
  end generate chain;

  -- Round the chain result back to the operand mantissa width.
  rnd_norm_0 : rnd_norm_wrapper
    generic map (
      exp_bits     => exponent,
      man_bits_in  => mantissa+mult_in_parallel-1,
      man_bits_out => mantissa
    )
    port map (
      IN1           => intermediate(last_row)(last_lane),
      READY         => control(last_row+1)(last_lane)(1),
      CLK           => clk,
      ROUND         => '1',
      EXCEPTION_IN  => control(last_row+1)(last_lane)(0),
      OUT1          => mac,
      DONE          => mac_control(1),
      EXCEPTION_OUT => mac_control(0)
    );

  -- Accumulator loop: mac + accum -> sum -> sum_norm -> accum.
  -- The add is started by the mac alone; the exception flag is the OR of the
  -- accumulator's and the mac's.
  accum_control_joined(0) <= accum_control(0) or mac_control(0);
  accum_control_joined(1) <= mac_control(1);

  fp_add_0 : fp_add
    generic map (
      exp_bits => exponent,
      man_bits => mantissa
    )
    port map (
      OP1           => mac,
      OP2           => accum,
      READY         => accum_control_joined(1),
      EXCEPTION_IN  => accum_control_joined(0),
      CLK           => clk,
      RESULT        => sum,
      EXCEPTION_OUT => sum_control(0),
      DONE          => sum_control(1)
    );

  rnd_norm_1 : rnd_norm_wrapper
    generic map (
      exp_bits     => exponent,
      man_bits_in  => mantissa+1,
      man_bits_out => mantissa
    )
    port map (
      IN1           => sum,
      READY         => sum_control(1),
      CLK           => clk,
      ROUND         => '1',
      EXCEPTION_IN  => sum_control(0),
      OUT1          => sum_norm,
      DONE          => sum_norm_control(1),
      EXCEPTION_OUT => sum_norm_control(0)
    );

  -- Accumulator register: cleared on every clock unless a freshly rounded
  -- sum is flagged done, in which case that sum and its flags are loaded.
  accumulate : process(clk)
  begin
    if rising_edge(clk) then
      accum         <= (others => '0');
      accum_control <= (others => '0');
      if sum_norm_control(1) = '1' then
        accum         <= sum_norm;
        accum_control <= sum_norm_control;
      end if;
    end if;
  end process accumulate;

  result        <= accum;
  done          <= accum_control(1);
  exception_out <= accum_control(0);

end behavioral;

-- This wrapper is necessary to correct what I consider to be a bug in rnd_norm.
-- rnd_norm requires that the input mantissa bit width be at least two greater
-- than the output mantissa bit width. I feel as though this module should work
-- for any combination of generic bit widths.
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_arith.all;
use ieee.std_logic_unsigned.all;

library fp_lib;
use fp_lib.float_pkg.all;

-- rnd_norm_wrapper: drop-in front end for rnd_norm that accepts any
-- combination of man_bits_in / man_bits_out.
--
-- Per the original author's note, rnd_norm needs its input mantissa to be at
-- least two bits wider than its output mantissa. Whenever the requested
-- widths do not satisfy that, this wrapper appends zero bits below the LSB of
-- IN1 (which leaves the encoded value unchanged) until the input mantissa is
-- exactly man_bits_out+2 bits wide, and instantiates rnd_norm with the widened
-- input width. When the requirement is already met, IN1 is passed through
-- unchanged.
--
-- Generics
--   exp_bits     : exponent field width
--   man_bits_in  : mantissa width of IN1
--   man_bits_out : mantissa width of OUT1
-- Ports
--   All ports are forwarded to rnd_norm unchanged, except IN1, which is
--   zero-padded as described above.
entity rnd_norm_wrapper is
  generic (
    exp_bits     : integer := 8;
    man_bits_in  : integer := 23;
    man_bits_out : integer := 23
  );
  port (
    IN1           : in  std_logic_vector(exp_bits+man_bits_in downto 0);
    READY         : in  std_logic;
    CLK           : in  std_logic;
    ROUND         : in  std_logic;
    EXCEPTION_IN  : in  std_logic;
    OUT1          : out std_logic_vector(exp_bits+man_bits_out downto 0);
    DONE          : out std_logic;
    EXCEPTION_OUT : out std_logic
  );
end rnd_norm_wrapper;

architecture rtl of rnd_norm_wrapper is

  component rnd_norm is
    generic (
      exp_bits     : integer := 8;
      man_bits_in  : integer := 23;
      man_bits_out : integer := 23
    );
    port (
      IN1           : in  std_logic_vector(exp_bits+man_bits_in downto 0);
      READY         : in  std_logic;
      CLK           : in  std_logic;
      ROUND         : in  std_logic;
      EXCEPTION_IN  : in  std_logic;
      OUT1          : out std_logic_vector(exp_bits+man_bits_out downto 0);
      DONE          : out std_logic;
      EXCEPTION_OUT : out std_logic
    );
  end component;

  -- Number of zero bits to append so that the padded input mantissa is at
  -- least bits_out+2 wide; 0 when the input is already wide enough.
  function pad_width(bits_in, bits_out : integer) return natural is
  begin
    if bits_out + 2 > bits_in then
      return bits_out + 2 - bits_in;
    end if;
    return 0;
  end function pad_width;

  -- Gives 2 when man_bits_out = man_bits_in, 1 when man_bits_out =
  -- man_bits_in-1 and 0 when man_bits_out <= man_bits_in-2 (the three cases
  -- the wrapper handled before), and additionally the right amount when
  -- man_bits_out > man_bits_in.
  constant pad_bits : natural := pad_width(man_bits_in, man_bits_out);

  signal input_pad : std_logic_vector(exp_bits+man_bits_in+pad_bits downto 0);

begin

  -- Input already satisfies rnd_norm's width requirement: pass it through.
  no_pad : if pad_bits = 0 generate
    input_pad <= IN1;
  end generate no_pad;

  -- Otherwise widen the mantissa with zeros below its LSB.
  zero_pad : if pad_bits > 0 generate
    constant zeros : std_logic_vector(pad_bits-1 downto 0) := (others => '0');
  begin
    input_pad <= IN1 & zeros;
  end generate zero_pad;

  wrapper : rnd_norm
    generic map (
      exp_bits     => exp_bits,
      man_bits_in  => man_bits_in+pad_bits,
      man_bits_out => man_bits_out
    )
    port map (
      IN1           => input_pad,
      READY         => READY,
      CLK           => CLK,
      ROUND         => ROUND,
      EXCEPTION_IN  => EXCEPTION_IN,
      OUT1          => OUT1,
      DONE          => DONE,
      EXCEPTION_OUT => EXCEPTION_OUT
    );

end rtl;