Merge pull request #422 from paulusmack/real-icache

Icache improvements - use synchronous RAMs and remove 4kB per set limit
icache_tb: Update for recent icache changes
13 changed files with 893 additions and 389 deletions
--- a/6
+++ b/6
@ -166,10 +166,6 @@ RAM_INIT_FILE ?=hello_world/hello_world.hex

 FPGA_TARGET ?= ORANGE-CRAB-0.21

-# FIXME: icache RAMs aren't being inferrenced as block RAMs on ECP5
-# with yosys, so make it smaller for now as a workaround.
-ICACHE_NUM_LINES=4
-
 clkgen=fpga/clk_gen_ecp5.vhd
 toplevel=fpga/top-generic.vhdl
 dmi_dtm=dmi_dtm_dummy.vhdl
@ -227,7 +223,7 @@ LITEDRAM_GHDL_ARG=-gUSE_LITEDRAM=true
 endif

 GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE) \
-	-gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) -gICACHE_NUM_LINES=$(ICACHE_NUM_LINES) \
+	-gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) \
 	$(LITEDRAM_GHDL_ARG)


--- a/common.vhdl
+++ b/common.vhdl
@ -194,6 +194,10 @@ package common is
    subtype real_addr_t is std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
    function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t;

+    -- Minimum page size
+    constant MIN_LG_PGSZ : positive := 12;
+    constant MIN_PAGESZ  : positive := 2 ** MIN_LG_PGSZ;
+
    -- Used for tracking instruction completion and pending register writes
    constant TAG_COUNT : positive := 4;
    constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT);
@ -231,6 +235,7 @@ package common is

    type Fetch1ToIcacheType is record
 	req: std_ulogic;
+        fetch_fail : std_ulogic;
        virt_mode : std_ulogic;
        priv_mode : std_ulogic;
        big_endian : std_ulogic;
@ -238,6 +243,9 @@ package common is
        predicted : std_ulogic;
        pred_ntaken : std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
+        next_nia: std_ulogic_vector(63 downto 0);
+        rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);
+        next_rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);
    end record;

    type IcacheToDecode1Type is record
@ -606,7 +614,7 @@ package common is
        data  : std_ulogic_vector(63 downto 0);
    end record;

-    type MmuToIcacheType is record
+    type MmuToITLBType is record
        tlbld : std_ulogic;
        tlbie : std_ulogic;
        doall : std_ulogic;
@ -658,7 +666,6 @@ package common is
 	redirect: std_ulogic;
        redir_mode: std_ulogic_vector(3 downto 0);
        last_nia: std_ulogic_vector(63 downto 0);
-        br_offset: std_ulogic_vector(63 downto 0);
        br_last: std_ulogic;
        br_taken: std_ulogic;
        abs_br: std_ulogic;
@ -672,7 +679,7 @@ package common is
         write_data => (others => '0'), write_cr_mask => (others => '0'),
         write_cr_data => (others => '0'), write_reg => (others => '0'),
         interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000",
-         last_nia => (others => '0'), br_offset => (others => '0'),
+         last_nia => (others => '0'),
         br_last => '0', br_taken => '0', abs_br => '0',
         srr1 => (others => '0'), msr => (others => '0'));

@ -758,11 +765,14 @@ package common is
        br_nia : std_ulogic_vector(63 downto 0);
        br_last : std_ulogic;
        br_taken : std_ulogic;
+        interrupt : std_ulogic;
+        intr_vec : std_ulogic_vector(11 downto 0);
    end record;
    constant WritebackToFetch1Init : WritebackToFetch1Type :=
        (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0',
         mode_32bit => '0', redirect_nia => (others => '0'),
-         br_last => '0', br_taken => '0', br_nia => (others => '0'));
+         br_last => '0', br_taken => '0', br_nia => (others => '0'),
+         interrupt => '0', intr_vec => x"000");

    type WritebackToRegisterFileType is record
 	write_reg : gspr_index_t;
--- a/core.vhdl
+++ b/core.vhdl
@ -57,7 +57,7 @@ architecture behave of core is
    signal fetch1_to_icache : Fetch1ToIcacheType;
    signal writeback_to_fetch1: WritebackToFetch1Type;
    signal icache_to_decode1 : IcacheToDecode1Type;
-    signal mmu_to_icache : MmuToIcacheType;
+    signal mmu_to_itlb : MmuToITLBType;

    -- decode signals
    signal decode1_to_decode2: Decode1ToDecode2Type;
@ -223,6 +223,7 @@ begin
        generic map (
            RESET_ADDRESS => (others => '0'),
 	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
+            TLB_SIZE => ICACHE_TLB_SIZE,
            HAS_BTC => HAS_BTC
            )
        port map (
@ -231,8 +232,9 @@ begin
 	    alt_reset_in => alt_reset_d,
            stall_in => fetch1_stall_in,
            flush_in => fetch1_flush,
-            inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
+            inval_btc => ex1_icache_inval or mmu_to_itlb.tlbie,
 	    stop_in => dbg_core_stop,
+            m_in => mmu_to_itlb,
            d_in => decode1_to_fetch1,
            w_in => writeback_to_fetch1,
            i_out => fetch1_to_icache,
@ -249,7 +251,6 @@ begin
            LINE_SIZE => 64,
            NUM_LINES => ICACHE_NUM_LINES,
            NUM_WAYS => ICACHE_NUM_WAYS,
-            TLB_SIZE => ICACHE_TLB_SIZE,
            LOG_LENGTH => LOG_LENGTH
            )
        port map(
@ -257,7 +258,6 @@ begin
            rst => rst_icache,
            i_in => fetch1_to_icache,
            i_out => icache_to_decode1,
-            m_in => mmu_to_icache,
            flush_in => fetch1_flush,
            inval_in => dbg_icache_rst or ex1_icache_inval,
            stall_in => icache_stall_in,
@ -454,7 +454,7 @@ begin
            l_out => mmu_to_loadstore1,
            d_out => mmu_to_dcache,
            d_in => dcache_to_mmu,
-            i_out => mmu_to_icache
+            i_out => mmu_to_itlb
            );

    dcache_0: entity work.dcache
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -35,8 +35,7 @@ architecture behaviour of decode1 is
    signal f, fin : Decode1ToFetch1Type;

    type br_predictor_t is record
-        br_nia    : std_ulogic_vector(61 downto 0);
-        br_offset : signed(23 downto 0);
+        br_target : signed(61 downto 0);
        predict   : std_ulogic;
    end record;

@ -94,8 +93,10 @@ architecture behaviour of decode1 is
        INSN_andi_dot    =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE),
        INSN_andis_dot   =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE),
        INSN_attn        =>  (ALU,  NONE, OP_ATTN,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE),
-        INSN_b           =>  (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
-        INSN_bc          =>  (ALU,  NONE, OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_brel        =>  (ALU,  NONE, OP_B,         CIA,        CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_babs        =>  (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_bcrel       =>  (ALU,  NONE, OP_BC,        CIA,        CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_bcabs       =>  (ALU,  NONE, OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bcctr       =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bclr        =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bctar       =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
@ -310,6 +311,7 @@ architecture behaviour of decode1 is
        INSN_rlwimi      =>  (ALU,  NONE, OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
        INSN_rlwinm      =>  (ALU,  NONE, OP_RLC,       NONE,       CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
        INSN_rlwnm       =>  (ALU,  NONE, OP_RLC,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
+        INSN_rnop        =>  (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        INSN_sc          =>  (ALU,  NONE, OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        INSN_setb        =>  (ALU,  NONE, OP_SETB,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        INSN_slbia       =>  (LDST, NONE, OP_TLBIE,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
@ -476,8 +478,6 @@ begin
                end if;
            end if;
            if rst = '1' then
-                br.br_nia <= (others => '0');
-                br.br_offset <= (others => '0');
                br.predict <= '0';
            else
                br <= br_in;
@ -499,8 +499,8 @@ begin
    decode1_1: process(all)
        variable v : Decode1ToDecode2Type;
        variable vr : Decode1ToRegisterFileType;
-        variable br_target : std_ulogic_vector(61 downto 0);
-        variable br_offset : signed(23 downto 0);
+        variable br_nia    : std_ulogic_vector(61 downto 0);
+        variable br_offset : std_ulogic_vector(23 downto 0);
        variable bv : br_predictor_t;
        variable icode : insn_code;
        variable sprn : spr_num_t;
@ -594,31 +594,28 @@ begin
        -- Branch predictor
        -- Note bclr, bcctr and bctar not predicted as we have no
        -- count cache or link stack.
-        br_offset := (others => '0');
+        br_offset := f_in.insn(25 downto 2);
        case icode is
-            when INSN_b =>
+            when INSN_brel | INSN_babs =>
                -- Unconditional branches are always taken
                v.br_pred := '1';
-                br_offset := signed(f_in.insn(25 downto 2));
-            when INSN_bc =>
-                -- Predict backward branches as taken, forward as untaken
+            when INSN_bcrel =>
+                -- Predict backward relative branches as taken, others as untaken
                v.br_pred := f_in.insn(15);
-                br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
+                br_offset(23 downto 14) := (others => '1');
            when others =>
        end case;
-        bv.br_nia := f_in.nia(63 downto 2);
+        br_nia := f_in.nia(63 downto 2);
        if f_in.insn(1) = '1' then
-            bv.br_nia := (others => '0');
+            br_nia := (others => '0');
        end if;
-        bv.br_offset := br_offset;
+        bv.br_target := signed(br_nia) + signed(br_offset);
        if f_in.next_predicted = '1' then
            v.br_pred := '1';
        elsif f_in.next_pred_ntaken = '1' then
            v.br_pred := '0';
        end if;
        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
-        -- after a clock edge...
-        br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);

        -- Work out GPR/FPR read addresses
        -- Note that for prefixed instructions we are working this out based
@ -665,7 +662,7 @@ begin
        d_out.decode <= decode;
        r_out <= vr;
        f_out.redirect <= br.predict;
-        f_out.redirect_nia <= br_target & "00";
+        f_out.redirect_nia <= std_ulogic_vector(br.br_target) & "00";
        flush_out <= bv.predict or br.predict;
    end process;

--- a/decode2.vhdl
+++ b/decode2.vhdl
@ -221,9 +221,8 @@ architecture behaviour of decode2 is
        OP_SHR      => "010",
        OP_EXTSWSLI => "010",
        OP_MUL_L64  => "011",           -- muldiv_result
-        OP_B        => "110",           -- next_nia
-        OP_BC       => "110",
-        OP_BCREG    => "110",
+        OP_BCREG    => "101",           -- ramspr_result
+        OP_RFID     => "101",
        OP_ADDG6S   => "111",           -- misc_result
        OP_ISEL     => "111",
        OP_DARN     => "111",
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@ -47,14 +47,16 @@ package decode_types is
        INSN_andi_dot, -- 10
        INSN_andis_dot,
        INSN_attn,
-        INSN_b,
-        INSN_bc,
+        INSN_brel,
+        INSN_babs,
+        INSN_bcrel,
+        INSN_bcabs,
        INSN_bcctr,
        INSN_bclr,
        INSN_bctar,
-        INSN_brh,
+        INSN_brh, -- 20
        INSN_brw,
-        INSN_brd, -- 20
+        INSN_brd,
        INSN_cbcdtd,
        INSN_cdtbcd,
        INSN_cmpi,
@ -62,9 +64,9 @@ package decode_types is
        INSN_cntlzw,
        INSN_cntlzd,
        INSN_cnttzw,
-        INSN_cnttzd,
+        INSN_cnttzd, -- 30
        INSN_crand,
-        INSN_crandc, -- 30
+        INSN_crandc,
        INSN_creqv,
        INSN_crnand,
        INSN_crnor,
@ -72,9 +74,9 @@ package decode_types is
        INSN_crorc,
        INSN_crxor,
        INSN_darn,
-        INSN_eieio,
+        INSN_eieio, -- 40
        INSN_extsb,
-        INSN_extsh, -- 40
+        INSN_extsh,
        INSN_extsw,
        INSN_extswsli,
        INSN_isync,
@ -82,9 +84,9 @@ package decode_types is
        INSN_ld,
        INSN_ldu,
        INSN_lhau,
-        INSN_lwa,
+        INSN_lwa, -- 50
        INSN_lwzu,
-        INSN_mcrf, -- 50
+        INSN_mcrf,
        INSN_mcrxrx,
        INSN_mfcr,
        INSN_mfmsr,
@ -92,9 +94,9 @@ package decode_types is
        INSN_mtcrf,
        INSN_mtmsr,
        INSN_mtmsrd,
-        INSN_mtspr,
+        INSN_mtspr, -- 60
        INSN_mulli,
-        INSN_neg, -- 60
+        INSN_neg,
        INSN_nop,
        INSN_ori,
        INSN_oris,
@ -102,46 +104,49 @@ package decode_types is
        INSN_popcntw,
        INSN_popcntd,
        INSN_prtyw,
-        INSN_prtyd,
+        INSN_prtyd, -- 70
        INSN_rfid,
-        INSN_rldic, -- 70
+        INSN_rldic,
        INSN_rldicl,
        INSN_rldicr,
        INSN_rldimi,
        INSN_rlwimi,
        INSN_rlwinm,
+        INSN_rnop,
        INSN_sc,
-        INSN_setb,
+        INSN_setb, -- 80
        INSN_slbia,
        INSN_sradi,
-        INSN_srawi, -- 80
+        INSN_srawi,
        INSN_stbu,
        INSN_std,
        INSN_stdu,
        INSN_sthu,
        INSN_stwu,
        INSN_subfic,
-        INSN_subfme,
+        INSN_subfme, -- 90
        INSN_subfze,
        INSN_sync,
-        INSN_tdi, -- 90
+        INSN_tdi,
        INSN_tlbsync,
        INSN_twi,
        INSN_wait,
        INSN_xori,
        INSN_xoris,
+        -- pad to 104
+        INSN_063, INSN_064, INSN_065, INSN_066, INSN_067,

        -- Non-prefixed instructions that have a MLS:D prefixed form and
        -- their corresponding prefixed instructions.
        -- The non-prefixed versions have even indexes so that we can
        -- convert them to the prefixed version by setting bit 0
-        INSN_addi, -- 96
+        INSN_addi, -- 104
        INSN_paddi,
        INSN_lbz,
        INSN_plbz,
-        INSN_lha, -- 100
+        INSN_lha,
        INSN_plha,
-        INSN_lhz,
+        INSN_lhz, -- 110
        INSN_plhz,
        INSN_lwz,
        INSN_plwz,
@ -149,11 +154,11 @@ package decode_types is
        INSN_pstb,
        INSN_sth,
        INSN_psth,
-        INSN_stw, -- 110
+        INSN_stw,
        INSN_pstw,

        -- Slots for non-prefixed opcodes that are 8LS:D when prefixed
-        INSN_lhzu, -- 112
+        INSN_lhzu, -- 120
        INSN_plwa,
        INSN_op57,
        INSN_pld,
@ -161,8 +166,7 @@ package decode_types is
        INSN_pstd,

        -- pad to 128 to simplify comparison logic
-        INSN_076, INSN_077,
-        INSN_078, INSN_079, INSN_07a, INSN_07b, INSN_07c, INSN_07d, INSN_07e, INSN_07f,
+        INSN_07e, INSN_07f,

        -- The following instructions have an RB operand but don't access FPRs
        INSN_add,
@ -475,7 +479,308 @@ package decode_types is
 						update => '0', reserve => '0', is_32bit => '0',
 						is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0', repeat => NONE);

+    -- This function maps an insn_code value to its 6-bit primary opcode;
+    -- insn_codes with no entry in the table yield "XXXXXX".  With this, we don't
+    -- have to store the primary opcode in the icache if we store the insn_code.
+    function recode_primary_opcode(icode: insn_code) return std_ulogic_vector;
+
 end decode_types;

 package body decode_types is
+
+    function recode_primary_opcode(icode: insn_code) return std_ulogic_vector is
+    begin
+        case icode is
+            when INSN_addic     => return "001100";
+            when INSN_addic_dot => return "001101";
+            when INSN_addi      => return "001110";
+            when INSN_addis     => return "001111";
+            when INSN_addpcis   => return "010011";
+            when INSN_andi_dot  => return "011100";
+            when INSN_andis_dot => return "011101";
+            when INSN_attn      => return "000000";
+            when INSN_brel      => return "010010";
+            when INSN_babs      => return "010010";
+            when INSN_bcrel     => return "010000";
+            when INSN_bcabs     => return "010000";
+            when INSN_brh       => return "011111";
+            when INSN_brw       => return "011111";
+            when INSN_brd       => return "011111";
+            when INSN_cmpi      => return "001011";
+            when INSN_cmpli     => return "001010";
+            when INSN_lbz       => return "100010";
+            when INSN_lbzu      => return "100011";
+            when INSN_lfd       => return "110010";
+            when INSN_lfdu      => return "110011";
+            when INSN_lfs       => return "110000";
+            when INSN_lfsu      => return "110001";
+            when INSN_lha       => return "101010";
+            when INSN_lhau      => return "101011";
+            when INSN_lhz       => return "101000";
+            when INSN_lhzu      => return "101001";
+            when INSN_lwz       => return "100000";
+            when INSN_lwzu      => return "100001";
+            when INSN_mulli     => return "000111";
+            when INSN_nop       => return "011000";
+            when INSN_ori       => return "011000";
+            when INSN_oris      => return "011001";
+            when INSN_rlwimi    => return "010100";
+            when INSN_rlwinm    => return "010101";
+            when INSN_rlwnm     => return "010111";
+            when INSN_sc        => return "010001";
+            when INSN_stb       => return "100110";
+            when INSN_stbu      => return "100111";
+            when INSN_stfd      => return "110110";
+            when INSN_stfdu     => return "110111";
+            when INSN_stfs      => return "110100";
+            when INSN_stfsu     => return "110101";
+            when INSN_sth       => return "101100";
+            when INSN_sthu      => return "101101";
+            when INSN_stw       => return "100100";
+            when INSN_stwu      => return "100101";
+            when INSN_subfic    => return "001000";
+            when INSN_tdi       => return "000010";
+            when INSN_twi       => return "000011";
+            when INSN_xori      => return "011010";
+            when INSN_xoris     => return "011011";
+            when INSN_maddhd    => return "000100";
+            when INSN_maddhdu   => return "000100";
+            when INSN_maddld    => return "000100";
+            when INSN_rldic     => return "011110";
+            when INSN_rldicl    => return "011110";
+            when INSN_rldicr    => return "011110";
+            when INSN_rldimi    => return "011110";
+            when INSN_rldcl     => return "011110";
+            when INSN_rldcr     => return "011110";
+            when INSN_ld        => return "111010";
+            when INSN_ldu       => return "111010";
+            when INSN_lwa       => return "111010";
+            when INSN_fdivs     => return "111011";
+            when INSN_fsubs     => return "111011";
+            when INSN_fadds     => return "111011";
+            when INSN_fsqrts    => return "111011";
+            when INSN_fres      => return "111011";
+            when INSN_fmuls     => return "111011";
+            when INSN_frsqrtes  => return "111011";
+            when INSN_fmsubs    => return "111011";
+            when INSN_fmadds    => return "111011";
+            when INSN_fnmsubs   => return "111011";
+            when INSN_fnmadds   => return "111011";
+            when INSN_std       => return "111110";
+            when INSN_stdu      => return "111110";
+            when INSN_fdiv      => return "111111";
+            when INSN_fsub      => return "111111";
+            when INSN_fadd      => return "111111";
+            when INSN_fsqrt     => return "111111";
+            when INSN_fsel      => return "111111";
+            when INSN_fre       => return "111111";
+            when INSN_fmul      => return "111111";
+            when INSN_frsqrte   => return "111111";
+            when INSN_fmsub     => return "111111";
+            when INSN_fmadd     => return "111111";
+            when INSN_fnmsub    => return "111111";
+            when INSN_fnmadd    => return "111111";
+            when INSN_prefix    => return "000001";
+            when INSN_op57      => return "111001";
+            when INSN_op61      => return "111101";
+            when INSN_add       => return "011111";
+            when INSN_addc      => return "011111";
+            when INSN_adde      => return "011111";
+            when INSN_addex     => return "011111";
+            when INSN_addg6s    => return "011111";
+            when INSN_addme     => return "011111";
+            when INSN_addze     => return "011111";
+            when INSN_and       => return "011111";
+            when INSN_andc      => return "011111";
+            when INSN_bperm     => return "011111";
+            when INSN_cbcdtd    => return "011111";
+            when INSN_cdtbcd    => return "011111";
+            when INSN_cmp       => return "011111";
+            when INSN_cmpb      => return "011111";
+            when INSN_cmpeqb    => return "011111";
+            when INSN_cmpl      => return "011111";
+            when INSN_cmprb     => return "011111";
+            when INSN_cntlzd    => return "011111";
+            when INSN_cntlzw    => return "011111";
+            when INSN_cnttzd    => return "011111";
+            when INSN_cnttzw    => return "011111";
+            when INSN_darn      => return "011111";
+            when INSN_dcbf      => return "011111";
+            when INSN_dcbst     => return "011111";
+            when INSN_dcbt      => return "011111";
+            when INSN_dcbtst    => return "011111";
+            when INSN_dcbz      => return "011111";
+            when INSN_divdeu    => return "011111";
+            when INSN_divweu    => return "011111";
+            when INSN_divde     => return "011111";
+            when INSN_divwe     => return "011111";
+            when INSN_divdu     => return "011111";
+            when INSN_divwu     => return "011111";
+            when INSN_divd      => return "011111";
+            when INSN_divw      => return "011111";
+            when INSN_eieio     => return "011111";
+            when INSN_eqv       => return "011111";
+            when INSN_extsb     => return "011111";
+            when INSN_extsh     => return "011111";
+            when INSN_extsw     => return "011111";
+            when INSN_extswsli  => return "011111";
+            when INSN_icbi      => return "011111";
+            when INSN_icbt      => return "011111";
+            when INSN_isel      => return "011111";
+            when INSN_lbarx     => return "011111";
+            when INSN_lbzcix    => return "011111";
+            when INSN_lbzux     => return "011111";
+            when INSN_lbzx      => return "011111";
+            when INSN_ldarx     => return "011111";
+            when INSN_ldbrx     => return "011111";
+            when INSN_ldcix     => return "011111";
+            when INSN_ldux      => return "011111";
+            when INSN_ldx       => return "011111";
+            when INSN_lfdx      => return "011111";
+            when INSN_lfdux     => return "011111";
+            when INSN_lfiwax    => return "011111";
+            when INSN_lfiwzx    => return "011111";
+            when INSN_lfsx      => return "011111";
+            when INSN_lfsux     => return "011111";
+            when INSN_lharx     => return "011111";
+            when INSN_lhaux     => return "011111";
+            when INSN_lhax      => return "011111";
+            when INSN_lhbrx     => return "011111";
+            when INSN_lhzcix    => return "011111";
+            when INSN_lhzux     => return "011111";
+            when INSN_lhzx      => return "011111";
+            when INSN_lwarx     => return "011111";
+            when INSN_lwaux     => return "011111";
+            when INSN_lwax      => return "011111";
+            when INSN_lwbrx     => return "011111";
+            when INSN_lwzcix    => return "011111";
+            when INSN_lwzux     => return "011111";
+            when INSN_lwzx      => return "011111";
+            when INSN_mcrxrx    => return "011111";
+            when INSN_mfcr      => return "011111";
+            when INSN_mfmsr     => return "011111";
+            when INSN_mfspr     => return "011111";
+            when INSN_modud     => return "011111";
+            when INSN_moduw     => return "011111";
+            when INSN_modsd     => return "011111";
+            when INSN_modsw     => return "011111";
+            when INSN_mtcrf     => return "011111";
+            when INSN_mtmsr     => return "011111";
+            when INSN_mtmsrd    => return "011111";
+            when INSN_mtspr     => return "011111";
+            when INSN_mulhd     => return "011111";
+            when INSN_mulhdu    => return "011111";
+            when INSN_mulhw     => return "011111";
+            when INSN_mulhwu    => return "011111";
+            when INSN_mulld     => return "011111";
+            when INSN_mullw     => return "011111";
+            when INSN_nand      => return "011111";
+            when INSN_neg       => return "011111";
+            when INSN_rnop      => return "011111";
+            when INSN_nor       => return "011111";
+            when INSN_or        => return "011111";
+            when INSN_orc       => return "011111";
+            when INSN_popcntb   => return "011111";
+            when INSN_popcntd   => return "011111";
+            when INSN_popcntw   => return "011111";
+            when INSN_prtyd     => return "011111";
+            when INSN_prtyw     => return "011111";
+            when INSN_setb      => return "011111";
+            when INSN_slbia     => return "011111";
+            when INSN_sld       => return "011111";
+            when INSN_slw       => return "011111";
+            when INSN_srad      => return "011111";
+            when INSN_sradi     => return "011111";
+            when INSN_sraw      => return "011111";
+            when INSN_srawi     => return "011111";
+            when INSN_srd       => return "011111";
+            when INSN_srw       => return "011111";
+            when INSN_stbcix    => return "011111";
+            when INSN_stbcx     => return "011111";
+            when INSN_stbux     => return "011111";
+            when INSN_stbx      => return "011111";
+            when INSN_stdbrx    => return "011111";
+            when INSN_stdcix    => return "011111";
+            when INSN_stdcx     => return "011111";
+            when INSN_stdux     => return "011111";
+            when INSN_stdx      => return "011111";
+            when INSN_stfdx     => return "011111";
+            when INSN_stfdux    => return "011111";
+            when INSN_stfiwx    => return "011111";
+            when INSN_stfsx     => return "011111";
+            when INSN_stfsux    => return "011111";
+            when INSN_sthbrx    => return "011111";
+            when INSN_sthcix    => return "011111";
+            when INSN_sthcx     => return "011111";
+            when INSN_sthux     => return "011111";
+            when INSN_sthx      => return "011111";
+            when INSN_stwbrx    => return "011111";
+            when INSN_stwcix    => return "011111";
+            when INSN_stwcx     => return "011111";
+            when INSN_stwux     => return "011111";
+            when INSN_stwx      => return "011111";
+            when INSN_subf      => return "011111";
+            when INSN_subfc     => return "011111";
+            when INSN_subfe     => return "011111";
+            when INSN_subfme    => return "011111";
+            when INSN_subfze    => return "011111";
+            when INSN_sync      => return "011111";
+            when INSN_td        => return "011111";
+            when INSN_tw        => return "011111";
+            when INSN_tlbie     => return "011111";
+            when INSN_tlbiel    => return "011111";
+            when INSN_tlbsync   => return "011111";
+            when INSN_wait      => return "011111";
+            when INSN_xor       => return "011111";
+            when INSN_bcctr     => return "010011";
+            when INSN_bclr      => return "010011";
+            when INSN_bctar     => return "010011";
+            when INSN_crand     => return "010011";
+            when INSN_crandc    => return "010011";
+            when INSN_creqv     => return "010011";
+            when INSN_crnand    => return "010011";
+            when INSN_crnor     => return "010011";
+            when INSN_cror      => return "010011";
+            when INSN_crorc     => return "010011";
+            when INSN_crxor     => return "010011";
+            when INSN_isync     => return "010011";
+            when INSN_mcrf      => return "010011";
+            when INSN_rfid      => return "010011";
+            when INSN_fcfids    => return "111011";
+            when INSN_fcfidus   => return "111011";
+            when INSN_fcmpu     => return "111111";
+            when INSN_fcmpo     => return "111111";
+            when INSN_mcrfs     => return "111111";
+            when INSN_ftdiv     => return "111111";
+            when INSN_ftsqrt    => return "111111";
+            when INSN_mtfsb     => return "111111";
+            when INSN_mtfsfi    => return "111111";
+            when INSN_fmrgow    => return "111111";
+            when INSN_fmrgew    => return "111111";
+            when INSN_mffs      => return "111111";
+            when INSN_mtfsf     => return "111111";
+            when INSN_fcpsgn    => return "111111";
+            when INSN_fneg      => return "111111";
+            when INSN_fmr       => return "111111";
+            when INSN_fnabs     => return "111111";
+            when INSN_fabs      => return "111111";
+            when INSN_frin      => return "111111";
+            when INSN_friz      => return "111111";
+            when INSN_frip      => return "111111";
+            when INSN_frim      => return "111111";
+            when INSN_frsp      => return "111111";
+            when INSN_fctiw     => return "111111";
+            when INSN_fctiwu    => return "111111";
+            when INSN_fctid     => return "111111";
+            when INSN_fcfid     => return "111111";
+            when INSN_fctidu    => return "111111";
+            when INSN_fcfidu    => return "111111";
+            when INSN_fctiwz    => return "111111";
+            when INSN_fctiwuz   => return "111111";
+            when INSN_fctidz    => return "111111";
+            when INSN_fctiduz   => return "111111";
+            when others         => return "XXXXXX";
+        end case;
+    end;
+
 end decode_types;
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -95,6 +95,7 @@ architecture behaviour of execute1 is
        exception : std_ulogic;
        trap : std_ulogic;
        advance_nia : std_ulogic;
+        redir_to_next : std_ulogic;
        new_msr : std_ulogic_vector(63 downto 0);
        take_branch : std_ulogic;
        direct_branch : std_ulogic;
@ -124,6 +125,9 @@ architecture behaviour of execute1 is
        res2_sel : std_ulogic_vector(1 downto 0);
        spr_select : spr_id;
        pmu_spr_num : std_ulogic_vector(4 downto 0);
+        redir_to_next : std_ulogic;
+        advance_nia : std_ulogic;
+        lr_from_next : std_ulogic;
 	mul_in_progress : std_ulogic;
        mul_finish : std_ulogic;
        div_in_progress : std_ulogic;
@ -145,6 +149,7 @@ architecture behaviour of execute1 is
         prev_prefixed => '0',
         oe => '0', mul_select => "00", res2_sel => "00",
         spr_select => spr_id_init, pmu_spr_num => 5x"0",
+         redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
         no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
         taken_branch_event => '0', br_mispredict => '0',
@ -510,6 +515,7 @@ begin
        variable wr_addr : ramspr_index;
        variable even_wr_enab, odd_wr_enab : std_ulogic;
        variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0);
+        variable ramspr_even_data : std_ulogic_vector(63 downto 0);
        variable doit : std_ulogic;
    begin
        -- Read address mux and async RAM reading
@ -533,11 +539,16 @@ begin
        else
            wr_addr := ex1.ramspr_wraddr;
        end if;
+        if ex1.lr_from_next = '1' then
+            ramspr_even_data := next_nia;
+        else
+            ramspr_even_data := ex1.e.write_data;
+        end if;
        if interrupt_in.intr = '1' then
            even_wr_data := ex2.e.last_nia;
            odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1);
        else
-            even_wr_data := ex1.e.write_data;
+            even_wr_data := ramspr_even_data;
            odd_wr_data := ex1.ramspr_odd_data;
        end if;
        ramspr_wr_addr <= wr_addr;
@ -550,7 +561,7 @@ begin
        -- We assume no instruction executes in the cycle immediately following
        -- an interrupt, so we don't need to bypass interrupt data
        if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then
-            ramspr_even <= ex1.e.write_data;
+            ramspr_even <= ramspr_even_data;
        else
            ramspr_even <= even_rd_data;
        end if;
@ -593,7 +604,6 @@ begin
        shortmul_result    when "011",
        muldiv_result      when "100",
        ramspr_result      when "101",
-        next_nia           when "110",
        misc_result        when others;

    execute1_0: process(clk)
@ -1016,7 +1026,6 @@ begin
        v.e.mode_32bit := not ex1.msr(MSR_SF);
        v.e.instr_tag := e_in.instr_tag;
        v.e.last_nia := e_in.nia;
-        v.e.br_offset := 64x"4";

        v.se.ramspr_write_even := e_in.ramspr_write_even;
        v.se.ramspr_write_odd := e_in.ramspr_write_odd;
@ -1114,8 +1123,6 @@ begin
                v.direct_branch := '1';
                v.e.br_last := '1';
                v.e.br_taken := '1';
-                v.e.br_offset := b_in;
-                v.e.abs_br := insn_aa(e_in.insn);
                if e_in.br_pred = '0' then
                    -- should never happen
                    v.e.redirect := '1';
@ -1129,14 +1136,13 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
-                if v.take_branch = '1' then
-                    v.e.br_offset := b_in;
-                    v.e.abs_br := insn_aa(e_in.insn);
-                end if;
                -- Mispredicted branches cause a redirect
                if v.take_branch /= e_in.br_pred then
                    v.e.redirect := '1';
                end if;
+                if v.take_branch = '0' then
+                    v.redir_to_next := '1';
+                end if;
                v.direct_branch := '1';
                v.e.br_last := '1';
                v.e.br_taken := v.take_branch;
@ -1150,10 +1156,6 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
-                if v.take_branch = '1' then
-                    v.e.br_offset := ramspr_result;
-                    v.e.abs_br := '1';
-                end if;
                -- Indirect branches are never predicted taken
                v.e.redirect := v.take_branch;
                v.e.br_taken := v.take_branch;
@ -1177,8 +1179,6 @@ begin
                    v.new_msr(MSR_DR) := '1';
                end if;
                v.se.write_msr := '1';
-                v.e.br_offset := ramspr_result;
-                v.e.abs_br := '1';
                v.e.redirect := '1';
                v.se.write_cfar := '1';
                if HAS_FPU then
@ -1292,6 +1292,7 @@ begin

 	    when OP_ISYNC =>
 		v.e.redirect := '1';
+                v.redir_to_next := '1';

 	    when OP_ICBI =>
 		v.se.icache_inval := '1';
@ -1406,6 +1407,7 @@ begin
            v.mul_select := e_in.sub_select(1 downto 0);
            v.se := side_effect_init;
            v.ramspr_wraddr := e_in.ramspr_wraddr;
+            v.lr_from_next := e_in.lr;
            v.ramspr_odd_data := actions.ramspr_odd_data;
        end if;

@ -1423,9 +1425,6 @@ begin

        irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);

-	-- Next insn adder used in a couple of places
-	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
-
 	-- rotator control signals
 	right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
 	rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
@ -1507,10 +1506,9 @@ begin
            x_to_divider.valid <= actions.start_div;
            v.div_in_progress := actions.start_div;
            v.br_mispredict := v.e.redirect and actions.direct_branch;
+            v.advance_nia := actions.advance_nia;
+            v.redir_to_next := actions.redir_to_next;
            exception := actions.trap;
-            if actions.advance_nia = '1' then
-                v.e.last_nia := next_nia;
-            end if;

            -- Go busy while division is happening because the
            -- divider is not pipelined.  Also go busy while a
@ -1681,6 +1679,9 @@ begin
        variable sign, zero : std_ulogic;
        variable rcnz_hi, rcnz_lo : std_ulogic;
    begin
+	-- Next insn adder used in a couple of places
+	next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4);
+
 	v := ex2;
        if stage2_stall = '0' then
            v.e := ex1.e;
@ -1688,6 +1689,9 @@ begin
            v.ext_interrupt := ex1.ext_interrupt;
            v.taken_branch_event := ex1.taken_branch_event;
            v.br_mispredict := ex1.br_mispredict;
+            if ex1.advance_nia = '1' then
+                v.e.last_nia := next_nia;
+            end if;
        end if;

        if ex1.se.mult_32s = '1' and ex1.oe = '1' then
@ -1748,10 +1752,12 @@ begin
        else
            sprres := pmu_to_x.spr_val;
        end if;
-        if ex1.res2_sel(1) = '0' then
-            ex_result := rcresult;
-        else
+        if ex1.res2_sel(1) = '1' then
            ex_result := sprres;
+        elsif ex1.redir_to_next = '1' then
+            ex_result := next_nia;
+        else
+            ex_result := rcresult;
        end if;

        cr_res := ex1.e.write_cr_data;
--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@ -3,12 +3,14 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;

 library work;
+use work.utils.all;
 use work.common.all;

 entity fetch1 is
    generic(
 	RESET_ADDRESS     : std_logic_vector(63 downto 0) := (others => '0');
 	ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
+        TLB_SIZE          : positive := 64;        -- L1 ITLB number of entries (direct mapped)
        HAS_BTC           : boolean := true
 	);
    port(
@ -21,6 +23,7 @@ entity fetch1 is
        inval_btc     : in std_ulogic;
 	stop_in       : in std_ulogic;
 	alt_reset_in  : in std_ulogic;
+        m_in          : in MmuToITLBType;

 	-- redirect from writeback unit
 	w_in          : in WritebackToFetch1Type;
@ -40,14 +43,32 @@ architecture behaviour of fetch1 is
    type reg_internal_t is record
        mode_32bit: std_ulogic;
        rd_is_niap4: std_ulogic;
-        predicted_taken: std_ulogic;
-        predicted_nia: std_ulogic_vector(63 downto 0);
+        tlbcheck: std_ulogic;
+        tlbstall: std_ulogic;
+        next_nia: std_ulogic_vector(63 downto 0);
    end record;
+
+    -- Mini effective to real translation cache: two entries (suffix 0 and 1), each with its own valid bit
+    type erat_t is record
+        epn0: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0);  -- entry 0 effective page number (address bits 63 downto MIN_LG_PGSZ)
+        epn1: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0);  -- entry 1 effective page number
+        rpn0: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);  -- entry 0 real page number (PTE bits REAL_ADDR_BITS-1 downto MIN_LG_PGSZ)
+        rpn1: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);  -- entry 1 real page number
+        priv0: std_ulogic;  -- entry 0 copy of PTE bit 3; when '1', a fetch with priv_mode = '0' is failed
+        priv1: std_ulogic;  -- entry 1 copy of PTE bit 3
+        valid: std_ulogic_vector(1 downto 0);  -- valid(n) = '1' => entry n holds a translation
+        mru: std_ulogic;        -- '1' => entry 1 most recently used; the other entry is the one refilled from the ITLB
+    end record;
+
    signal r, r_next : Fetch1ToIcacheType;
    signal r_int, r_next_int : reg_internal_t;
    signal advance_nia : std_ulogic;
    signal log_nia : std_ulogic_vector(42 downto 0);

+    signal erat : erat_t;
+    signal erat_hit : std_ulogic;
+    signal erat_sel : std_ulogic;
+
    constant BTC_ADDR_BITS : integer := 10;
    constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
    constant BTC_TARGET_BITS : integer := 62;
@ -55,43 +76,75 @@ architecture behaviour of fetch1 is
    constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 2;
    type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);

+    signal btc_rd_addr : unsigned(BTC_ADDR_BITS - 1 downto 0);
    signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
    signal btc_rd_valid : std_ulogic := '0';

+    -- L1 ITLB.
+    constant TLB_BITS : natural := log2(TLB_SIZE);
+    constant TLB_EA_TAG_BITS : natural := 64 - (MIN_LG_PGSZ + TLB_BITS);
+    constant TLB_PTE_BITS : natural := 64;
+
+    subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
+    type tlb_valids_t is array(tlb_index_t) of std_ulogic;
+    subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
+    type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
+    subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
+    type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
+
+    signal itlb_valids : tlb_valids_t;
+    signal itlb_tags : tlb_tags_t;
+    signal itlb_ptes : tlb_ptes_t;
+
+    -- Values read from above arrays on a clock edge
+    signal itlb_valid : std_ulogic;
+    signal itlb_ttag : tlb_tag_t;
+    signal itlb_pte : tlb_pte_t;
+    signal itlb_hit : std_ulogic;
+
+    -- Privilege bit from PTE EAA field
+    signal eaa_priv  : std_ulogic;
+
+    -- Direct-mapped TLB index hash: XOR together the three consecutive
+    -- TLB_BITS-wide fields of the address that sit just above the page offset.
+    function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable folded : std_ulogic_vector(TLB_BITS - 1 downto 0) := (others => '0');
+    begin
+        -- Field k covers bits MIN_LG_PGSZ + k*TLB_BITS upwards, TLB_BITS wide
+        for k in 0 to 2 loop
+            folded := folded xor addr(MIN_LG_PGSZ + (k + 1) * TLB_BITS - 1 downto MIN_LG_PGSZ + k * TLB_BITS);
+        end loop;
+        return folded;
+    end;
+
 begin

    regs : process(clk)
    begin
 	if rising_edge(clk) then
            log_nia <= r.nia(63) & r.nia(43 downto 2);
-	    if r /= r_next then
+	    if r /= r_next and advance_nia = '1' then
 		report "fetch1 rst:" & std_ulogic'image(rst) &
                    " IR:" & std_ulogic'image(r_next.virt_mode) &
                    " P:" & std_ulogic'image(r_next.priv_mode) &
                    " E:" & std_ulogic'image(r_next.big_endian) &
                    " 32:" & std_ulogic'image(r_next_int.mode_32bit) &
+                    " I:" & std_ulogic'image(w_in.interrupt) &
 		    " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) &
 		    " S:" & std_ulogic'image(stall_in) &
 		    " T:" & std_ulogic'image(stop_in) &
-		    " nia:" & to_hstring(r_next.nia);
+		    " nia:" & to_hstring(r_next.nia) &
+                    " req:" & std_ulogic'image(r_next.req) &
+                    " FF:" & std_ulogic'image(r_next.fetch_fail);
 	    end if;
-            if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
-                r.virt_mode <= r_next.virt_mode;
-                r.priv_mode <= r_next.priv_mode;
-                r.big_endian <= r_next.big_endian;
-                r_int.mode_32bit <= r_next_int.mode_32bit;
-            end if;
            if advance_nia = '1' then
-                r.predicted <= r_next.predicted;
-                r.pred_ntaken <= r_next.pred_ntaken;
-                r.nia <= r_next.nia;
-                r_int.predicted_taken <= r_next_int.predicted_taken;
-                r_int.predicted_nia <= r_next_int.predicted_nia;
-                r_int.rd_is_niap4 <= r_next_int.rd_is_niap4;
+                r <= r_next;
+                r_int <= r_next_int;
            end if;
            -- always send the up-to-date stop mark and req
            r.stop_mark <= stop_in;
-            r.req <= not rst and not stop_in;
+            r.req <= r_next.req;
+            r.fetch_fail <= r_next.fetch_fail;
+            r_int.tlbcheck <= r_next_int.tlbcheck;
+            r_int.tlbstall <= r_next_int.tlbstall;
 	end if;
    end process;
    log_out <= log_nia;
@ -119,15 +172,13 @@ begin
            variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
        begin
            if rising_edge(clk) then
-                raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
-                         to_unsigned(2, BTC_ADDR_BITS);
                if advance_nia = '1' then
-		    if is_X(raddr) then
+		    if is_X(btc_rd_addr) then
 			btc_rd_data <= (others => 'X');
 			btc_rd_valid <= 'X';
 		    else
-			btc_rd_data <= btc_memory(to_integer(raddr));
-			btc_rd_valid <= btc_valids(to_integer(raddr));
+			btc_rd_data <= btc_memory(to_integer(btc_rd_addr));
+			btc_rd_valid <= btc_valids(to_integer(btc_rd_addr));
 		    end if;
                end if;
                if btc_wr = '1' then
@ -144,70 +195,250 @@ begin
        end process;
    end generate;

+    erat_sync : process(clk)  -- clocked update of the two-entry ERAT: invalidate, MRU tracking, and refill
+    begin
+        if rising_edge(clk) then
+            if rst /= '0' or m_in.tlbie = '1' then  -- any rst value other than '0', or any tlbie, drops both entries
+                erat.valid <= "00";
+                erat.mru <= '0';
+            else
+                if erat_hit = '1' then  -- erat_hit is only raised when advance_nia is set
+                    erat.mru <= erat_sel;  -- remember which entry hit; overridden below if a refill happens this cycle
+                end if;
+                if m_in.tlbld = '1' then  -- new translation from the MMU: install it in entry 0
+                    erat.epn0 <= m_in.addr(63 downto MIN_LG_PGSZ);
+                    erat.rpn0 <= m_in.pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ);
+                    erat.priv0 <= m_in.pte(3);
+                    erat.valid(0) <= '1';
+                    erat.valid(1) <= '0';  -- entry 1 is discarded on every MMU load
+                    erat.mru <= '0';
+                elsif r_int.tlbcheck = '1' and itlb_hit = '1' then  -- ITLB lookup for r.nia hit: copy it into the non-MRU entry
+                    if erat.mru = '0' then  -- entry 0 is MRU, so replace entry 1
+                        erat.epn1 <= r.nia(63 downto MIN_LG_PGSZ);
+                        erat.rpn1 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ);
+                        erat.priv1 <= itlb_pte(3);
+                        erat.valid(1) <= '1';
+                    else  -- entry 1 is MRU, so replace entry 0
+                        erat.epn0 <= r.nia(63 downto MIN_LG_PGSZ);
+                        erat.rpn0 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ);
+                        erat.priv0 <= itlb_pte(3);
+                        erat.valid(0) <= '1';
+                    end if;
+                    erat.mru <= not erat.mru;  -- the entry just filled becomes the MRU one (last assignment to erat.mru wins)
+                end if;
+            end if;
+        end if;
+    end process;
+
+    -- Synchronous ITLB read port.  Whenever the NIA advances, the entry
+    -- selected by hashing the upcoming NIA (r_next.nia) is latched into
+    -- itlb_valid / itlb_ttag / itlb_pte for use in the following cycle.
+    itlb_read : process(clk)
+        variable rd_hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
+    begin
+        if rising_edge(clk) then
+            if advance_nia = '1' then
+                rd_hash := hash_ea(r_next.nia);
+                if not is_X(rd_hash) then
+                    itlb_valid <= itlb_valids(to_integer(unsigned(rd_hash)));
+                    itlb_ttag <= itlb_tags(to_integer(unsigned(rd_hash)));
+                    itlb_pte <= itlb_ptes(to_integer(unsigned(rd_hash)));
+                else
+                    -- Index contains metavalues: propagate X rather than
+                    -- indexing the arrays with an undefined address
+                    itlb_valid <= 'X';
+                    itlb_ttag <= (others => 'X');
+                    itlb_pte <= (others => 'X');
+                end if;
+            end if;
+        end if;
+    end process;
+
+    -- ITLB hit detection: the entry read on the last clock edge hits when
+    -- its tag equals the tag bits of the current NIA and it is valid.
+    itlb_lookup : process(all)
+    begin
+        if itlb_ttag /= r.nia(63 downto MIN_LG_PGSZ + TLB_BITS) then
+            itlb_hit <= '0';
+        else
+            itlb_hit <= itlb_valid;
+        end if;
+    end process;
+
+    -- ITLB update (clocked).  In priority order:
+    --   * rst, or tlbie with doall: clear every valid bit;
+    --   * tlbie alone: clear the valid bit of the entry that m_in.addr hashes to;
+    --   * tlbld: write tag, PTE and valid bit of the entry that m_in.addr hashes to.
+    -- The assertions stop simulation if the write index contains metavalues;
+    -- their messages name the ITLB (not the icache) and the operation involved.
+    itlb_update: process(clk)
+        variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0);
+    begin
+        if rising_edge(clk) then
+            wr_index := hash_ea(m_in.addr);
+            if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
+                -- clear all valid bits
+                for i in tlb_index_t loop
+                    itlb_valids(i) <= '0';
+                end loop;
+            elsif m_in.tlbie = '1' then
+                assert not is_X(wr_index) report "ITLB index invalid on tlbie" severity FAILURE;
+                -- clear entry regardless of hit or miss
+                itlb_valids(to_integer(unsigned(wr_index))) <= '0';
+            elsif m_in.tlbld = '1' then
+                assert not is_X(wr_index) report "ITLB index invalid on tlbld" severity FAILURE;
+                -- the tag holds the EA bits above the page offset and the lowest index field
+                itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto MIN_LG_PGSZ + TLB_BITS);
+                itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte;
+                itlb_valids(to_integer(unsigned(wr_index))) <= '1';
+            end if;
+            --ev.itlb_miss_resolved <= m_in.tlbld and not rst;
+        end if;
+    end process;
+
    comb : process(all)
 	variable v : Fetch1ToIcacheType;
 	variable v_int : reg_internal_t;
+        variable next_nia : std_ulogic_vector(63 downto 0);
+        variable m32 : std_ulogic;
+        variable ehit, esel : std_ulogic;
+        variable eaa_priv : std_ulogic;
    begin
 	v := r;
 	v_int := r_int;
        v.predicted := '0';
        v.pred_ntaken := '0';
-        v_int.predicted_taken := '0';
-        v_int.rd_is_niap4 := '0';
+        v.req := not stop_in;
+        v_int.tlbstall := r_int.tlbcheck;
+        v_int.tlbcheck := '0';
+
+        if r_int.tlbcheck = '1' and itlb_hit = '0' then
+            v.fetch_fail := '1';
+        end if;

-	if rst = '1' then
+        -- Combinatorial computation of the CIA for the next cycle.
+        -- Needs to be simple so the result can be used for RAM
+        -- and TLB access in the icache.
+        -- If we are stalled, this still advances, and the assumption
+        -- is that it will not be used.
+        m32 := r_int.mode_32bit;
+        if w_in.redirect = '1' then
+            next_nia := w_in.redirect_nia(63 downto 2) & "00";
+            m32 := w_in.mode_32bit;
+            v.virt_mode := w_in.virt_mode;
+            v.priv_mode := w_in.priv_mode;
+            v.big_endian := w_in.big_endian;
+            v_int.mode_32bit := w_in.mode_32bit;
+            v.fetch_fail := '0';
+        elsif d_in.redirect = '1' then
+            next_nia := d_in.redirect_nia(63 downto 2) & "00";
+            v.fetch_fail := '0';
+        elsif r_int.tlbstall = '1' then
+            -- this case is needed so that the correct icache tags are read
+            next_nia := r.nia;
+        else
+            next_nia := r_int.next_nia;
+        end if;
+        if m32 = '1' then
+            next_nia(63 downto 32) := (others => '0');
+        end if;
+        v.nia := next_nia;
+
+        v_int.next_nia := std_ulogic_vector(unsigned(next_nia) + 4);
+
+        -- Use v_int.next_nia as the BTC read address before it gets possibly
+        -- overridden with the reset or interrupt address or the predicted branch
+        -- target address, in order to improve timing.  If it gets overridden then
+        -- rd_is_niap4 gets cleared to indicate that the BTC data doesn't apply.
+        btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2));
+        v_int.rd_is_niap4 := '1';
+
+        -- If the last NIA value went down with a stop mark, it didn't get
+        -- executed, and hence we shouldn't increment NIA.
+        advance_nia <= rst or w_in.interrupt or w_in.redirect or d_in.redirect or
+                       (not r.stop_mark and not (r.req and stall_in));
+        -- reduce metavalue warnings in sim
+        if is_X(rst) then
+            advance_nia <= '1';
+        end if;
+
+        -- Translate next_nia to real if possible, otherwise we have to stall
+        -- and look up the TLB.
+        ehit := '0';
+        esel := '0';
+        eaa_priv := '1';
+        if next_nia(63 downto MIN_LG_PGSZ) = erat.epn1 and erat.valid(1) = '1' then
+            ehit := '1';
+            esel := '1';
+        end if;
+        if next_nia(63 downto MIN_LG_PGSZ) = erat.epn0 and erat.valid(0) = '1' then
+            ehit := '1';
+        end if;
+        if v.virt_mode = '0' then
+            v.rpn := v.nia(REAL_ADDR_BITS - 1 downto MIN_LG_PGSZ);
+            eaa_priv := '1';
+        elsif esel = '1' then
+            v.rpn := erat.rpn1;
+            eaa_priv := erat.priv1;
+        else
+            v.rpn := erat.rpn0;
+            eaa_priv := erat.priv0;
+        end if;
+        if advance_nia = '1' and ehit = '0' and v.virt_mode = '1' and
+                r_int.tlbcheck = '0' and v.fetch_fail = '0' then
+            v_int.tlbstall := '1';
+            v_int.tlbcheck := '1';
+        end if;
+        if ehit = '1' or v.virt_mode = '0' then
+            if eaa_priv = '1' and v.priv_mode = '0' then
+                v.fetch_fail := '1';
+            else
+                v.fetch_fail := '0';
+            end if;
+        end if;
+        erat_hit <= ehit and advance_nia;
+        erat_sel <= esel;
+
+	if rst /= '0' then
 	    if alt_reset_in = '1' then
-		v.nia :=  ALT_RESET_ADDRESS;
+		v_int.next_nia :=  ALT_RESET_ADDRESS;
 	    else
-		v.nia :=  RESET_ADDRESS;
+		v_int.next_nia :=  RESET_ADDRESS;
 	    end if;
+        elsif w_in.interrupt = '1' then
+            v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00";
+        end if;
+	if rst /= '0' or w_in.interrupt = '1' then
+            v.req := '0';
            v.virt_mode := '0';
            v.priv_mode := '1';
            v.big_endian := '0';
            v_int.mode_32bit := '0';
-            v_int.predicted_nia := (others => '0');
-	elsif w_in.redirect = '1' then
-	    v.nia := w_in.redirect_nia(63 downto 2) & "00";
-            if w_in.mode_32bit = '1' then
-                v.nia(63 downto 32) := (others => '0');
-            end if;
-            v.virt_mode := w_in.virt_mode;
-            v.priv_mode := w_in.priv_mode;
-            v.big_endian := w_in.big_endian;
-            v_int.mode_32bit := w_in.mode_32bit;
-        elsif d_in.redirect = '1' then
-            v.nia := d_in.redirect_nia(63 downto 2) & "00";
-            if r_int.mode_32bit = '1' then
-                v.nia(63 downto 32) := (others => '0');
-            end if;
-        elsif r_int.predicted_taken = '1' then
-            v.nia := r_int.predicted_nia;
-        elsif r.req = '1' then
-            v_int.rd_is_niap4 := '1';
-            v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
-            if r_int.mode_32bit = '1' then
-                v.nia(63 downto 32) := x"00000000";
-            end if;
-            if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
+            v_int.rd_is_niap4 := '0';
+            v_int.tlbstall := '0';
+            v_int.tlbcheck := '0';
+            v.fetch_fail := '0';
+        end if;
+        if v.fetch_fail = '1' then
+            v_int.tlbstall := '1';
+        end if;
+        if v_int.tlbstall = '1' then
+            v.req := '0';
+        end if;
+
+        -- If there is a valid entry in the BTC which corresponds to the next instruction,
+        -- use that to predict the address of the instruction after that.
+        -- (w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0')
+        -- implies v.nia = r_int.next_nia.
+        -- r_int.rd_is_niap4 implies r_int.next_nia is the address used to read the BTC.
+	if v.req = '1' and w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0' and 
+                btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
                btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and
                btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS)
-                = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
-                v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1);
-                v.predicted := btc_rd_data(BTC_WIDTH - 1);
-                v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1);
+                    = r_int.next_nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
+            v.predicted := btc_rd_data(BTC_WIDTH - 1);
+            v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1);
+            if btc_rd_data(BTC_WIDTH - 1) = '1' then
+                v_int.next_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";
+                v_int.rd_is_niap4 := '0';
            end if;
        end if;
-        v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";
-
-        -- If the last NIA value went down with a stop mark, it didn't get
-        -- executed, and hence we shouldn't increment NIA.
-        advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);

 	r_next <= v;
 	r_next_int <= v_int;

 	-- Update outputs to the icache
 	i_out <= r;
+        i_out.next_nia <= next_nia;
+        i_out.next_rpn <= v.rpn;

    end process;

--- a/icache.vhdl
+++ b/icache.vhdl
@ -41,10 +41,6 @@ entity icache is
        NUM_LINES : positive := 32;
        -- Number of ways
        NUM_WAYS  : positive := 4;
-        -- L1 ITLB number of entries (direct mapped)
-        TLB_SIZE : positive := 64;
-        -- L1 ITLB log_2(page_size)
-        TLB_LG_PGSZ : positive := 12;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -55,8 +51,6 @@ entity icache is
        i_in         : in Fetch1ToIcacheType;
        i_out        : out IcacheToDecode1Type;

-        m_in         : in MmuToIcacheType;
-
        stall_in     : in std_ulogic;
 	stall_out    : out std_ulogic;
 	flush_in     : in std_ulogic;
@ -139,49 +133,24 @@ architecture rtl of icache is
    -- The cache data BRAM organized as described above for each way
    subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0);

-    -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-    -- not handle a clean (commented) definition of the cache tags as a 3d
-    -- memory. For now, work around it by putting all the tags
+    -- We define a cache tag RAM per way, accessed synchronously
    subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
--    type cache_tags_set_t is array(way_t) of cache_tag_t;
--    type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-    constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
-    subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
-    type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+    type cache_tags_set_t is array(way_t) of cache_tag_t;
+    type cache_tags_array_t is array(index_t) of cache_tag_t;
+
+    -- Set of cache tags read on the last clock edge
+    signal cache_tags_set : cache_tags_set_t;
+    -- Set of cache tags for snooping writes to memory
+    signal snoop_tags_set : cache_tags_set_t;
+    -- Flags indicating write-hit-read on the cache tags
+    signal tag_overwrite : std_ulogic_vector(NUM_WAYS - 1 downto 0);

    -- The cache valid bits
    subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
    type cache_valids_t is array(index_t) of cache_way_valids_t;
    type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
-
-    -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-    signal cache_tags   : cache_tags_array_t;
    signal cache_valids : cache_valids_t;

-    attribute ram_style : string;
-    attribute ram_style of cache_tags : signal is "distributed";
-
-    -- L1 ITLB.
-    constant TLB_BITS : natural := log2(TLB_SIZE);
-    constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
-    constant TLB_PTE_BITS : natural := 64;
-
-    subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
-    type tlb_valids_t is array(tlb_index_t) of std_ulogic;
-    subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
-    type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
-    subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
-    type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
-
-    signal itlb_valids : tlb_valids_t;
-    signal itlb_tags : tlb_tags_t;
-    signal itlb_ptes : tlb_ptes_t;
-    attribute ram_style of itlb_tags : signal is "distributed";
-    attribute ram_style of itlb_ptes : signal is "distributed";
-
-    -- Privilege bit from PTE EAA field
-    signal eaa_priv  : std_ulogic;
-
    -- Cache reload state machine
    type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK);

@ -189,6 +158,7 @@ architecture rtl of icache is
 	-- Cache hit state (Latches for 1 cycle BRAM access)
 	hit_way   : way_sig_t;
 	hit_nia   : std_ulogic_vector(63 downto 0);
+        hit_ra    : real_addr_t;
 	hit_smark : std_ulogic;
 	hit_valid : std_ulogic;
        big_endian: std_ulogic;
@ -208,6 +178,9 @@ architecture rtl of icache is
        end_row_ix       : row_in_line_t;
        rows_valid       : row_per_line_valid_t;

+        stalled_hit      : std_ulogic;  -- remembers hit while stalled
+        stalled_way      : way_sig_t;
+
        -- TLB miss state
        fetch_failed     : std_ulogic;
    end record;
@ -226,9 +199,6 @@ architecture rtl of icache is
    signal req_raddr   : real_addr_t;

    signal real_addr     : real_addr_t;
-    signal ra_valid      : std_ulogic;
-    signal priv_fault    : std_ulogic;
-    signal access_ok     : std_ulogic;

    -- Cache RAM interface
    type cache_ram_out_t is array(way_t) of cache_row_t;
@ -240,14 +210,16 @@ architecture rtl of icache is
    signal plru_victim : way_sig_t;

    -- Memory write snoop signals
-    signal snoop_valid : std_ulogic;
-    signal snoop_index : index_sig_t;
-    signal snoop_hits  : cache_way_valids_t;
+    signal snoop_valid  : std_ulogic;
+    signal snoop_index  : index_sig_t;
+    signal snoop_tag    : cache_tag_t;
+    signal snoop_index2 : index_sig_t;
+    signal snoop_hits   : cache_way_valids_t;

    signal log_insn : std_ulogic_vector(35 downto 0);

    -- Return the cache line index (tag index) for an address
-    function get_index(addr: std_ulogic_vector) return index_sig_t is
+    function get_index(addr: real_addr_t) return index_sig_t is
    begin
        return unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS));
    end;
@ -321,29 +293,6 @@ architecture rtl of icache is
        return endian & addr(addr'left downto SET_SIZE_BITS);
    end;

-    -- Read a tag from a tag memory row
-    function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
-    begin
-	return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
-    end;
-
-    -- Write a tag to tag memory row
-    procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
-			tag: cache_tag_t) is
-    begin
-	tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
-    end;
-
-    -- Simple hash for direct-mapped TLB index
-    function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
-        variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
-    begin
-        hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
-                xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS)
-                xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS);
-        return hash;
-    end;
-
 begin

    -- byte-swap read data if big endian
@ -415,7 +364,9 @@ begin
 	signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal dout     : cache_row_t;
 	signal wr_sel   : std_ulogic_vector(0 downto 0);
+        signal ic_tags  : cache_tags_array_t;
    begin
+        -- Cache data RAMs, one per way
 	way: entity work.cache_ram
 	    generic map (
 		ROW_BITS => ROW_BITS,
@ -443,6 +394,49 @@ begin
 	    wr_addr <= std_ulogic_vector(r.store_row);
            wr_sel(0) <= do_write;
 	end process;
+
+        -- Cache tag RAMs, one per way, are read and written synchronously.
+        -- They are instantiated like this instead of trying to describe them as
+        -- a single array in order to avoid problems with writing a single way.
+        process(clk)  -- one instance per value of the enclosing generate index i; owns ic_tags, cache_tags_set(i), snoop_tags_set(i), tag_overwrite(i)
+            variable replace_way : way_sig_t;  -- way whose tag is written in CLR_TAG
+            variable snoop_addr : real_addr_t;  -- real address taken from wb_snoop_in.adr
+            variable next_raddr : real_addr_t;  -- real address of the next fetch, used for the tag read
+        begin
+            replace_way := to_unsigned(0, WAY_BITS);  -- default: way 0 (the only choice when NUM_WAYS = 1)
+            if NUM_WAYS > 1 then
+                -- Get victim way from plru
+                replace_way := plru_victim;  -- evaluated on every clk event, but only consumed inside the rising_edge branch below
+            end if;
+            if rising_edge(clk) then
+                -- Read this way's tag for the next fetch address (next_rpn & low MIN_LG_PGSZ bits of next_nia); outputs hold when the gate below is false
+                if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then
+                    next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0);
+                    cache_tags_set(i) <= ic_tags(to_integer(get_index(next_raddr)));  -- registered read: result is visible in the following cycle
+                    -- Check for simultaneous write to the same location: flag it so the tag read on this edge can be treated as stale
+                    tag_overwrite(i) <= '0';
+                    if r.state = CLR_TAG and r.store_index = get_index(next_raddr) and
+                        to_unsigned(i, WAY_BITS) = replace_way then
+                        tag_overwrite(i) <= '1';  -- later assignment in the same process run overrides the '0' above
+                    end if;
+                end if;
+
+                -- Second read port for snooping writes to memory (only updated while cyc, stb and we are all '1')
+                if (wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we) = '1' then
+                    snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
+                    snoop_tags_set(i) <= ic_tags(to_integer(get_index(snoop_addr)));  -- registered read at the snooped address's index
+                end if;
+
+                -- Write one tag when in CLR_TAG state, and only in the instance whose index i equals replace_way
+                if r.state = CLR_TAG and to_unsigned(i, WAY_BITS) = replace_way then
+                    ic_tags(to_integer(r.store_index)) <= r.store_tag;
+                end if;
+
+                if rst = '1' then  -- placed last so that reset wins over the overwrite check above
+                    tag_overwrite(i) <= '0';
+                end if;
+            end if;
+        end process;
    end generate;
    
    -- Generate PLRUs
@ -468,10 +462,10 @@ begin
        process(all)
        begin
            -- Read PLRU bits from array
-            if is_X(r.hit_nia) then
+            if is_X(r.hit_ra) then
                plru_cur <= (others => 'X');
            else
-                plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
+                plru_cur <= plru_ram(to_integer(get_index(r.hit_ra)));
            end if;

            -- PLRU interface
@ -484,92 +478,32 @@ begin
        begin
            if rising_edge(clk) then
                if r.hit_valid = '1' then
-                    assert not is_X(r.hit_nia) severity failure;
-                    plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
+                    assert not is_X(r.hit_ra) severity failure;
+                    plru_ram(to_integer(get_index(r.hit_ra))) <= plru_upd;
                end if;
            end if;
        end process;
    end generate;

-    -- TLB hit detection and real address generation
-    itlb_lookup : process(all)
-        variable pte : tlb_pte_t;
-        variable ttag : tlb_tag_t;
-	variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0);
-    begin
-        tlb_req_index := hash_ea(i_in.nia);
-	if is_X(tlb_req_index) then
-	    pte := (others => 'X');
-	    ttag := (others => 'X');
-	else
-	    pte := itlb_ptes(to_integer(unsigned(tlb_req_index)));
-	    ttag := itlb_tags(to_integer(unsigned(tlb_req_index)));
-	end if;
-        if i_in.virt_mode = '1' then
-            real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
-                         i_in.nia(TLB_LG_PGSZ - 1 downto 0);
-            if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
-		if is_X(tlb_req_index) then
-		    ra_valid <= 'X';
-		else
-		    ra_valid <= itlb_valids(to_integer(unsigned(tlb_req_index)));
-		end if;
-            else
-                ra_valid <= '0';
-            end if;
-            eaa_priv <= pte(3);
-        else
-            real_addr <= addr_to_real(i_in.nia);
-            ra_valid <= '1';
-            eaa_priv <= '1';
-        end if;
-
-        -- no IAMR, so no KUEP support for now
-        priv_fault <= eaa_priv and not i_in.priv_mode;
-        access_ok <= ra_valid and not priv_fault;
-    end process;
-
-    -- iTLB update
-    itlb_update: process(clk)
-	variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0);
-    begin
-        if rising_edge(clk) then
-            wr_index := hash_ea(m_in.addr);
-            if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
-                -- clear all valid bits
-                for i in tlb_index_t loop
-                    itlb_valids(i) <= '0';
-                end loop;
-            elsif m_in.tlbie = '1' then
-		assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE;
-                -- clear entry regardless of hit or miss
-                itlb_valids(to_integer(unsigned(wr_index))) <= '0';
-            elsif m_in.tlbld = '1' then
-		assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE;
-                itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
-                itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte;
-                itlb_valids(to_integer(unsigned(wr_index))) <= '1';
-            end if;
-            ev.itlb_miss_resolved <= m_in.tlbld and not rst;
-        end if;
-    end process;
-
    -- Cache hit detection, output to fetch2 and other misc logic
    icache_comb : process(all)
 	variable is_hit  : std_ulogic;
 	variable hit_way : way_sig_t;
        variable insn    : std_ulogic_vector(ICWORDLEN - 1 downto 0);
        variable icode   : insn_code;
+        variable ra      : real_addr_t;
    begin
 	-- Extract line, row and tag from request
-        req_index <= get_index(i_in.nia);
-        req_row <= get_row(i_in.nia);
-	req_tag <= get_tag(real_addr, i_in.big_endian);
+        ra := i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0);
+        real_addr <= ra;
+        req_index <= get_index(ra);
+        req_row <= get_row(ra);
+	req_tag <= get_tag(ra, i_in.big_endian);

 	-- Calculate address of beginning of cache row, will be
 	-- used for cache miss processing if needed
 	--
-	req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
+	req_raddr <= ra(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
 		     (ROW_OFF_BITS-1 downto 0 => '0');

 	-- Test if pending request is a hit on any way
@ -580,20 +514,27 @@ begin
        end if;
 	for i in way_t loop
 	    if i_in.req = '1' and
-                (cache_valids(to_integer(req_index))(i) = '1' or
-                 (r.state = WAIT_ACK and
-                  req_index = r.store_index and
-                  to_unsigned(i, WAY_BITS) = r.store_way and
-                  r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1')) then
-		if read_tag(i, cache_tags(to_integer(req_index))) = req_tag then
-		    hit_way := to_unsigned(i, WAY_BITS);
-		    is_hit := '1';
-		end if;
+                cache_valids(to_integer(req_index))(i) = '1' and
+                tag_overwrite(i) = '0' and
+                cache_tags_set(i) = req_tag then
+                hit_way := to_unsigned(i, WAY_BITS);
+                is_hit := '1';
 	    end if;
 	end loop;
+        if r.state = WAIT_ACK and r.store_valid = '1' and
+            req_index = r.store_index and
+            req_tag = r.store_tag and
+            r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1' then
+            is_hit := '1';
+            hit_way := r.store_way;
+        end if;
+        if r.stalled_hit = '1' then
+            is_hit := '1';
+            hit_way := r.stalled_way;
+        end if;

 	-- Generate the "hit" and "miss" signals for the synchronous blocks
-        if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then
+        if i_in.req = '1' and flush_in = '0' and rst = '0' then
            req_is_hit  <= is_hit;
            req_is_miss <= not is_hit;
        else
@ -610,19 +551,22 @@ begin
 	--       I prefer not to do just yet as it would force fetch2 to know about
 	--       some of the cache geometry information.
 	--
-        insn := (others => '0');
        icode := INSN_illegal;
-	if r.hit_valid = '1' then
-            assert not is_X(r.hit_way) severity failure;
+        if is_X(r.hit_way) then
+            insn := (others => 'X');
+        else
            insn := read_insn_word(r.hit_nia, cache_out(to_integer(r.hit_way)));
-            -- Currently we use only the top bit for indicating illegal
-            -- instructions because we know that insn_codes fit into 9 bits.
-            if is_X(insn) then
-                insn := (others => '0');
-            elsif insn(ICWORDLEN - 1) = '0' then
-                icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
-            end if;
-	end if;
+        end if;
+        assert not (r.hit_valid = '1' and is_X(r.hit_way)) severity failure;
+        -- Currently we use only the top bit for indicating illegal
+        -- instructions because we know that insn_codes fit into 9 bits.
+        if is_X(insn) then
+            insn := (others => '0');
+        elsif insn(ICWORDLEN - 1) = '0' then
+            icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
+            insn(31 downto 26) := recode_primary_opcode(icode);
+        end if;
+
        i_out.insn <= insn(31 downto 0);
        i_out.icode <= icode;
        log_insn <= insn;
@ -634,8 +578,8 @@ begin
        i_out.next_predicted <= r.predicted;
        i_out.next_pred_ntaken <= r.pred_ntaken;

-	-- Stall fetch1 if we have a miss on cache or TLB or a protection fault
-	stall_out <= not (is_hit and access_ok);
+	-- Stall fetch1 if we have a cache miss
+	stall_out <= i_in.req and not is_hit and not flush_in;

 	-- Wishbone requests output (from the cache miss reload machine)
 	wishbone_out <= r.wb;
@ -647,9 +591,17 @@ begin
        if rising_edge(clk) then
            -- keep outputs to fetch2 unchanged on a stall
            -- except that flush or reset sets valid to 0
-            if stall_in = '1' then
-                if rst = '1' or flush_in = '1' then
-                    r.hit_valid <= '0';
+            if rst = '1' or flush_in = '1' then
+                r.hit_valid <= '0';
+                r.stalled_hit <= '0';
+                r.stalled_way <= to_unsigned(0, WAY_BITS);
+            elsif stall_in = '1' then
+                if r.state = CLR_TAG then
+                    r.stalled_hit <= '0';
+                elsif req_is_hit = '1' then
+                    -- if we have a hit while stalled, remember it
+                    r.stalled_hit <= '1';
+                    r.stalled_way <= req_hit_way;
                end if;
            else
                -- On a hit, latch the request for the next cycle, when the BRAM data
@ -669,14 +621,17 @@ begin
                        " way:" & to_hstring(req_hit_way) &
                        " RA:" & to_hstring(real_addr);
                end if;
+                r.stalled_hit <= '0';
 	    end if;
            if stall_in = '0' then
                -- Send stop marks and NIA down regardless of validity
                r.hit_smark <= i_in.stop_mark;
                r.hit_nia <= i_in.nia;
+                r.hit_ra <= real_addr;
                r.big_endian <= i_in.big_endian;
                r.predicted <= i_in.predicted;
                r.pred_ntaken <= i_in.pred_ntaken;
+                r.fetch_failed <= i_in.fetch_fail and not flush_in;
            end if;
            if i_out.valid = '1' then
                assert not is_X(i_out.insn) severity failure;
@ -689,7 +644,6 @@ begin
 	variable tagset    : cache_tags_set_t;
        variable tag       : cache_tag_t;
        variable snoop_addr : real_addr_t;
-        variable snoop_tag : cache_tag_t;
        variable snoop_cache_tags : cache_tags_set_t;
        variable replace_way : way_sig_t;
    begin
@ -722,15 +676,14 @@ begin
                snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we;
                snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
                snoop_index <= get_index(snoop_addr);
-                snoop_tag := get_tag(snoop_addr, '0');
+                snoop_tag <= get_tag(snoop_addr, '0');
                snoop_hits <= (others => '0');
+
+                -- On the next cycle, match up tags with the snooped address
+                -- to see if any ways need to be invalidated
                if snoop_valid = '1' then
-                    if is_X(snoop_addr) then
-                        report "metavalue in snoop_addr" severity FAILURE;
-                    end if;
-                    snoop_cache_tags := cache_tags(to_integer(get_index(snoop_addr)));
                    for i in way_t loop
-                        tag := read_tag(i, snoop_cache_tags);
+                        tag := snoop_tags_set(i);
                        -- Ignore endian bit in comparison
                        tag(TAG_BITS - 1) := '0';
                        if tag = snoop_tag then
@ -738,6 +691,7 @@ begin
                        end if;
                    end loop;
                end if;
+                snoop_index2 <= snoop_index;

                -- Process cache invalidations
                if inval_in = '1' then
@ -746,12 +700,12 @@ begin
                    end loop;
                    r.store_valid <= '0';
                else
-                    -- Do invalidations from snooped stores to memory, one
-                    -- cycle after the address appears on wb_snoop_in.
+                    -- Do invalidations from snooped stores to memory,
+                    -- two cycles after the address appears on wb_snoop_in.
                    for i in way_t loop
                        if snoop_hits(i) = '1' then
-                            assert not is_X(snoop_index) severity failure;
-                            cache_valids(to_integer(snoop_index))(i) <= '0';
+                            assert not is_X(snoop_index2) severity failure;
+                            cache_valids(to_integer(snoop_index2))(i) <= '0';
                        end if;
                    end loop;
                end if;
@ -809,15 +763,6 @@ begin
                        assert not is_X(replace_way) severity failure;
                        cache_valids(to_integer(r.store_index))(to_integer(replace_way)) <= '0';

-			-- Store new tag in selected way
-			for i in 0 to NUM_WAYS-1 loop
-			    if to_unsigned(i, WAY_BITS) = replace_way then
-				tagset := cache_tags(to_integer(r.store_index));
-				write_tag(i, tagset, r.store_tag);
-				cache_tags(to_integer(r.store_index)) <= tagset;
-			    end if;
-			end loop;
-
                        r.state <= WAIT_ACK;
                    end if;

@ -879,13 +824,6 @@ begin
 		    end if;
 		end case;
 	    end if;
-
-            -- TLB miss and protection fault processing
-            if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
-                r.fetch_failed <= '0';
-            elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
-                r.fetch_failed <= '1';
-            end if;
 	end if;
    end process;

@ -915,8 +853,8 @@ begin
                            wstate &
                            std_ulogic_vector(resize(lway, 3)) &
                            req_is_hit & req_is_miss &
-                            access_ok &
-                            ra_valid;
+                            '1' & -- was access_ok
+                            '1';  -- was ra_valid
            end if;
        end process;
        log_out <= log_data;
--- a/icache_tb.vhdl
+++ b/icache_tb.vhdl
@ -15,8 +15,6 @@ architecture behave of icache_tb is
    signal i_out        : Fetch1ToIcacheType;
    signal i_in         : IcacheToDecode1Type;

-    signal m_out        : MmuToIcacheType;
-
    signal wb_bram_in   : wishbone_master_out;
    signal wb_bram_out  : wishbone_slave_out;

@ -32,7 +30,6 @@ begin
            rst => rst,
            i_in => i_out,
            i_out => i_in,
-            m_in => m_out,
            stall_in => '0',
            flush_in => '0',
            inval_in => '0',
@ -77,19 +74,21 @@ begin
        i_out.priv_mode <= '1';
        i_out.virt_mode <= '0';
        i_out.big_endian <= '0';
-
-        m_out.tlbld <= '0';
-        m_out.tlbie <= '0';
-        m_out.addr <= (others => '0');
-        m_out.pte <= (others => '0');
+        i_out.fetch_fail <= '0';
+        i_out.predicted <= '0';
+        i_out.pred_ntaken <= '0';

        wait until rising_edge(clk);
        wait until rising_edge(clk);
        wait until rising_edge(clk);
+
+        i_out.next_nia <= x"0000000000000004";
+        i_out.next_rpn <= (others => '0');
        wait until rising_edge(clk);

        i_out.req <= '1';
        i_out.nia <= x"0000000000000004";
+        i_out.rpn <= (others => '0');

        wait for 30*clk_period;
        wait until rising_edge(clk);
@ -102,6 +101,7 @@ begin
            severity failure;

        i_out.req <= '0';
+        i_out.next_nia <= x"0000000000000008";

        wait until rising_edge(clk);

@ -116,6 +116,8 @@ begin
            "=" & to_hstring(i_in.insn) &
            " expected 00000002"
            severity failure;
+
+        i_out.next_nia <= x"0000000000000040";
        wait until rising_edge(clk);

        -- another miss
@ -133,6 +135,9 @@ begin
            severity failure;

        -- test something that aliases
+        i_out.next_nia <= x"0000000000000100";
+        wait until rising_edge(clk);
+
        i_out.req <= '1';
        i_out.nia <= x"0000000000000100";
        wait until rising_edge(clk);
--- a/mmu.vhdl
+++ b/mmu.vhdl
@ -20,7 +20,7 @@ entity mmu is
        d_out : out MmuToDcacheType;
        d_in  : in DcacheToMmuType;

-        i_out : out MmuToIcacheType
+        i_out : out MmuToITLBType
        );
 end mmu;

--- a/predecode.vhdl
+++ b/predecode.vhdl
@ -38,8 +38,38 @@ architecture behaviour of predecoder is
        2#011100_00000# to 2#011100_11111# =>  INSN_andi_dot,
        2#011101_00000# to 2#011101_11111# =>  INSN_andis_dot,
        2#000000_00000#                    =>  INSN_attn,
-        2#010010_00000# to 2#010010_11111# =>  INSN_b,
-        2#010000_00000# to 2#010000_11111# =>  INSN_bc,
+        2#010010_00000# to 2#010010_00001# =>  INSN_brel,
+        2#010010_00010# to 2#010010_00011# =>  INSN_babs,
+        2#010010_00100# to 2#010010_00101# =>  INSN_brel,
+        2#010010_00110# to 2#010010_00111# =>  INSN_babs,
+        2#010010_01000# to 2#010010_01001# =>  INSN_brel,
+        2#010010_01010# to 2#010010_01011# =>  INSN_babs,
+        2#010010_01100# to 2#010010_01101# =>  INSN_brel,
+        2#010010_01110# to 2#010010_01111# =>  INSN_babs,
+        2#010010_10000# to 2#010010_10001# =>  INSN_brel,
+        2#010010_10010# to 2#010010_10011# =>  INSN_babs,
+        2#010010_10100# to 2#010010_10101# =>  INSN_brel,
+        2#010010_10110# to 2#010010_10111# =>  INSN_babs,
+        2#010010_11000# to 2#010010_11001# =>  INSN_brel,
+        2#010010_11010# to 2#010010_11011# =>  INSN_babs,
+        2#010010_11100# to 2#010010_11101# =>  INSN_brel,
+        2#010010_11110# to 2#010010_11111# =>  INSN_babs,
+        2#010000_00000# to 2#010000_00001# =>  INSN_bcrel,
+        2#010000_00010# to 2#010000_00011# =>  INSN_bcabs,
+        2#010000_00100# to 2#010000_00101# =>  INSN_bcrel,
+        2#010000_00110# to 2#010000_00111# =>  INSN_bcabs,
+        2#010000_01000# to 2#010000_01001# =>  INSN_bcrel,
+        2#010000_01010# to 2#010000_01011# =>  INSN_bcabs,
+        2#010000_01100# to 2#010000_01101# =>  INSN_bcrel,
+        2#010000_01110# to 2#010000_01111# =>  INSN_bcabs,
+        2#010000_10000# to 2#010000_10001# =>  INSN_bcrel,
+        2#010000_10010# to 2#010000_10011# =>  INSN_bcabs,
+        2#010000_10100# to 2#010000_10101# =>  INSN_bcrel,
+        2#010000_10110# to 2#010000_10111# =>  INSN_bcabs,
+        2#010000_11000# to 2#010000_11001# =>  INSN_bcrel,
+        2#010000_11010# to 2#010000_11011# =>  INSN_bcabs,
+        2#010000_11100# to 2#010000_11101# =>  INSN_bcrel,
+        2#010000_11110# to 2#010000_11111# =>  INSN_bcabs,
        2#001011_00000# to 2#001011_11111# =>  INSN_cmpi,
        2#001010_00000# to 2#001010_11111# =>  INSN_cmpli,
        2#100010_00000# to 2#100010_11111# =>  INSN_lbz,
@ -220,9 +250,9 @@ architecture behaviour of predecoder is
        2#0_11111_01001#  =>  INSN_divd, -- divdo
        2#0_01111_01011#  =>  INSN_divw,
        2#0_11111_01011#  =>  INSN_divw, -- divwo
-        2#0_11001_10110#  =>  INSN_nop, -- dss
-        2#0_01010_10110#  =>  INSN_nop, -- dst
-        2#0_01011_10110#  =>  INSN_nop, -- dstst
+        2#0_11001_10110#  =>  INSN_rnop, -- dss
+        2#0_01010_10110#  =>  INSN_rnop, -- dst
+        2#0_01011_10110#  =>  INSN_rnop, -- dstst
        2#0_11010_10110#  =>  INSN_eieio,
        2#0_01000_11100#  =>  INSN_eqv,
        2#0_11101_11010#  =>  INSN_extsb,
@ -322,14 +352,14 @@ architecture behaviour of predecoder is
        2#0_00011_01000#  =>  INSN_neg,
        2#0_10011_01000#  =>  INSN_neg, -- nego
        -- next 8 are reserved no-op instructions
-        2#0_10000_10010#  =>  INSN_nop,
-        2#0_10001_10010#  =>  INSN_nop,
-        2#0_10010_10010#  =>  INSN_nop,
-        2#0_10011_10010#  =>  INSN_nop,
-        2#0_10100_10010#  =>  INSN_nop,
-        2#0_10101_10010#  =>  INSN_nop,
-        2#0_10110_10010#  =>  INSN_nop,
-        2#0_10111_10010#  =>  INSN_nop,
+        2#0_10000_10010#  =>  INSN_rnop,
+        2#0_10001_10010#  =>  INSN_rnop,
+        2#0_10010_10010#  =>  INSN_rnop,
+        2#0_10011_10010#  =>  INSN_rnop,
+        2#0_10100_10010#  =>  INSN_rnop,
+        2#0_10101_10010#  =>  INSN_rnop,
+        2#0_10110_10010#  =>  INSN_rnop,
+        2#0_10111_10010#  =>  INSN_rnop,
        2#0_00011_11100#  =>  INSN_nor,
        2#0_01101_11100#  =>  INSN_or,
        2#0_01100_11100#  =>  INSN_orc,
--- a/writeback.vhdl
+++ b/writeback.vhdl
@ -160,34 +160,21 @@ begin
        end if;

        -- Outputs to fetch1
+        f.interrupt := intr;
+        f.intr_vec := std_ulogic_vector(to_unsigned(vec, 12));
        f.redirect := e_in.redirect;
+        f.redirect_nia := e_in.write_data;
        f.br_nia := e_in.last_nia;
-        f.br_last := e_in.br_last;
+        f.br_last := e_in.br_last and not intr;
        f.br_taken := e_in.br_taken;
-        if intr = '1' then
-            f.redirect := '1';
-            f.br_last := '0';
-            f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64));
-            f.virt_mode := '0';
-            f.priv_mode := '1';
-            -- XXX need an interrupt LE bit here, e.g. from LPCR
-            f.big_endian := '0';
-            f.mode_32bit := '0';
-        else
-            if e_in.abs_br = '1' then
-                f.redirect_nia := e_in.br_offset;
-            else
-                f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset));
-            end if;
-            -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1
-            f.virt_mode := e_in.redir_mode(3);
-            f.priv_mode := e_in.redir_mode(2);
-            f.big_endian := e_in.redir_mode(1);
-            f.mode_32bit := e_in.redir_mode(0);
-        end if;
+        -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1
+        f.virt_mode := e_in.redir_mode(3);
+        f.priv_mode := e_in.redir_mode(2);
+        f.big_endian := e_in.redir_mode(1);
+        f.mode_32bit := e_in.redir_mode(0);

        f_out <= f;
-        flush_out <= f_out.redirect;
+        flush_out <= f_out.redirect or intr;

        -- Register write data bypass to decode2
        wb_bypass.tag.tag <= complete_out.tag;
Author	SHA1	Message	Date
Paul Mackerras	fdcb6ec449	Merge pull request #422 from paulusmack/real-icache Icache improvements - use synchronous RAMs and remove 4kB per set limit	4 months ago
Paul Mackerras	73a2fcbc7f	icache_tb: Update for recent icache changes - Provide next_nia before clock edge where req is asserted - Set rpn and next_rpn to zero - There is no longer an input to the icache from the MMU Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	73b6004ac6	icache: Use next real address to index icache Now that we are translating the fetch effective address to real one cycle earlier, we can use the real address to index the icache array. This has the benefit that the set size can be larger than a page, enabling us to configure the icache to be larger without having to increase its associativity. Previously the set size was limited to the page size to avoid aliasing problems. Thus for example a 32kB icache would need to be 8-way associative, resulting in large numbers of LUTs being used for tag comparisons in FPGA implementations, and poor timing. With this change, a 32kB icache can be 1 or 2-way associative, which means deeper and narrower tag and data RAMs and fewer tag comparators. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	f9e5622327	Move iTLB from icache to fetch1 This moves the address translation step for instruction fetches one cycle earlier, so that it now happens in the fetch1 stage. There is now a 2-entry mini translation cache ("ERAT", or effective to real address translation cache) which operates on the output of the multiplexer that selects the instruction address for the next cycle. The ERAT consists of two effective address registers and two corresponding real address registers. They store the page number part of the addresses for a 4kB page size, which is the smallest page size supported by the architecture. If the effective address doesn't match either of the EA registers, and address translation is enabled, then i_out.req goes low for two cycles while the iTLB is looked up. Experimentally, this delay results in a 0.1% drop in coremark performance; allowing two cycles for the lookup results in better timing. The result from the iTLB is placed into the least recently used ERAT entry and then used to translate the address as normal. If address translation is not enabled then the EA is used directly as the real address. The iTLB structure is the same as it was before; direct mapped, indexed using a hashed EA. The "fetch failed" signal, which indicates a TLB miss or protection violation, is now generated in fetch1 and passed through icache. When it is asserted, fetch1 goes into a stalled state until a PTE arrives from the MMU (which gets put into both the iTLB and the ERAT), or an interrupt or redirect occurs. Any TLB invalidations from the MMU invalidate the whole ERAT. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	27c50bc311	Makefile: Remove overriding of ICACHE_NUM_LINES on ECP5 platforms Now that the icache tag RAM is accessed synchronously, the free tools recognize it as block RAM on ECP5-based platforms; thus we no longer need to force it to a very small value. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	963c225955	icache: Read icache tag RAM synchronously This uses the next_nia provided to us by fetch1 to enable the icache tag RAM to be read synchronously (using a clock edge), which should enable block RAMs to be used on FPGAs rather than LUT RAM or flip-flops. We define a separate RAM per way to avoid any problems with the tools trying to inference byte write enables for writing to a single way. Since next_nia can move on, we only get one shot at reading the cache tag RAM entry for the current access. If it is a miss, then the state machine will read the cache line from RAM, and we can consider the access to be a hit once the state machine has brought in the doubleword we need. The TLB hit/miss check has been modified to check r.store_tag rather than the tag read from the tag RAM for this case. However, it is also possible that stall_in will be asserted for the whole time until the cache line refill is completed. To handle this case, we remember (in r.stalled_hit) that we detected a hit while stalled, and use that hit once stall_in is deasserted. This avoids doing an unnecessary second reload of the same cache line. The r.stalled_hit flag gets cleared in CLR_TAG state since that is when cache tags can be overwritten, meaning that a previously detected hit might no longer be valid. There is also the case where the tag read from the tag RAM is the one we are looking for, and is the same index as the line that is starting to be reloaded by the state machine. If the icache gets stalled for long enough that the line reload finishes, it would then be possible for the access to be detected as a hit even though the cache line has been overwritten. To counter this, we detect the case where the cache tag RAM entry being read is the same as the entry being written and set a 'tag_overwrite' flag bit to indicate that one of the tags in cache_tags_set is no longer valid. For snooping writes to memory, we have a second read port on the cache tag RAM. 
These tags are also read synchronously, so the logic for clearing cache line valid bits on a snoop has been adjusted (the tag comparisons and valid bit clearing now happen in the same cycle). This also simplifies the expression for 'insn' by removing a dependency on r.hit_valid, fixes the instruction value sent to the log, and deasserts stall_out when flush_in is true. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	723008b8c2	icache: Read iTLB using early next NIA from fetch1 Using i_in.next_nia means that we can read the iTLB RAM arrays synchronously rather than asynchronously, which gives more opportunity for using block RAMs in FPGA implementations. The reading is gated by the stall signals because the next_nia can advance when stalled, but we need the iTLB entry for the instruction that i_in.nia points to. If we are stalled because of an iTLB miss, that means we don't see the new iTLB entry when it is written. Instead we save the new entry directly when it arrives and use it instead of the values read from the iTLB RAM. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	f34a54d295	fetch1: Streamline next NIA generation further This reduces the number of possible sources for the next NIA from 4 down to 3, by routing interrupt vector addresses through the r_int.next_nia register, as is already done for reset. This adds one extra cycle of latency when taking interrupts. During this extra cycle, i_out.req is 0. Writeback now no longer combines redirects (branches, rfid, isync) with interrupts; they are presented separately to fetch1. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	e92d49375f	fetch1: Reorganize fetch1 to provide an asynchronous early next NIA to icache This adds a next_nia field to the Fetch1ToIcacheType record, which provides an indication of what will be in the nia field on the next non-stalled cycle. This is intended to be as fast as possible, being a selection from two redirect addresses (from writeback and decode1) or an internal register (r_int.next_nia). Reset addresses and predicted branch targets come through this internal register. The rearrangement here has the side effect that we can now use the BTC on the first instruction after a taken branch, whereas previously the BTC was only active starting with the second instruction after a taken branch. This provides a slight improvement in performance. This also fixes a buglet in icache where it would assert its stall output when i_in.req was false. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	2dceb28830	Improve timing of redirect_nia going from decode1 to fetch1 This moves the addition that computes the branch target address for statically predicted taken branches before a clock edge, so the redirect_nia signal going to fetch1 comes from a clean latch. The address generation logic is also simplified somewhat, and conditional absolute branches to negative addresses are no longer predicted taken (this should have no impact on performance as such branches are basically never used). Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	1c4b5def36	Improve timing of redirect_nia going from writeback to fetch1 This gets rid of the adder in writeback that computes redirect_nia. Instead, the main adder in the ALU is used to compute the branch target for relative branches. We now decode b and bc differently depending on the AA field, generating INSN_brel, INSN_babs, INSN_bcrel or INSN_bcabs as appropriate. Each one has a separate entry in the decode table in decode1; the *rel versions use CIA as the A input. The bclr/bcctr/bctar and rfid instructions now select ramspr_result for the main result mux to get the redirect address into ex1.e.write_data. For branches which are predicted taken but not actually taken, we need to redirect to the following instruction. We also need to do that for isync. We do this in the execute2 stage since whether or not to do it depends on the branch result. The next_nia computation is moved to the execute2 stage and comes in via a new leg on the secondary result multiplexer, making next_nia available ultimately in ex2.e.write_data. This also means that the next_nia leg of the primary result multiplexer is gone. Incrementing last_nia by 4 for sc (so that SRR0 points to the following instruction) is also moved to execute2. Writing CIA+4 to LR was previously done through the main result multiplexer. Now it comes in explicitly in the ramspr write logic. Overall this removes the br_offset and abs_br fields and the logic to add br_offset and next_nia, and one leg of the primary result multiplexer, at the cost of a few extra control signals between execute1 and execute2 and some multiplexing for the ramspr write side and an extra input on the secondary result multiplexer. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago
Paul Mackerras	06ff486567	icache: Restore primary opcode to instruction word The icache stores a predecoded insn_code value for each instruction, and so as to fit in 36 bits, omits the primary opcode (the most significant 6 bits) of each instruction. Previously, for valid instructions, the primary opcode field of the instruction delivered to decode1 was a part-representation of the insn_code value rather than the actual primary opcode. This adds a lookup table to compute the primary opcode from the insn_code and deliver it in the instruction words supplied to decode1. In order that each insn_code can be associated with a single primary opcode value, the various no-operation instructions with primary opcode 31 (the reserved no-ops and dss, dst and dstst) have been given a new insn_code, INSN_rnop, leaving INSN_nop for the preferred no-op (ori r0,r0,0). Signed-off-by: Paul Mackerras <paulus@ozlabs.org>	8 months ago