From 12a3d7621771f6da70219fbdf3605f2ba8fff943 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 18 Aug 2023 19:29:10 +1000 Subject: [PATCH 01/21] Implement hrfid and make MSR[HV] always 1 Implementations without hypervisor/LPAR support are permitted by the architecture, but should have MSR[HV] forced to be 1 at all times, not 0, and should implement various instructions and registers that are only accessible in hypervisor mode. This commit implements MSR[HV] as a constant 1 bit and adds the hrfid instruction, which behaves exactly the same as rfid except that it reads HSRR0/1 instead of SRR0/1. We already have HSRR0/1 and HSPRG0/1 implemented. When HV=1, Linux expects external interrupts to arrive as hypervisor interrupts, so this adds support for hypervisor interrupts (i.e., those that set HSRR0/1) and makes the external interrupt be a hypervisor interrupt. (If we had an LPCR register, the LPES bit would control this, but we don't.) The xics test is updated to read HSRR0/1 after an external interrupt. 
Signed-off-by: Paul Mackerras --- common.vhdl | 10 +++++++--- decode2.vhdl | 11 +++++++++-- execute1.vhdl | 19 +++++++++++++++---- predecode.vhdl | 1 + tests/mmu/mmu.c | 39 ++++++++++++++++++++------------------- tests/prefix/prefix.c | 5 +++-- tests/test_mmu.bin | Bin 24608 -> 24608 bytes tests/test_prefix.bin | Bin 12320 -> 12320 bytes tests/test_xics.bin | Bin 12392 -> 12392 bytes tests/xics/head.S | 4 ++-- writeback.vhdl | 4 ++++ 11 files changed, 61 insertions(+), 32 deletions(-) diff --git a/common.vhdl b/common.vhdl index eefa2fd..64fb755 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,6 +12,7 @@ package common is -- MSR bit numbers constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode + constant MSR_HV : integer := (63 - 3); -- Hypervisor mode (always 1) constant MSR_EE : integer := (63 - 48); -- External interrupt Enable constant MSR_PR : integer := (63 - 49); -- PRoblem state constant MSR_FP : integer := (63 - 50); -- Floating Point available @@ -662,6 +663,7 @@ package common is write_xerc_enable : std_ulogic; xerc : xer_common_t; interrupt : std_ulogic; + hv_intr : std_ulogic; intr_vec : intr_vector_t; redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); @@ -678,7 +680,8 @@ package common is write_xerc_enable => '0', xerc => xerc_init, write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), - interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", + interrupt => '0', hv_intr => '0', intr_vec => 0, + redirect => '0', redir_mode => "0000", last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0'), msr => (others => '0')); @@ -795,8 +798,9 @@ package common is write_cr_data => (others => '0')); type WritebackToExecute1Type is record - intr : std_ulogic; - srr1 : std_ulogic_vector(15 downto 0); + intr : std_ulogic; + hv_intr : std_ulogic; + srr1 : std_ulogic_vector(15 downto 0); end record; type 
WritebackEventType is record diff --git a/decode2.vhdl b/decode2.vhdl index a68bc8b..74809f5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -539,8 +539,15 @@ begin v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd; v.e.spr_is_ram := d_in.ram_spr.valid; when OP_RFID => - v.e.ramspr_even_rdaddr := RAMSPR_SRR0; - v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + if d_in.insn(9) = '0' then + -- rfid + v.e.ramspr_even_rdaddr := RAMSPR_SRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + else + -- hrfid + v.e.ramspr_even_rdaddr := RAMSPR_HSRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_HSRR1; + end if; sprs_busy := '1'; when others => end case; diff --git a/execute1.vhdl b/execute1.vhdl index cf73de5..34be583 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -322,6 +322,7 @@ architecture behaviour of execute1 is -- 48:63, and partial function MSR bits lie in the range -- 33:36 and 42:47. (Note this is IBM bit numbering). msr_out := (others => '0'); + msr_out(MSR_HV) := '1'; -- HV is always set msr_out(63 downto 31) := msr(63 downto 31); msr_out(26 downto 22) := msr(26 downto 22); msr_out(15 downto 0) := msr(15 downto 0); @@ -332,6 +333,9 @@ architecture behaviour of execute1 is return std_ulogic_vector is variable srr1: std_ulogic_vector(63 downto 0); begin + srr1(63 downto 61) := msr(63 downto 61); + srr1(MSR_HV) := '1'; + srr1(59 downto 31) := msr(59 downto 31); srr1(63 downto 31) := msr(63 downto 31); srr1(30 downto 27) := flags(14 downto 11); srr1(26 downto 22) := msr(26 downto 22); @@ -533,7 +537,11 @@ begin even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr; odd_wr_enab := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr; if interrupt_in.intr = '1' then - wr_addr := RAMSPR_SRR0; + if interrupt_in.hv_intr = '0' then + wr_addr := RAMSPR_SRR0; + else + wr_addr := RAMSPR_HSRR0; + end if; else wr_addr := ex1.ramspr_wraddr; end if; @@ -610,8 +618,8 @@ begin ex1 <= reg_stage1_type_init; ex2 <= reg_stage2_type_init; ctrl <= ctrl_t_init; - ctrl.msr <= 
(MSR_SF => '1', MSR_LE => '1', others => '0'); - ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); + ctrl.msr <= (MSR_SF => '1', MSR_HV => '1', MSR_LE => '1', others => '0'); + ex1.msr <= (MSR_SF => '1', MSR_HV => '1', MSR_LE => '1', others => '0'); else ex1 <= ex1in; ex2 <= ex2in; @@ -1166,7 +1174,9 @@ begin not srr1(MSR_LE) & not srr1(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. - v.new_msr(63 downto 31) := srr1(63 downto 31); + v.new_msr(63 downto 61) := srr1(63 downto 61); + v.new_msr(MSR_HV) := '1'; + v.new_msr(59 downto 31) := srr1(59 downto 31); v.new_msr(26 downto 22) := srr1(26 downto 22); v.new_msr(15 downto 0) := srr1(15 downto 0); if srr1(MSR_PR) = '1' then @@ -1474,6 +1484,7 @@ begin v.e.intr_vec := 16#500#; report "IRQ valid: External"; v.ext_interrupt := '1'; + v.e.hv_intr := '1'; end if; v.e.srr1 := (others => '0'); exception := '1'; diff --git a/predecode.vhdl b/predecode.vhdl index d3ca015..858910c 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -447,6 +447,7 @@ architecture behaviour of predecoder is 2#1_00100_11110# => INSN_isync, 2#1_00000_10000# => INSN_mcrf, 2#1_00000_11010# => INSN_rfid, + 2#1_01000_11010# => INSN_rfid, -- hrfid -- Major opcode 59 -- Address bits are 1, insn(10..6), 1, 0, insn(3..1) diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 64afa44..ff6a582 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -7,6 +7,7 @@ #define MSR_LE 0x1 #define MSR_DR 0x10 #define MSR_IR 0x20 +#define MSR_HV 0x1000000000000000ul #define MSR_SF 0x8000000000000000ul extern int test_read(long *addr, long *ret, long init); @@ -450,11 +451,11 @@ int mmu_test_11(void) unsigned long ptr = 0x523000; /* this should fail */ - if (test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | 
MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 2; return 0; } @@ -468,12 +469,12 @@ int mmu_test_12(void) /* create PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should succeed and be a cache miss */ - if (!test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* create a second PTE */ map((void *)ptr2, (void *)mem, PERM_EX | REF); /* this should succeed and be a cache hit */ - if (!test_exec(0, ptr2, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(0, ptr2, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 2; return 0; } @@ -487,18 +488,18 @@ int mmu_test_13(void) /* create a PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should succeed */ - if (!test_exec(1, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(1, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* invalidate the PTE */ unmap((void *)ptr); /* install a second PTE */ map((void *)ptr2, (void *)mem, PERM_EX | REF); /* this should fail */ - if (test_exec(1, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(1, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 2; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 3; return 0; } @@ -513,16 +514,16 @@ int mmu_test_14(void) /* create a PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should fail due to second page not being mapped */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr2 || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 2; /* create a PTE for the second page */ map((void *)ptr2, (void *)mem2, PERM_EX | REF); /* this should succeed */ - if 
(!test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 3; return 0; } @@ -535,11 +536,11 @@ int mmu_test_15(void) /* create a PTE without execute permission */ map((void *)ptr, (void *)mem, DFLT_PERM); /* this should fail */ - if (test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | MSR_LE)) return 2; return 0; } @@ -556,16 +557,16 @@ int mmu_test_16(void) /* create a PTE for the second page without execute permission */ map((void *)ptr2, (void *)mem2, PERM_RD | REF); /* this should fail due to second page being no-execute */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr2 || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | MSR_LE)) return 2; /* create a PTE for the second page with execute permission */ map((void *)ptr2, (void *)mem2, PERM_RD | PERM_EX | REF); /* this should succeed */ - if (!test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 3; return 0; } @@ -578,22 +579,22 @@ int mmu_test_17(void) /* create a PTE without the ref bit set */ map((void *)ptr, (void *)mem, PERM_EX); /* this should fail */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x00040000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x00040000 | MSR_IR | MSR_LE)) return 2; /* create a PTE without ref or execute permission */ unmap((void 
*)ptr); map((void *)ptr, (void *)mem, 0); /* this should fail */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ /* RC update fail bit should not be set */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | MSR_LE)) return 2; return 0; } diff --git a/tests/prefix/prefix.c b/tests/prefix/prefix.c index 94ac500..d594037 100644 --- a/tests/prefix/prefix.c +++ b/tests/prefix/prefix.c @@ -7,6 +7,7 @@ #define MSR_LE 0x1 #define MSR_DR 0x10 #define MSR_IR 0x20 +#define MSR_HV 0x1000000000000000ul #define MSR_SF 0x8000000000000000ul #define DSISR 18 @@ -103,7 +104,7 @@ long int prefix_test_2(void) return 1; if (mfspr(SRR0) != (unsigned long)&test_paddi_mis + 8) return 2; - if (mfspr(SRR1) != (MSR_SF | MSR_LE | (1ul << (63 - 35)) | (1ul << (63 - 34)))) + if (mfspr(SRR1) != (MSR_SF | MSR_HV | MSR_LE | (1ul << (63 - 35)) | (1ul << (63 - 34)))) return 3; ret = trapit((long)&x, test_plfd); @@ -111,7 +112,7 @@ long int prefix_test_2(void) return ret; if (mfspr(SRR0) != (unsigned long)&test_plfd + 8) return 6; - if (mfspr(SRR1) != (MSR_SF | MSR_LE | (1ul << (63 - 34)))) + if (mfspr(SRR1) != (MSR_SF | MSR_HV | MSR_LE | (1ul << (63 - 34)))) return 7; return 0; } diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 1ade44ee74b94b62ae2714ae7a476a7d6bf9ed58..6e352a773ba39b843f5e1bc1aaa9abb2beaa0d21 100755 GIT binary patch literal 24608 zcmeHPe{59Ol|D0L?6EP%81L2z+42mIN959g0lJ+@c3WOvZ z8vYo^?`6hxwkj*6C9ABIP#Vk%kJzY0u9QDg6(@!emh^{YwAG4q$;1?=vM~*IXj&6P z?|$dL_m~+Q8yebmSA8Rm@4NS&d+s^kJ?GqWul-D<))R%=PUVC;P;QB7p*A87Z9CAm z18qCDP}`2SmdBnlR@07}Z=TuFQB1o73D0)p(k9YX3e`*qY9LbTh)T6nIi(Sl?R2PA zBO?E}a7_@!z;7@3?FGNRTZ$jt+m`U`F-o&y50e{IM8O0pMj@5AwY$iOEcv<)8T>?v zVHe7Fmtqhdo=_XP=of0p?o&xt@Q{fVU=g^q=b$nA`cC_5bt6-&y~;UH>_r 
zeNOlOIo4fH@7VwBt8goP{@DGqOLynJe{A{hl>eP;cqycLi9s^N`JV6PXh3yNz{_LCR&xQ7&cs2ZQ!0^`~-t z9i;lUl#;J45(*hYRIDg`)MwOreH#=@Ch#9lc)WVr-%Gh-5Hejh#!rX~wJggS$l{{t zUdAOKb5(&)|J@XPGbgMc`jXU78x#^hR&|=5W8(NE^NTEbQO5$^;Q6=L-jA-z@omVZ z+{$w5Qa6#WhB3P)L~WDB_nKH*c~+Qn1~F$9XwIe2g+J^_mpgqfA)c(u!+b|}ge9h| z$GsT)-;=+KR#DWSM`9`K@Zn#YI<$;MvCd(vTUdYJYP%2X(Purg-Y4%$TTk=vVPG$1 zl?v@%J3wpjUP)+YH85IDW#yI7Y4@3bjz}BvG#Qs3;de}Y@*R`*>^+edWshDQ3hVul zNo^_9Bg^-|e!A4i`X<9gBY0;-#ozId=x2r6@W!Jaw1sMZozk7Lo~ApRG}HD;^Tw8q z6vi4xcn!Unm-R1Ss~Ux|Cx@vp=D#1jm?rUj6?iK(iTPD{H-$7(V*V1)Fjj@=FOfWa zHFsX2jUC0m@L_xm@8^)#0zKlfWnC_6 zf(`nNvZcO`c&yw-9R-IzDQ~s8#wA;3wyIX?5vycgT6V{Y7Z!IhZHU!#sHo=D*hB&9RI9 ze6!?zel&~gW2^Ot~I5lzwOFnQ*T&s3G*s7u|2! zud)#bI9B);Q?RTw`_?O^A7<=XvMl=B;Dzbf16Dt(u1FEX;M zHHA&9jCJJ7yHUp5#+Y$kVa>tugZDh%2NUk~27$ld=HoVPgL4-4Ga9bY3*jdmBfNNb zlKq5ZG3Nnl5e3=)$5MNqE9S|x5stalMg6N~ez7qbX3Q89#_QPrwco|%j<||X;CF@Dhk!8V#e@FJyG%?+NzVcos2EVWU zkk?%Ine2yg8$W5zF`NBR-tF5D0f#@7{bbExKY#VxTieh0yP4SC%6>*H`rEZ18o$<9Dl4PbKERYnFMoq|Jjkr+OeC$2m7j&Yj=P zxe(p8AZ=Zy?18*4y#GyIm!4nGye{L%P2KE$nq}p4<4VMEX;U(`+wmt&zrsJvxpB5N z`PCKC%>C?7e9qu}?gO;3jooaW?zMFJz1GRn!(OL()?7PhnWwKmWe+p0(@*BlyiRuB z&ueAp{I(BW{nU)7-bUnz);#vwU|$T}EN6d@$H12gG3NR^rfgr-PpXTQ`p8CK8}AXE z12gu=_9F*|pN}PdZ9mAPpb*`)d@f=CQL_3rMMe)tiR!P`Lk`~WdF;*hx_V1CULV9Y z*cs;#@FA6bXqEn$Y(s98hc@_45Ov0g`LoY)9`V7rImgtTg7<_E-=2A0%zg1fv;^{# z`N_x+&?L{E6T6;jt!T-`*|uDs)5}|NHpIEm-?y!m&w_+pZzwAqsb~rs?RTq&GUXle z^z*EJzrdIj1Cvh{_RDh~?L!`b_(LU0ob!^%)66);m|px@GR*f-oOAR2(*W+DK9+y0 zqSP>tZ7<<`V%G1$qCTpRfmW6fiE&byEw@pC#4!x}IB2DzA> ztMfU`j0cO2V#I@Ohs?XIV!2MKdo1`sfcH+0Rh+ZfImk(`nWJUoWK(jHYRGb957?L5 zOU5rlC-`}_9gl$-$76m!r+$jQ$VUUrhcLbd=hqCL(79(8UgO77Hp6)+>(22HxqUtD zGxb$u+Y;QFskl?X{|iw=xHB{RSa;UhjMGYuV&$Xo=i9nBld^Iy&wHt*Ki|EKe{ysE zc|WR8lNoCfU`)*YA*{FN;D3s^KaW z7ot~jmg5|k<>C5vOAgM1c~56PybeG3VG6ex#wOhN88qS3Uy;wXDXa=jtRm}txL@2} zRoE2P>%8IC7RW;8XWAX$hHMJ5FQ8j9zk{{$*ooS2$Ns&RXjK6@f+9AnzI zB(128LGC{E^WNU&ma$38qc7v`v`bx#5;0%$?g#z9fiB^GUjG0k^Ei#RF4^`C{XMH~ 
z5N()1_7C8>P4>T__gMXB(Y98$b?QG!$7YIO+WXOGmP|i|{zB}oV_Tz^xc@l(jO%O< z&$Y(=-?!=>)E|2??muMJvrzBf5cj`i)#ss}zbWqTvg+BWpG1A9RnI}a_p!LY&8mA* zAO8Ee|52-+i~88(aeu(7=b>J{IqqL!)#syr81j``^?cN;o{0MwS@i{|>*2WHWz`E% zA42~)?r2z_g{YtYYTSR`s^i|pANg9`f5xgW;(9yux9ZsI{Lg97->NT0J--e5qt3B_ z?Q=KoCvO!Gn65XrY!8=5aEC~^=dOa^5><>8<39F#S){-Iu!6f+-_A>d+n*YiZC}>( zRoUTIb04a}an7W@n4(pg){DDproRaKG@rn9DWAaOG#PM*F-->Lh71)WQiew~>-Phh z^Wz?wWmQ*YZ*Rr_Sdz3CHz~93ne30|-Dvvn6F&ME_OKdncZrk}hkt3PqRZE z&S7C1`ODx9xb0tYFCCW{^Sh`@ z4Q$|59+Qv0m{72vc;#HLKd2Z-aCdFut;%QjiBJb(xM>@Ym~9^9C;3y_Y@2x9qR9ab z-|^eLtQx2GL^pB!{l>HMlGG5!Un`!u&`|McG zZH1VZKdu+nQC1}2T9S{IL%jswBNv1?jFK9219Z^URG=;jytt`=8a~PnV9lux-|`_L z4lqq~fr8!*1+*p`fKeH~oAEu2e)-S2Ode|rszBA4S3m(UnUC?HiQ{`3bWfwC`!UyR z?t`vIN%@eZOiEJ8vlk@ZkA=|FX&;cZEzA_8xc(qIi|=O8#h4Bsmgf?_r_uH@H?Taf zxf>3;%)t;`4z5C1v5TnsmqK7eH025G&;HK516kkouwanCp`hjK1=RFW^22lg^YiQg z^A2LnFvhf?qz5HluepN~F}KQ)K`^F!4*iA0z=Y{SmaY#-x)&`nQqc7Z`m0BTcni-K z{>_r=@f7~YKyw;2tQUnO{!iKXSEA`0`d_{%gdZh6fVl!UG+0y6VgOo*^#5eL literal 24608 zcmeHPeQZ=$c0V&V_V8tlLt1ZGZJvXpdR(Ns)M>{t#xusS{(xi_93a_r^PI86R~nIq zmBwKv9@xn$u8;~HMcZj*^gOl4ydoF(iIYuMZ0 z@4ol!F?>+Z?{O-Bu+hdCrK7_a+6~znSl(luAqajmuu!%I4JjIsv_Hn@ zF7L{Cff!X%=5Lm|(iq+uK;4SC1W|`_KHee7!fy#8z>c|6=f(3EO7o>{ zo`eT~GIW$gC-gcPT_K5uMRL%TvZTp(1*qG3=;pskC;qm3MTQ=R*5li8Nf=dUa{X-* z^tTp?r6YBsd`BnPL%#i8S-x1t@1#5lDwd%WaZ~D(O6qVHI(VdgJ29C7S!LjH>^`aM z_(Q2fH?wa7+lJK1Ve*R3eF+%$j_~{Q>+iKM%Jn}sRi@6LFI$7F#b0qWcV7G3b%nR;i+Q|jr3(;)L7H%2s;wBtea{g2^ax0gwK zV4A4=kLE^VC;z~*BRtd|E0aiob%*V`hk@OG`k`QslSjN$zn z|NL6ursYF7f&7ENw&o)H3v`f#z`uQMQ^nUj=V$(Sba6vm>%;bZe}yC-xaPC;DChxHx%{&&-PtGaub$PDJ9ob568%&D4`R zG9SJ=Y>n>_=JJiVIF^aP;Z)M3&LZ`@Qm8+Zn~Yw5GucAjF{gJOM_td%r0K=;Zain< z9z_hx#(fp;Q*bZGJqPy!+;eeX|E@I-w0Zt`cGisM{x6efIj3?E0~ja#_efoNL1(p6 zy_Gsnt~wDhveoJ{xhJ;J=a26`vG#B(6&XsMh|u4o!+q^Dds1sf-a@=cMRxv5HIiTS z$~$^3L!R+}9qZ#e^w@<)d_1A;`uBF&T%~kpCqu|N^!t9?#kxF+ ztg-Qyv9`RRyIQX+R;41ui+B*L#vdbt?`WQh=(nh zhvt$qV{xaJ)_=q|vHuV|k1b<7|B-2T_MaKB{WtAD 
z+2i<+|Ce|6pDCxu;&vzhx$(9|e<%OB1&iDGPvNY)_n!su9}oS9YcBYZ6JwYB0{UQI zvH*FG>jeoWO}#h4dU%(utFsoVga0l-F3jXJ#p&<4wg2OGf97b$R``FA{?5G#V*zxy ztNlq?rtY1V?p)(`T(xvBb9Berfx0t3e|XBlxgCp6&eIpM<~OS32=`0#V$bTj+wY~q zXwx32Uq&U&9?SuqBKA1=O=remn$1gue$_tFDk^S&0f3s5E$dmpyBRhpW>2IZeLn(jrm^g>E zGUI^AqH7`O!<p zeg1QcKc4$D*A&bNpT2qP80q|oIfRY-_;V;DKMMA5+&iW>Z)hqFPsQH0MDNo}ny@#- zzA$iNT@&|$g7scsb|hL_Ul(t=FBms&dUrf?KkML0T;>3m&t`S&y`JpAngB6K3RBqY zrLbPJ&N{f?yZC-8!t*DtyLs-^gL9_~`Zp>@1$nI7i9L#a{|3Ae0tM+on?j4NX)v=@= zyV2Z~jlHaM&bJP2%r7&sH|5%vI&aH8DpiazmhoadYq<%f03 zI!iP&*XB3ntkxWAEekpJd{a5$kx??c|;twobHNscj4CQxZ-F-oadAnLg0clnlIK zmw9F#emWW8c^liaQ9k-uGVl*}c?$clP6l4K%Q+|?L;1UQITz*5HOWA$UG|~;{^QBO zV|IBe%Dqn{1551kG?eE&nG8H+m#3qA0Qx;>muH}C{ADt5uU(#r@}@{KFvTwCp?nVQ zSK;f_cNWUWzmp7Hu**1G3A8+w4E)qC&qjG&3+%Vc$Y+5kP1tXj??L%G$_G(4v`>op z1Drq7A0|2{r+*(wJb*QpzLqVqA3kE>eAWN_CCnd@ z4gI?A8)j@#PNd1oHKo|PS+o~NX@jKo;XIo37eU`MI`$AbIw$aWTRm`Qaf=?th#sW_ zT8~Ff`+I<~I^evR`UV%}tZ%~qP|~y)R~r-N#?f=o%bo&l&{#kCTI>TdEA4598CO zaa$^J=|&9A@wLGw)cdV^;7lwEi4od^H?>JOas~wRb0qOG)~*fc+t7w_zV>zI6O{jM zr7xbHE^T{zRzq7t$8N0E&~FCXME+{M! 
zWX4EcP}4cS@xw>0axQ3io^SE8*DJJ9zPU>&UqzvjVJG;Gx1OsyJWHCozos8Vn&^)TIEf?**7z?(0C1L_ot=Gj7zXC`5+T8lWW&_XR(8`rF zr->{L)h}b&s12*Q6nE10sL=2^)jZs*whs5J{rbF}{>GR` z`_Mj!_Cd4{a^8YQ$8aZ}lZMZ1bK*Pe@*$U(=fxG09SwQr&hSVv%7yqG`$VY&Y^fnl?=3Kq|EB z1AI)-B}j*hb`9b46Vz#aY1dwF%|4IS`7yh58E6VU_(x~|tyCYLwLYQUIsY_ePxe1& zQ8DOWlNWv_PwGBOefYtD{VWH*f`2-OKKKMpu>>4%IgEIzjZ0SS$_KQlD zQ{Ga@3yoN@I4}Gnm_fApOX}So@ZN_Ty;3gmyhf4SM|dd7jYr1U~0bH=nxUVjbha&^4u^tZ*+E0T%%m0T%%m0T%%m0T%%m d0T%%m0T%%mf&Whk1nmuOnSB@gt`8VQ{s)Vl;%EQ> diff --git a/tests/test_prefix.bin b/tests/test_prefix.bin index a5f9ff7c9b6deb6cc20d68be124c5ca137c0cfbf..8690be78d4cf9fe71c2d425ac617fee2fa92e122 100755 GIT binary patch delta 20 ccmZ3GupnW>RAt5qlcy;MGfvq2O4*DD09`c*EC2ui delta 20 ccmZ3GupnW>RAt77$k7Ml}J+3T^lm}&Ye5n z@!B9&YUN7omB#npbI(2JeCM3|S`!&9L?=R5%1;cKbYh=ro(PS*PsV5@PsjK^w{ar0 zS21ElYBP~nzf$gvpx#SuUY#5=nD_aoSjMX{Z;7#qT*ReQ{4M?|df@-h1B;!TMg5DI z6D6KUes5ar{9DB63)MyZiyeW*)xX&Bf7JCaZuEBwQnJ+)%+^pS ztI|-mj>fW$bUEv%sjNmyu9LjE9t!6AD3sG_D7T-+a);=0E<#f|gOusx2owTeTgnle@Ii)2~uW0BJa#31!t~OXyz6T%_M1T<_=w+$xut3~*$CBcpjj$#M67*v@r7Y!4;X&wdKqxjzVuT)!Btp-__TTuroL zv4j7p>z_1{lO}S~L{6H>NfS9~PR&$fJ*Z@C!gvPb8H{H@OM)je?)F5%mjPdfdAQAE z+~+Z-DWo;irFK93s~<8}r4;goQ0FM2wV0;bRHb!=RO%YISVGYYJJlG^amJ!>eu`V+EUWzgM6QmU%!cBR7owYt9~cirLw)5VuH3q*34(-`NY^Pk7n_1 zf_&2bg?8BX@){;GfLr&o52vtZ->(vH3?2A%mAroT2m6bC<(iL6F~CTPa5HIP=m|%J zk5=l@*0OMKq276XPaDGjEMm4KYFkM2HRb)G##$4#+gG;NZfWhP-NHHw|MtUTtv11CeZR5lby=goe(38|*?_%l zBexamV%qCT)9jwbhsIq0YfaDGH@n}5@81rbZHp|Ab4&vChly|{>46GbQFk(PGroe3 zX^r%f|AX>ybi#TE>OF{0HRdYRd03&-0Tn!2_M6zVhO|GYZp2Ssn~3*hzly754t@w5 zcs|ov#(0{uUC^jUvQ_HF&;3+Nw3d~N-1^3BoTBIgVIWfuPHj309Zeo33fU!ecK9Jlpz zObhYb`@^X8o!G;`ckjn}Kh=x4TkD|_Im!D8@1LipXY*b$l)_w6WiI)UOPW~kb8>03 zhFsEQF16=OQ)?3W1o|uY7S4xfBp${-T59>@b8ds5X|G5c`y;{ecl0?P|B!w5hn=%y zc8ri8rNCX7o4J_>$LAd>k9Akbc8=T9p1UQMjkY~66xg%NVUO+i`SH==Z^NInV+k8M zUz~BM&i(h|KqFeqmGZ6uH+A7Rb}i0g6p=_q4%)Vu^f62OTD`a^g8We zy}y^^LN72cyl&>l;rPEE5(nToV61wTsG?yR`1m|j&k(jaS2F*u+IK-1)s0eccdw1%Ja<&M!ao5Y?w1C!e*a;+}{* 
z@8M5PrZeB3q}XfaJ&x}S)}5;!cR_1TIzLCV=XfaW=Xq(&`zY^o3_skS#s%!b{m)^~ zs+6_&+i{y~hsWa8TFY>k6ZW?Y{xr@d%Wh$U_~VQ--(~m>VVbuC_SvJ*&0i3$Y^X-w z*?Z_8zc9NiL8}7oG2}5G1>NlPAEKVj<5UdfZDe^r?r*yP*EQ>Le(_!K*hU3@xs&Ic zV+ZFku!ijWF8fvN^<_R>(eh_(dWQxqU5CY(YSFE=A>x`Nc~%-m`{)Y7o&=sZw)upt z8p&^(GIq9K64avMp{tTdygkw#6uVvS8GwPDE4jR!^|7-uH+b(rK8+IQR;iwlf%lO; z|H5->EP{KQ6)R1&UBBrXzEYx$qkW{4aE+H`_S|X|hE6!$-aoCaUwP z!EM{tZSCFN5nR7v`_^^dN|jl9_Z>OP4W?G2?g1CqJa6|=4DS9udd~j$3i{s_{r)3A zqiWH@dUK^d&JT9!; zz_%APjWMGz?M-|m_e_)di>2s+q6dl|D0-mifuaYB9w>UC=z*dKiXJF>py+}BwFeUN maeVX}mZroLR^9pJT+MxwkAEWI(z(>CC7Djy4U0VQ@4o?u^PO=3 literal 12392 zcmeHNZ)jWB6+h3CZN<3Mr499(EqSuNty0=L+bBqtWj{M4P3<&s8Yl5KMxLe^?7^~R zNkN{a>3L*O7mqoEg}O9ZytXC@?MPvR3R@O;Y(zoBa z?>_rkb_0Wf26=+`-h1wO_nhDFoO|ziQizNw(PZpO{p6@glS8IC8JqQ-O3~D^9P>jS zV={I~F;YZo2a#XDQtwZoJw#o8og8bh?#_pJjfs@sYwV&L;?EB8*Zt!%@c+j^wR6+8 zU&Wm8`tJI(soMEh#pnyuF8*pqpt|;}9sj%Ces!bo)~40YzpBUo-CO>mkCb8)br+i{ zR#a)c_%O{Bx6oWML<>cYl+q6Jm-?x@G)S?MPUEFvnkgNjxl)1_N(L#5&yjy|in-inDStw9lsOkcP&ifoh%D1 zSzyTmM;17;z>x)xEO6w2BL^Hg;K%_-4mfhakpqq#aO8m_4;*>m$OA_nIP$=e2aY^& z6o8`u90lMg07n5h3cyhSj)LZkWoJFZ@SWQc_#VruH-7}*c{~P;-2NolOtCEc*+#Up z+QGlu?O!#KmnQPkL|&T6OA~o%A}`H_aufE1O2#he6rfXpP60Xv$g|KZ%zAuD=oX+` zU_I{h9FKX9WeRB>bg?G{p7kR}Pnal}Qz#HaTcV5>HBB|#sP)EF>K%Q}OUVld)D%DG zjLTYUc;yysX;^uzX<_cW0K{CbG_T)-YqHHvo*9T){CD*1U1dm@-)-fC83Qw%6m zCf-3>9Jb;K!PQ1R*;yAKsIGIG);*ouTlcVy z%5|S;8X2HW0Q~;9&z%_P%QH2aqDp|dguTKFrs&AvCl|BI3NuEd6)a$F+x7c<^K zdgN63&Gb5YX|qb_#~z?F-9#7K8|jQjbiTbA<1Lg5{k}e)9Jk&Vb*vr!N9m90Q}3B+ zg*w6wG=XP3M*Baxkyh!0F_p}**UU3oKVp0J){g;CE5{fIfzdR- z-*NG9Jmi-cpTOED>#eolM*lMOS(cJA)*pucO~?2W#tU*RzWamiI@o4>&OIKGwiP{%&e zJ`#-IF=jsAm1E|^&ed@{KFEt&;I7Qc`=l?GvH68ui|yX_TE3Lf|ElyQCuO!T->dND zX@@U1=c{AGTpOVqb{yfKGX`y?|7{FtN{e2p?;Z6}FTPvXQqGvLk8AOBQ{=(wSp87q zXMS#YEPftT^iXZtiq$nKV}71PY;YiO-LLbQWA&&Ui`<=3;a6bPV+7!rno3EVm3y@$ z*JPU`vcCZ~PePw%`zmZYePElH<@}xZ%gPv>{r4Bg;BjkCA2xm)nGv;ebb@eCIM0`D z4xj%}V{tIboFTUC^Nf@+FFJf&@4&!5*E)PwkRO!Eba}N8;zZGDKKFS~1wYHAJ2USe 
zV176DyuTM~b#-p%wx`Q9Te=8m`NhLy5&iIYB9{(_Bb_xbLD!1uPTV<(_u}c((hv6O z9@&2${l}%BCB^bR@%GmAQ8}OZYioK)_L);3b&b>fL|PN~SXZtY?>DoaKq}Gqi1=*= z6u>TrtzuA6V-bF8f?_W!tDt$B-E?P0VZN0PO zz2{n*F(p3zTiTD`ngkwt8FhhmmnpD&7HjO0Ybd}m1sojPbC8Fnoa+V+coT(}({1n@ zKJiB}Jj>@S#(F~dO}Zg!kbuaoEwIIP+XJXU#CeK(5cSB>kIkP!{)4QLt*Fz2xhAQb zU)I%F*GKUX>*t|=eVK3e1?T5-9le$xkF4PbYCGviFem)jZO3e-4?asz>#ReKM!iBe zk2|Qc{0%y#wW0_OR5bB@+u2>)BK2h`M zw-eU#9p$`hCd3BktA=yeu1WIyaYyoVh%^677$1g6wVpSOn$nlC!Fl|vt$~!cxMTFK zdwrt@)&VCt11Q>vIie?V20(7#txswxa}u3$XZZ`XyHaXe{dfg zM$1x;T#(~>=par@ceaQPFc8&Gn`R)~tU9pVA$Arv-hZz=Mr7qM%h)F8avPoMu3=2r zM?Q=daD;cQm3A>dkNEk7Zd?cVVm`l=UszFoMvnbh^b#ZSV+W599Xo#T8x`YkR*b(T z#sg4Q(c9kr)XoO9{oQ-_a?9&HhST=pfC|aBX9f{kROc47d!q47d!q z47d!q47d!q47d!q47d!q47d!q3{)_nH(3Jfzv&YV-e--am|n3v70=nj>vI11{g&j? NdaLbt(AKcp{{Z)<`6~bb diff --git a/tests/xics/head.S b/tests/xics/head.S index c513a02..4de3e29 100644 --- a/tests/xics/head.S +++ b/tests/xics/head.S @@ -115,7 +115,7 @@ __isr: std %r29, 29*8(%r1) std %r30, 30*8(%r1) std %r31, 31*8(%r1) - mfsrr0 %r0 + mfhsrr0 %r0 std %r0, SAVE_NIA*8(%r1) mflr %r0 std %r0, SAVE_LR*8(%r1) @@ -123,7 +123,7 @@ __isr: std %r0, SAVE_CTR*8(%r1) mfcr %r0 std %r0, SAVE_CR*8(%r1) - mfsrr1 %r0 + mfhsrr1 %r0 std %r0, SAVE_SRR1*8(%r1) stdu %r1,-STACK_FRAME_C_MINIMAL(%r1) diff --git a/writeback.vhdl b/writeback.vhdl index 6a86fb7..c479c20 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -72,11 +72,13 @@ begin variable vec : integer range 0 to 16#fff#; variable srr1 : std_ulogic_vector(15 downto 0); variable intr : std_ulogic; + variable hvi : std_ulogic; begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; f := WritebackToFetch1Init; vec := 0; + hvi := '0'; complete_out <= instr_tag_init; if e_in.valid = '1' then @@ -96,6 +98,7 @@ begin if e_in.interrupt = '1' then vec := e_in.intr_vec; srr1 := e_in.srr1; + hvi := e_in.hv_intr; elsif l_in.interrupt = '1' then vec := l_in.intr_vec; srr1 := l_in.srr1; @@ -103,6 +106,7 @@ begin vec := fp_in.intr_vec; srr1 
:= fp_in.srr1; end if; + interrupt_out.hv_intr <= hvi; interrupt_out.srr1 <= srr1; if intr = '0' then From e3f4ccedecca091086ef3328c04d76ff4ee0ed6b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 21 Aug 2023 21:43:35 +1000 Subject: [PATCH 02/21] Implement facility unavailable and hypervisor facility unavailable interrupts This adds the FSCR and HFSCR registers and implements the associated behaviours of taking a facility unavailable or hypervisor facility unavailable interrupt if certain actions are attempted while the relevant [H]FSCR bit is zero. At present, two FSCR enable bits and three HFSCR enable bits are implemented. FSCR has bits for prefixed instructions and accesses to the TAR register, and HFSCR has those plus a bit that enables access to floating-point registers and instructions. FSCR and HFSCR can be accessed through the debug interface using register addresses 0x2e and 0x2f. Signed-off-by: Paul Mackerras --- common.vhdl | 53 +++++++++++++----- core_debug.vhdl | 14 ++++- decode1.vhdl | 8 ++- decode2.vhdl | 5 ++ execute1.vhdl | 106 ++++++++++++++++++++++++++++++++++-- scripts/mw_debug/mw_debug.c | 1 + 6 files changed, 166 insertions(+), 21 deletions(-) diff --git a/common.vhdl b/common.vhdl index 64fb755..6759f8f 100644 --- a/common.vhdl +++ b/common.vhdl @@ -55,6 +55,8 @@ package common is constant SPR_PID : spr_num_t := 48; constant SPR_PTCR : spr_num_t := 464; constant SPR_PVR : spr_num_t := 287; + constant SPR_FSCR : spr_num_t := 153; + constant SPR_HFSCR : spr_num_t := 190; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -140,22 +142,36 @@ package common is end record; constant ram_spr_info_init: ram_spr_info := (index => to_unsigned(0,3), others => '0'); - subtype spr_selector is std_ulogic_vector(2 downto 0); + subtype spr_selector is std_ulogic_vector(3 downto 0); type spr_id is record sel : spr_selector; valid : std_ulogic; ispmu : std_ulogic; end record; - constant spr_id_init : spr_id := (sel => "000", others => '0'); - - 
constant SPRSEL_TB : spr_selector := 3x"0"; - constant SPRSEL_TBU : spr_selector := 3x"1"; - constant SPRSEL_DEC : spr_selector := 3x"2"; - constant SPRSEL_PVR : spr_selector := 3x"3"; - constant SPRSEL_LOGA : spr_selector := 3x"4"; - constant SPRSEL_LOGD : spr_selector := 3x"5"; - constant SPRSEL_CFAR : spr_selector := 3x"6"; - constant SPRSEL_XER : spr_selector := 3x"7"; + constant spr_id_init : spr_id := (sel => "0000", others => '0'); + + constant SPRSEL_TB : spr_selector := 4x"0"; + constant SPRSEL_TBU : spr_selector := 4x"1"; + constant SPRSEL_DEC : spr_selector := 4x"2"; + constant SPRSEL_PVR : spr_selector := 4x"3"; + constant SPRSEL_LOGA : spr_selector := 4x"4"; + constant SPRSEL_LOGD : spr_selector := 4x"5"; + constant SPRSEL_CFAR : spr_selector := 4x"6"; + constant SPRSEL_FSCR : spr_selector := 4x"7"; + constant SPRSEL_HFSCR : spr_selector := 4x"8"; + constant SPRSEL_XER : spr_selector := 4x"f"; + + -- FSCR and HFSCR bit numbers + constant FSCR_PREFIX : integer := 63 - 50; + constant FSCR_SCV : integer := 63 - 51; + constant FSCR_TAR : integer := 63 - 55; + constant FSCR_DSCR3 : integer := 63 - 61; + constant HFSCR_PREFIX : integer := 63 - 50; + constant HFSCR_MSG : integer := 63 - 53; + constant HFSCR_TAR : integer := 63 - 55; + constant HFSCR_PMUSPR : integer := 63 - 60; + constant HFSCR_DSCR : integer := 63 - 61; + constant HFSCR_FP : integer := 63 - 63; -- FPSCR bit numbers constant FPSCR_FX : integer := 63 - 32; @@ -230,9 +246,19 @@ package common is msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); xer_low: std_ulogic_vector(17 downto 0); + fscr_ic: std_ulogic_vector(3 downto 0); + fscr_pref: std_ulogic; + fscr_tar: std_ulogic; + hfscr_ic: std_ulogic_vector(3 downto 0); + hfscr_pref: std_ulogic; + hfscr_tar: std_ulogic; + hfscr_fp: std_ulogic; end record; constant ctrl_t_init : ctrl_t := - (xer_low => 18x"0", others => (others => '0')); + (xer_low => 18x"0", + fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', + hfscr_ic 
=> x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_fp => '1', + others => (others => '0')); type Fetch1ToIcacheType is record req: std_ulogic; @@ -377,6 +403,7 @@ package common is prefixed : std_ulogic; illegal_suffix : std_ulogic; misaligned_prefix : std_ulogic; + uses_tar : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -396,7 +423,7 @@ package common is ramspr_wraddr => (others => '0'), ramspr_write_even => '0', ramspr_write_odd => '0', dbg_spr_access => '0', dec_ctr => '0', - prefixed => '0', illegal_suffix => '0', misaligned_prefix => '0', + prefixed => '0', illegal_suffix => '0', misaligned_prefix => '0', uses_tar => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/core_debug.vhdl b/core_debug.vhdl index c7215ff..8e06127 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -294,7 +294,7 @@ begin -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file valid := '1'; - sel := "000"; + sel := "0000"; isram := '1'; raddr := (others => '0'); odd := '0'; @@ -324,10 +324,20 @@ begin sel := SPRSEL_XER; when 5x"0d" => raddr := RAMSPR_TAR; + when 5x"0e" => + isram := '0'; + sel := SPRSEL_FSCR; + when 5x"0f" => + isram := '0'; + sel := SPRSEL_HFSCR; when others => valid := '0'; end case; - dbg_spr_addr <= isram & sel & std_ulogic_vector(raddr) & odd; + if isram = '1' then + dbg_spr_addr <= "1000" & std_ulogic_vector(raddr) & odd; + else + dbg_spr_addr <= "0000" & sel; + end if; spr_index_valid <= valid; end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index 151977d..fd20810 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -427,7 +427,7 @@ architecture behaviour of decode1 is function map_spr(sprn : spr_num_t) return spr_id is variable i : spr_id; begin - i.sel := "000"; + i.sel := "0000"; i.valid := '1'; i.ispmu := '0'; case sprn is @@ -452,6 +452,10 @@ architecture behaviour 
of decode1 is i.sel := SPRSEL_CFAR; when SPR_XER => i.sel := SPRSEL_XER; + when SPR_FSCR => + i.sel := SPRSEL_FSCR; + when SPR_HFSCR => + i.sel := SPRSEL_HFSCR; when others => i.valid := '0'; end case; @@ -521,7 +525,7 @@ begin v.big_endian := f_in.big_endian; if is_X(f_in.insn) then - v.spr_info := (sel => "XXX", others => 'X'); + v.spr_info := (sel => "XXXX", others => 'X'); v.ram_spr := (index => (others => 'X'), others => 'X'); else sprn := decode_spr_num(f_in.insn); diff --git a/decode2.vhdl b/decode2.vhdl index 74809f5..31a4909 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -450,6 +450,8 @@ begin v.input_ov := '1'; when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => unit := LDST; + when SPR_TAR => + v.e.uses_tar := '1'; when others => end case; end if; @@ -468,6 +470,8 @@ begin if d_in.valid = '1' then v.sgl_pipe := '1'; end if; + when SPR_TAR => + v.e.uses_tar := '1'; when others => end case; if d_in.spr_info.valid = '1' and d_in.valid = '1' then @@ -525,6 +529,7 @@ begin v.e.ramspr_rd_odd := '1'; else v.e.ramspr_even_rdaddr := RAMSPR_TAR; + v.e.uses_tar := '1'; end if; sprs_busy := '1'; when OP_MFSPR => diff --git a/execute1.vhdl b/execute1.vhdl index 34be583..c6c0960 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -85,6 +85,10 @@ architecture behaviour of execute1 is ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; mult_32s : std_ulogic; + write_fscr : std_ulogic; + write_ic : std_ulogic; + write_hfscr : std_ulogic; + write_hic : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -106,11 +110,12 @@ architecture behaviour of execute1 is res2_sel : std_ulogic_vector(1 downto 0); bypass_valid : std_ulogic; ramspr_odd_data : std_ulogic_vector(63 downto 0); + ic : std_ulogic_vector(3 downto 0); end record; constant actions_type_init : actions_type := (e => Execute1ToWritebackInit, se => side_effect_init, new_msr => (others => '0'), res2_sel => "00", - ramspr_odd_data => 64x"0", others => '0'); + 
ramspr_odd_data => 64x"0", ic => x"0", others => '0'); type reg_stage1_type is record e : Execute1ToWritebackType; @@ -141,6 +146,7 @@ architecture behaviour of execute1 is xerc_valid : std_ulogic; ramspr_wraddr : ramspr_index; ramspr_odd_data : std_ulogic_vector(63 downto 0); + ic : std_ulogic_vector(3 downto 0); end record; constant reg_stage1_type_init : reg_stage1_type := (e => Execute1ToWritebackInit, se => side_effect_init, @@ -155,7 +161,8 @@ architecture behaviour of execute1 is taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", xerc => xerc_init, xerc_valid => '0', - ramspr_wraddr => (others => '0'), ramspr_odd_data => 64x"0"); + ramspr_wraddr => (others => '0'), ramspr_odd_data => 64x"0", + ic => x"0"); type reg_stage2_type is record e : Execute1ToWritebackType; @@ -369,6 +376,27 @@ architecture behaviour of execute1 is xerc.ov32 & xerc.ca32 & xer_low(17 downto 0); end; + function assemble_fscr(c: ctrl_t) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(59 downto 56) := c.fscr_ic; + ret(FSCR_PREFIX) := c.fscr_pref; + ret(FSCR_TAR) := c.fscr_tar; + return ret; + end; + + function assemble_hfscr(c: ctrl_t) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(59 downto 56) := c.hfscr_ic; + ret(HFSCR_PREFIX) := c.hfscr_pref; + ret(HFSCR_TAR) := c.hfscr_tar; + ret(HFSCR_FP) := c.hfscr_fp; + return ret; + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. 
attribute keep_hierarchy : string; @@ -646,7 +674,14 @@ begin if dbg_spr_addr(7) = '1' then dbg_spr_data <= ramspr_result; else - dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + case dbg_spr_addr(3 downto 0) is + when SPRSEL_FSCR => + dbg_spr_data <= assemble_fscr(ctrl); + when SPRSEL_HFSCR => + dbg_spr_data <= assemble_hfscr(ctrl); + when others => + dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + end case; end if; dbg_spr_ack <= '1'; end if; @@ -1280,6 +1315,10 @@ begin v.se.write_dec := '1'; when SPRSEL_LOGA => v.se.write_loga := '1'; + when SPRSEL_FSCR => + v.se.write_fscr := '1'; + when SPRSEL_HFSCR => + v.se.write_hfscr := '1'; when others => end case; end if; @@ -1341,7 +1380,25 @@ begin end if; end case; - if misaligned = '1' then + if ex1.msr(MSR_PR) = '1' and e_in.prefixed = '1' and + (ctrl.hfscr_pref = '0' or ctrl.fscr_pref = '0') then + -- [Hypervisor] facility unavailable for prefixed instructions, + -- which has higher priority than the alignment interrupt for + -- misaligned prefixed instructions, which has higher priority than + -- other [hypervisor] facility unavailable interrupts (e.g. for + -- plfs with HFSCR[FP] = 0). + v.exception := '1'; + v.ic := x"b"; + if ctrl.hfscr_pref = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + + elsif misaligned = '1' then -- generate an alignment interrupt -- This is higher priority than illegal because a misaligned -- prefix will come down as an OP_ILLEGAL instruction. 
@@ -1373,6 +1430,29 @@ begin report "illegal instruction"; end if; + elsif ex1.msr(MSR_PR) = '1' and e_in.uses_tar = '1' and + (ctrl.hfscr_tar = '0' or ctrl.fscr_tar = '0') then + -- [Hypervisor] facility unavailable for TAR access + v.exception := '1'; + v.ic := x"8"; + if ctrl.hfscr_tar = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + + elsif HAS_FPU and ex1.msr(MSR_PR) = '1' and e_in.fac = FPU and + ctrl.hfscr_fp = '0' then + -- Hypervisor facility unavailable for FP instructions + v.exception := '1'; + v.ic := x"0"; + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt v.exception := '1'; @@ -1414,6 +1494,7 @@ begin v.ramspr_wraddr := e_in.ramspr_wraddr; v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; + v.ic := actions.ic; end if; lv := Execute1ToLoadstore1Init; @@ -1669,6 +1750,8 @@ begin log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, log_rd_data when SPRSEL_LOGD, ctrl.cfar when SPRSEL_CFAR, + assemble_fscr(ctrl) when SPRSEL_FSCR, + assemble_hfscr(ctrl) when SPRSEL_HFSCR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; @@ -1811,6 +1894,21 @@ begin v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1); end if; x_to_pmu.mtspr <= ex1.se.write_pmuspr; + if ex1.se.write_hfscr = '1' then + ctrl_tmp.hfscr_ic <= ex1.e.write_data(59 downto 56); + ctrl_tmp.hfscr_pref <= ex1.e.write_data(HFSCR_PREFIX); + ctrl_tmp.hfscr_tar <= ex1.e.write_data(HFSCR_TAR); + ctrl_tmp.hfscr_fp <= ex1.e.write_data(HFSCR_FP); + elsif ex1.se.write_hic = '1' then + ctrl_tmp.hfscr_ic <= ex1.ic; + end if; + if ex1.se.write_fscr = '1' then + ctrl_tmp.fscr_ic <= ex1.e.write_data(59 downto 56); + ctrl_tmp.fscr_pref <= ex1.e.write_data(FSCR_PREFIX); + ctrl_tmp.fscr_tar <= 
ex1.e.write_data(FSCR_TAR); + elsif ex1.se.write_ic = '1' then + ctrl_tmp.fscr_ic <= ex1.ic; + end if; end if; if interrupt_in.intr = '1' then diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 07c1056..4cb2beb 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -550,6 +550,7 @@ static const char *fast_spr_names[] = "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", "hsprg0", "hsprg1", "xer", "tar", + "fscr", "hfscr", }; static const char *ldst_spr_names[] = { From d2777dd1dd84a33750c934b3e8c68a9f7a58f9ae Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 15 Sep 2023 20:18:14 +1000 Subject: [PATCH 03/21] Generate Hypervisor Emulation Assistance Interrupt for illegal instructions This implements the HEIR register (Hypervisor Emulation Instruction Register) and arranges for an illegal instruction to cause a Hypervisor Emulation Assistance Interrupt (HEAI) at vector 0xE40, and set HEIR to the illegal instruction. 
Signed-off-by: Paul Mackerras --- common.vhdl | 7 ++++++- core_debug.vhdl | 3 +++ decode1.vhdl | 2 ++ decode2.vhdl | 1 + execute1.vhdl | 35 +++++++++++++++++++++++++++++++---- scripts/mw_debug/mw_debug.c | 2 +- tests/illegal/head.S | 29 +++++++++++++++-------------- tests/test_illegal.bin | Bin 5224 -> 5500 bytes 8 files changed, 59 insertions(+), 20 deletions(-) diff --git a/common.vhdl b/common.vhdl index 6759f8f..fa6df86 100644 --- a/common.vhdl +++ b/common.vhdl @@ -57,6 +57,7 @@ package common is constant SPR_PVR : spr_num_t := 287; constant SPR_FSCR : spr_num_t := 153; constant SPR_HFSCR : spr_num_t := 190; + constant SPR_HEIR : spr_num_t := 339; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -159,6 +160,7 @@ package common is constant SPRSEL_CFAR : spr_selector := 4x"6"; constant SPRSEL_FSCR : spr_selector := 4x"7"; constant SPRSEL_HFSCR : spr_selector := 4x"8"; + constant SPRSEL_HEIR : spr_selector := 4x"9"; constant SPRSEL_XER : spr_selector := 4x"f"; -- FSCR and HFSCR bit numbers @@ -253,6 +255,7 @@ package common is hfscr_pref: std_ulogic; hfscr_tar: std_ulogic; hfscr_fp: std_ulogic; + heir: std_ulogic_vector(63 downto 0); end record; constant ctrl_t_init : ctrl_t := (xer_low => 18x"0", @@ -401,6 +404,7 @@ package common is dbg_spr_access : std_ulogic; dec_ctr : std_ulogic; prefixed : std_ulogic; + prefix : std_ulogic_vector(25 downto 0); illegal_suffix : std_ulogic; misaligned_prefix : std_ulogic; uses_tar : std_ulogic; @@ -423,7 +427,8 @@ package common is ramspr_wraddr => (others => '0'), ramspr_write_even => '0', ramspr_write_odd => '0', dbg_spr_access => '0', dec_ctr => '0', - prefixed => '0', illegal_suffix => '0', misaligned_prefix => '0', uses_tar => '0', + prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', + misaligned_prefix => '0', uses_tar => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/core_debug.vhdl b/core_debug.vhdl index 8e06127..6997477 100644 --- a/core_debug.vhdl +++ 
b/core_debug.vhdl @@ -330,6 +330,9 @@ begin when 5x"0f" => isram := '0'; sel := SPRSEL_HFSCR; + when 5x"10" => + isram := '0'; + sel := SPRSEL_HEIR; when others => valid := '0'; end case; diff --git a/decode1.vhdl b/decode1.vhdl index fd20810..a4e4908 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -456,6 +456,8 @@ architecture behaviour of decode1 is i.sel := SPRSEL_FSCR; when SPR_HFSCR => i.sel := SPRSEL_HFSCR; + when SPR_HEIR => + i.sel := SPRSEL_HEIR; when others => i.valid := '0'; end case; diff --git a/decode2.vhdl b/decode2.vhdl index 31a4909..e7c73fa 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -609,6 +609,7 @@ begin end if; end if; v.e.prefixed := d_in.prefixed; + v.e.prefix := d_in.prefix; v.e.illegal_suffix := d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; diff --git a/execute1.vhdl b/execute1.vhdl index c6c0960..b71842d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -89,6 +89,8 @@ architecture behaviour of execute1 is write_ic : std_ulogic; write_hfscr : std_ulogic; write_hic : std_ulogic; + write_heir : std_ulogic; + set_heir : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -147,6 +149,9 @@ architecture behaviour of execute1 is ramspr_wraddr : ramspr_index; ramspr_odd_data : std_ulogic_vector(63 downto 0); ic : std_ulogic_vector(3 downto 0); + prefixed : std_ulogic; + insn : std_ulogic_vector(31 downto 0); + prefix : std_ulogic_vector(25 downto 0); end record; constant reg_stage1_type_init : reg_stage1_type := (e => Execute1ToWritebackInit, se => side_effect_init, @@ -162,7 +167,8 @@ architecture behaviour of execute1 is msr => 64x"0", xerc => xerc_init, xerc_valid => '0', ramspr_wraddr => (others => '0'), ramspr_odd_data => 64x"0", - ic => x"0"); + ic => x"0", + prefixed => '0', insn => 32x"0", prefix => 26x"0"); type reg_stage2_type is record e : Execute1ToWritebackType; @@ -679,6 +685,8 @@ begin dbg_spr_data <= assemble_fscr(ctrl); when SPRSEL_HFSCR => dbg_spr_data <= 
assemble_hfscr(ctrl); + when SPRSEL_HEIR => + dbg_spr_data <= ctrl.heir; when others => dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); end case; @@ -1319,6 +1327,8 @@ begin v.se.write_fscr := '1'; when SPRSEL_HFSCR => v.se.write_hfscr := '1'; + when SPRSEL_HEIR => + v.se.write_heir := '1'; when others => end case; end if; @@ -1421,11 +1431,13 @@ begin end if; elsif illegal = '1' then + -- generate hypervisor emulation assistance interrupt (HEAI) + -- and write the offending instruction into HEIR v.exception := '1'; v.e.srr1(47 - 34) := e_in.prefixed; - -- Since we aren't doing Hypervisor emulation assist (0xe40) we - -- set bit 44 to indicate we have an illegal - v.e.srr1(47 - 44) := '1'; + v.e.intr_vec := 16#e40#; + v.e.hv_intr := '1'; + v.se.set_heir := '1'; if e_in.valid = '1' then report "illegal instruction"; end if; @@ -1495,6 +1507,9 @@ begin v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; v.ic := actions.ic; + v.prefixed := e_in.prefixed; + v.insn := e_in.insn; + v.prefix := e_in.prefix; end if; lv := Execute1ToLoadstore1Init; @@ -1752,6 +1767,7 @@ begin ctrl.cfar when SPRSEL_CFAR, assemble_fscr(ctrl) when SPRSEL_FSCR, assemble_hfscr(ctrl) when SPRSEL_HFSCR, + ctrl.heir when SPRSEL_HEIR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; @@ -1909,6 +1925,17 @@ begin elsif ex1.se.write_ic = '1' then ctrl_tmp.fscr_ic <= ex1.ic; end if; + if ex1.se.write_heir = '1' then + ctrl_tmp.heir <= ex1.e.write_data; + elsif ex1.se.set_heir = '1' then + ctrl_tmp.heir(31 downto 0) <= ex1.insn; + if ex1.prefixed = '1' then + ctrl_tmp.heir(63 downto 58) <= 6x"01"; + ctrl_tmp.heir(57 downto 32) <= ex1.prefix; + else + ctrl_tmp.heir(63 downto 32) <= (others => '0'); + end if; + end if; end if; if interrupt_in.intr = '1' then diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 4cb2beb..13b5788 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ 
-550,7 +550,7 @@ static const char *fast_spr_names[] = "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", "hsprg0", "hsprg1", "xer", "tar", - "fscr", "hfscr", + "fscr", "hfscr", "heir", }; static const char *ldst_spr_names[] = { diff --git a/tests/illegal/head.S b/tests/illegal/head.S index 5446d68..2f7e3d2 100644 --- a/tests/illegal/head.S +++ b/tests/illegal/head.S @@ -74,25 +74,15 @@ ill_test_1: EXCEPTION(0x500) EXCEPTION(0x600) + // We shouldn't get a Program interrupt at 700, so fail . = 0x700 mtsprg0 %r3 - mtsprg1 %r4 - - // test for bit 44 being set for ILL - mfsrr1 %r3 - li %r4, 1 - sldi %r4, %r4, (63-44) - and. %r4, %r4, %r3 - li %r4, 8 // PASS so skip 2 instructions - bne 1f - li %r4, 4 // FAIL so only skip 1 instruction. Return will catch -1: + mfsrr0 %r3 - add %r3, %r3, %r4 // skip some instructions + addi %r3, %r3, 4 // skip one instruction, causing a fail mtsrr0 %r3 mfsprg0 %r3 - mfsprg1 %r4 rfid EXCEPTION(0x800) @@ -104,7 +94,18 @@ ill_test_1: EXCEPTION(0xd00) EXCEPTION(0xe00) EXCEPTION(0xe20) - EXCEPTION(0xe40) + + // We now expect a HEAI at e40 for illegal instructions + . = 0xe40 + mthsprg0 %r3 + + mfhsrr0 %r3 + addi %r3, %r3, 8 // skip one instruction, causing success + mthsrr0 %r3 + + mfhsprg0 %r3 + hrfid + EXCEPTION(0xe60) EXCEPTION(0xe80) EXCEPTION(0xf00) diff --git a/tests/test_illegal.bin b/tests/test_illegal.bin index 727876cf46da68ffefca3b1659275b558a4f8dd1..22bbbc1190e4f44b761fcbb1a8fd791badee4350 100755 GIT binary patch literal 5500 zcmeHLzi$&)9RI$vWnWwq8L300L~srn2=ZWtO6L-zJJSZ@&;X59)a{HLsYo3XmBEhA z_=q|b6sc9k01IJ29wO8!e?VF!#MmxXr-mvE2}p!3Re;Cm`|cb&C^Z9vk?gPZ{GPu* zKi?ncCtsN843RNLlyn|OlV$AR(BY&*2|G(WDqq{k?uIVo1 zJCN;^2d}zPUK)$h;}g!6k_nsbKV@?Jg;4!-{@Qt6OH<8^kvzN-Ex8v0{`sw%o5pM~ zFB|Ya#e0F{!8uJy6OJbXh{D6W1wqaD9J>+gfab^I_X;zxx%OPyIYgn<2rbx*PXOPy z!1r~F|CQ)^Z3z11!J5ip7V=CWf7}{dG-CsgqkF^Gc_%NIA?iz+h(UC0<6|1l+z*qx zJY`e`jm|xj%xAqzPZa-<}~oKkxIuvXD!DPRR_PZvhYaef^KfLT|a(x!Xeod+7W! 
zJ*s>S1o~3?{obw1DrGIwiw5a_jB%>2NA*g>XVcF_^=>d5f{YC2={CVOZs!;jW#Qtc1{~VM3551J~XSM?|j|*@MydRkXI*Mgs#a~J3U{^Sn#2xLF niu0L@0w4&Z?7zcqL6_~A*r$h80M8OQfS7j^%X=*)>odIppCp}O delta 798 zcmZ`$O-vI(6#k~IFd=ALAfZVKE)^0BYHK{WD`K{g@Y51$0#T3E#tSEj9J;v8Zh#A3 zOr3;;nBYOvgYhqss{!TU$pkN61?5mYajdB_zS%HoDL~7ioZqN`9qyu!NSpYxG_UH;O-#B_!6*` zNMcna!qHRHm6rk7FFJGfwTAqZ$b~zD(-ls36#SFmEm8W9#5s~!4@X(A_!jPCX%TGc zarWhhT2=1Uq}=QefmOfAt}4U9BVaud!m6pvk`sOHkIXdTX;4^?} zL}-oJFb7PeiBH%08)gw3)JN95<9^qn)%&^Y`_%#BD1ST@F9xck)@G1fF6l7RQ+>_E z0V6Bwe{NQqhPTrZvq&4?^X@k_@9z5;w*pAlc@K;2`AMXiM)B$ Date: Fri, 15 Sep 2023 21:56:25 +1000 Subject: [PATCH 04/21] execute1: Make CFAR able to be written using mtspr and read using DMI debug mtspr to CFAR is currently a no-op, which is not what should happen. Make it set the contents of CFAR. Also provide access to CFAR via the DMI debug interface as register 0x31. Fixes: c2da82764f74 ("core: Implement CFAR register", 2020-06-15) Signed-off-by: Paul Mackerras --- core_debug.vhdl | 3 +++ execute1.vhdl | 15 +++++++++++---- scripts/mw_debug/mw_debug.c | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/core_debug.vhdl b/core_debug.vhdl index 6997477..67b41fb 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -333,6 +333,9 @@ begin when 5x"10" => isram := '0'; sel := SPRSEL_HEIR; + when 5x"11" => + isram := '0'; + sel := SPRSEL_CFAR; when others => valid := '0'; end case; diff --git a/execute1.vhdl b/execute1.vhdl index b71842d..e48bfb0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -79,6 +79,7 @@ architecture behaviour of execute1 is write_xerlow : std_ulogic; write_dec : std_ulogic; write_cfar : std_ulogic; + set_cfar : std_ulogic; write_loga : std_ulogic; inc_loga : std_ulogic; write_pmuspr : std_ulogic; @@ -687,6 +688,8 @@ begin dbg_spr_data <= assemble_hfscr(ctrl); when SPRSEL_HEIR => dbg_spr_data <= ctrl.heir; + when SPRSEL_CFAR => + dbg_spr_data <= ctrl.cfar; when others => dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); end case; @@ -1177,7 
+1180,7 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := '1'; + v.se.set_cfar := '1'; when OP_BC => -- If CTR is being decremented, it is in ramspr_odd. bo := insn_bo(e_in.insn); @@ -1196,7 +1199,7 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := v.take_branch; + v.se.set_cfar := v.take_branch; when OP_BCREG => -- If CTR is being decremented, it is in ramspr_odd. -- The target address is in ramspr_result (LR, CTR or TAR). @@ -1209,7 +1212,7 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := v.take_branch; + v.se.set_cfar := v.take_branch; when OP_RFID => srr1 := ramspr_odd; @@ -1229,7 +1232,7 @@ begin end if; v.se.write_msr := '1'; v.e.redirect := '1'; - v.se.write_cfar := '1'; + v.se.set_cfar := '1'; if HAS_FPU then v.fp_intr := fp_in.exception and (srr1(MSR_FE0) or srr1(MSR_FE1)); @@ -1323,6 +1326,8 @@ begin v.se.write_dec := '1'; when SPRSEL_LOGA => v.se.write_loga := '1'; + when SPRSEL_CFAR => + v.se.write_cfar := '1'; when SPRSEL_FSCR => v.se.write_fscr := '1'; when SPRSEL_HFSCR => @@ -1902,6 +1907,8 @@ begin ctrl_tmp.dec <= ex1.e.write_data; end if; if ex1.se.write_cfar = '1' then + ctrl_tmp.cfar <= ex1.e.write_data; + elsif ex1.se.set_cfar = '1' then ctrl_tmp.cfar <= ex1.e.last_nia; end if; if ex1.se.write_loga = '1' then diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 13b5788..d7966d9 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -550,7 +550,7 @@ static const char *fast_spr_names[] = "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", "hsprg0", "hsprg1", "xer", "tar", - "fscr", "hfscr", "heir", + "fscr", "hfscr", "heir", "cfar", }; static const char *ldst_spr_names[] = { From 7bc7f335f1fb96cd6362a8efc4c4e4cc38e60cd2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 16 Sep 2023 13:53:34 +1000 Subject: [PATCH 05/21] Implement CTRL register The CTRL 
register has a single bit called RUN. It has some unusual behaviours: - It can only be written via SPR number 152, which is privileged - It can only be read via SPR number 136, which is non-privileged - Reading in problem state (user mode) returns the RUN bit in bit 0, but reading in privileged state (hypervisor mode) returns the RUN bit in bits 0 and 15. - Reading SPR 152 in problem state causes a HEAI (illegal instruction) interrupt, but reading in privileged state is a no-op; this is the same as for an unimplemented SPR. The RUN bit goes to the PMU and is also plumbed out to drive a LED on the Arty board. Signed-off-by: Paul Mackerras --- common.vhdl | 8 +++++++- core.vhdl | 2 ++ decode1.vhdl | 8 ++++++++ decode2.vhdl | 13 +++++++++---- execute1.vhdl | 20 +++++++++++++++++++- fpga/top-arty.vhdl | 5 +++++ soc.vhdl | 3 +++ 7 files changed, 53 insertions(+), 6 deletions(-) diff --git a/common.vhdl b/common.vhdl index fa6df86..9f38874 100644 --- a/common.vhdl +++ b/common.vhdl @@ -58,6 +58,8 @@ package common is constant SPR_FSCR : spr_num_t := 153; constant SPR_HFSCR : spr_num_t := 190; constant SPR_HEIR : spr_num_t := 339; + constant SPR_CTRL : spr_num_t := 136; + constant SPR_CTRLW : spr_num_t := 152; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -148,6 +150,8 @@ package common is sel : spr_selector; valid : std_ulogic; ispmu : std_ulogic; + ronly : std_ulogic; + wonly : std_ulogic; end record; constant spr_id_init : spr_id := (sel => "0000", others => '0'); @@ -161,6 +165,7 @@ package common is constant SPRSEL_FSCR : spr_selector := 4x"7"; constant SPRSEL_HFSCR : spr_selector := 4x"8"; constant SPRSEL_HEIR : spr_selector := 4x"9"; + constant SPRSEL_CTRL : spr_selector := 4x"a"; constant SPRSEL_XER : spr_selector := 4x"f"; -- FSCR and HFSCR bit numbers @@ -243,6 +248,7 @@ package common is -- This needs to die... 
type ctrl_t is record + run: std_ulogic; tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); @@ -258,7 +264,7 @@ package common is heir: std_ulogic_vector(63 downto 0); end record; constant ctrl_t_init : ctrl_t := - (xer_low => 18x"0", + (run => '1', xer_low => 18x"0", fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_fp => '1', others => (others => '0')); diff --git a/core.vhdl b/core.vhdl index 35a860e..bba1004 100644 --- a/core.vhdl +++ b/core.vhdl @@ -48,6 +48,7 @@ entity core is ext_irq : in std_ulogic; + run_out : out std_ulogic; terminated_out : out std_logic ); end core; @@ -390,6 +391,7 @@ begin ls_events => loadstore_events, dc_events => dcache_events, ic_events => icache_events, + run_out => run_out, terminate_out => terminate, dbg_spr_req => dbg_spr_req, dbg_spr_ack => dbg_spr_ack, diff --git a/decode1.vhdl b/decode1.vhdl index a4e4908..09f9f77 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -430,6 +430,8 @@ architecture behaviour of decode1 is i.sel := "0000"; i.valid := '1'; i.ispmu := '0'; + i.ronly := '0'; + i.wonly := '0'; case sprn is when SPR_TB => i.sel := SPRSEL_TB; @@ -458,6 +460,12 @@ architecture behaviour of decode1 is i.sel := SPRSEL_HFSCR; when SPR_HEIR => i.sel := SPRSEL_HEIR; + when SPR_CTRL => + i.sel := SPRSEL_CTRL; + i.ronly := '1'; + when SPR_CTRLW => + i.sel := SPRSEL_CTRL; + i.wonly := '1'; when others => i.valid := '0'; end case; diff --git a/decode2.vhdl b/decode2.vhdl index e7c73fa..1c3f324 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -420,6 +420,8 @@ begin v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; + v.e.spr_select := d_in.spr_info; + -- Work out whether XER SO/OV/OV32 bits are set -- or used by this instruction v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); @@ -454,6 +456,9 @@ begin v.e.uses_tar := '1'; when others => end case; + if d_in.spr_info.wonly = '1' then + 
v.e.spr_select.valid := '0'; + end if; end if; when OP_MTSPR => if is_X(d_in.insn) then @@ -474,7 +479,9 @@ begin v.e.uses_tar := '1'; when others => end case; - if d_in.spr_info.valid = '1' and d_in.valid = '1' then + if d_in.spr_info.ronly = '1' then + v.e.spr_select.valid := '0'; + elsif d_in.spr_info.valid = '1' and d_in.valid = '1' then v.sgl_pipe := '1'; end if; end if; @@ -505,8 +512,6 @@ begin v.e.repeat := '1'; end if; - v.e.spr_select := d_in.spr_info; - if decctr = '1' then -- read and write CTR v.e.ramspr_odd_rdaddr := RAMSPR_CTR; @@ -602,7 +607,7 @@ begin if op = OP_MFSPR then if d_in.ram_spr.valid = '1' then v.e.result_sel := "101"; -- ramspr_result - elsif d_in.spr_info.valid = '0' then + elsif d_in.spr_info.valid = '0' or d_in.spr_info.wonly = '1' then -- Privileged mfspr to invalid/unimplemented SPR numbers -- writes the contents of RT back to RT (i.e. it's a no-op) v.e.result_sel := "001"; -- logical_result diff --git a/execute1.vhdl b/execute1.vhdl index e48bfb0..ed79a3d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -45,6 +45,7 @@ entity execute1 is dbg_ctrl_out : out ctrl_t; + run_out : out std_ulogic; icache_inval : out std_ulogic; terminate_out : out std_ulogic; @@ -92,6 +93,7 @@ architecture behaviour of execute1 is write_hic : std_ulogic; write_heir : std_ulogic; set_heir : std_ulogic; + write_ctrl : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -404,6 +406,15 @@ architecture behaviour of execute1 is return ret; end; + function assemble_ctrl(c: ctrl_t; msrpr: std_ulogic) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(0) := c.run; + ret(15) := c.run and not msrpr; + return ret; + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. 
attribute keep_hierarchy : string; @@ -523,7 +534,7 @@ begin x_to_pmu.addr_v <= '0'; x_to_pmu.spr_num <= ex1.pmu_spr_num; x_to_pmu.spr_val <= ex1.e.write_data; - x_to_pmu.run <= '1'; + x_to_pmu.run <= ctrl.run; -- XER forwarding. The CA and CA32 bits are only modified by instructions -- that are handled here, so for them we can just use the result most @@ -1334,6 +1345,8 @@ begin v.se.write_hfscr := '1'; when SPRSEL_HEIR => v.se.write_heir := '1'; + when SPRSEL_CTRL => + v.se.write_ctrl := '1'; when others => end case; end if; @@ -1773,6 +1786,7 @@ begin assemble_fscr(ctrl) when SPRSEL_FSCR, assemble_hfscr(ctrl) when SPRSEL_HFSCR, ctrl.heir when SPRSEL_HEIR, + assemble_ctrl(ctrl, ex1.msr(MSR_PR)) when SPRSEL_CTRL, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; @@ -1943,6 +1957,9 @@ begin ctrl_tmp.heir(63 downto 32) <= (others => '0'); end if; end if; + if ex1.se.write_ctrl = '1' then + ctrl_tmp.run <= ex1.e.write_data(0); + end if; end if; if interrupt_in.intr = '1' then @@ -1981,6 +1998,7 @@ begin e_out <= ex2.e; e_out.msr <= msr_copy(ctrl.msr); + run_out <= ctrl.run; terminate_out <= ex2.se.terminate; icache_inval <= ex2.se.icache_inval; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index 0980667..c3be9d9 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -142,6 +142,9 @@ end entity toplevel; architecture behaviour of toplevel is + -- Status + signal run_out : std_ulogic; + -- Reset signals: signal soc_rst : std_ulogic; signal pll_rst : std_ulogic; @@ -263,6 +266,7 @@ begin system_clk => system_clk, rst => soc_rst, sw_soc_reset => sw_rst, + run_out => run_out, -- UART signals uart0_txd => uart_main_tx, @@ -742,6 +746,7 @@ begin led4 <= system_clk_locked; led5 <= eth_clk_locked; led6 <= not soc_rst; + led7 <= run_out; -- GPIO gpio_in(10) <= btn0; diff --git a/soc.vhdl b/soc.vhdl index 942da63..71474df 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -99,6 +99,8 @@ entity soc is rst : in std_ulogic; 
system_clk : in std_ulogic; + run_out : out std_ulogic; + -- "Large" (64-bit) DRAM wishbone wb_dram_in : out wishbone_master_out; wb_dram_out : in wishbone_slave_out := wishbone_slave_out_init; @@ -366,6 +368,7 @@ begin clk => system_clk, rst => rst_core, alt_reset => alt_reset_d, + run_out => run_out, wishbone_insn_in => wishbone_icore_in, wishbone_insn_out => wishbone_icore_out, wishbone_data_in => wishbone_dcore_in, From 205c0e2c787590139d8a8d93521029c6e5c9c947 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 Sep 2023 22:15:07 +1000 Subject: [PATCH 06/21] Implement the wait instruction This implements the behaviour of the 'wait 0' instruction of pausing execution of instructions until an exception arises. The exceptions that terminate a wait are a pending trace exception, external interrupt request, PMU interrupt request, or decrementer negative exception. These exception conditions terminate a wait even if not enabled to generate an interrupt (e.g. if MSR[EE] is zero). This is implemented by having execute1 assert its busy_out signal while the wait state exists. The wait state is set by the completion of the wait instruction and cleared by a pending exception. If the WC operand of the wait instruction is non-zero, indicating wait for reservation loss or wait for a short period, then the wait instruction does not wait, but just acts as a no-op. In order to make space in the insn_type_t type without going over 64 elements, this combines OP_DCBT and OP_ICBT into a single OP_XCBT, since they were both no-ops (except for their influence on how SRR1 is set on a trace interrupt, where they were identical). 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 ++- decode1.vhdl | 6 +++--- decode_types.vhdl | 5 +++-- execute1.vhdl | 28 +++++++++++++++++++++++----- scripts/fmt_log/fmt_log.c | 12 ++++++------ 5 files changed, 37 insertions(+), 17 deletions(-) diff --git a/common.vhdl b/common.vhdl index 9f38874..58033d1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -248,6 +248,7 @@ package common is -- This needs to die... type ctrl_t is record + wait_state: std_ulogic; run: std_ulogic; tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); @@ -264,7 +265,7 @@ package common is heir: std_ulogic_vector(63 downto 0); end record; constant ctrl_t_init : ctrl_t := - (run => '1', xer_low => 18x"0", + (wait_state => '0', run => '1', xer_low => 18x"0", fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_fp => '1', others => (others => '0')); diff --git a/decode1.vhdl b/decode1.vhdl index 09f9f77..252a21f 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -128,7 +128,7 @@ architecture behaviour of decode1 is INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbt => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbz => (LDST, 
NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), @@ -197,7 +197,7 @@ architecture behaviour of decode1 is INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), + INSN_icbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), @@ -373,7 +373,7 @@ architecture behaviour of decode1 is INSN_tlbsync => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_tw => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_twi => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', 
'0', NONE, '0', '0', NONE), - INSN_wait => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_wait => (ALU, NONE, OP_WAIT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), INSN_xor => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_xori => (ALU, NONE, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_xoris => (ALU, NONE, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), diff --git a/decode_types.vhdl b/decode_types.vhdl index 5b21fff..8cb732a 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -7,8 +7,8 @@ package decode_types is OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_CNTZ, OP_CROP, - OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_ICBI, OP_ICBT, + OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, + OP_DCBZ, OP_ICBI, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, OP_DIV, OP_DIVE, OP_MOD, OP_EXTS, OP_EXTSWSLI, @@ -24,6 +24,7 @@ package decode_types is OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR, OP_ADDG6S, + OP_WAIT, OP_FETCH_FAILED ); diff --git a/execute1.vhdl b/execute1.vhdl index ed79a3d..b1087ba 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -94,6 +94,7 @@ architecture behaviour of execute1 is write_heir : std_ulogic; set_heir : std_ulogic; write_ctrl : std_ulogic; + enter_wait : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -551,7 +552,7 @@ begin -- N.B. the busy signal from each source includes the -- stage2 stall from that source in it. 
- busy_out <= l_in.busy or ex1.busy or fp_in.busy; + busy_out <= l_in.busy or ex1.busy or fp_in.busy or ctrl.wait_state; valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); @@ -1146,7 +1147,7 @@ begin else illegal := '1'; end if; - when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => + when OP_NOP | OP_DCBF | OP_DCBST | OP_XCBT | OP_DCBTST => -- Do nothing when OP_ADD => if e_in.output_carry = '1' then @@ -1398,6 +1399,11 @@ begin owait := '1'; end if; + when OP_WAIT => + if e_in.insn(22 downto 21) = "00" then + v.se.enter_wait := '1'; + end if; + when OP_FETCH_FAILED => -- Handling an ITLB miss doesn't count as having executed an instruction v.do_trace := '0'; @@ -1513,7 +1519,7 @@ begin variable bypass_valid : std_ulogic; begin v := ex1; - if (ex1.busy or l_in.busy or fp_in.busy) = '0' then + if busy_out = '0' then v.e := actions.e; v.e.valid := '0'; v.oe := e_in.oe; @@ -1577,8 +1583,8 @@ begin v.e.srr1 := (others => '0'); v.e.srr1(47 - 33) := '1'; v.e.srr1(47 - 34) := ex1.prev_prefixed; - if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or - ex1.prev_op = OP_DCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or + ex1.prev_op = OP_XCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or ex1.prev_op = OP_DCBTST then @@ -1802,6 +1808,7 @@ begin variable cr_mask : std_ulogic_vector(7 downto 0); variable sign, zero : std_ulogic; variable rcnz_hi, rcnz_lo : std_ulogic; + variable irq_exc : std_ulogic; begin -- Next insn adder used in a couple of places next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4); @@ -1960,6 +1967,17 @@ begin if ex1.se.write_ctrl = '1' then ctrl_tmp.run <= ex1.e.write_data(0); end if; + if ex1.se.enter_wait = '1' then + ctrl_tmp.wait_state <= '1'; + end if; + end if; + + -- pending exceptions clear any wait 
state + -- ex1.fp_exception_next is not tested because it is not possible to + -- get into wait state with a pending FP exception. + irq_exc := pmu_to_x.intr or ctrl.dec(63) or ext_irq_in; + if ex1.trace_next = '1' or irq_exc = '1' or interrupt_in.intr = '1' then + ctrl_tmp.wait_state <= '0'; end if; if interrupt_in.intr = '1' then diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index e15d42d..226cfbe 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -88,12 +88,12 @@ const char *ops[64] = { "illegal", "nop ", "add ", "attn ", "b ", "bc ", "bcreg ", "bcd ", "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", - "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "icbi ", "icbt ", - "fpcmp ", "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", - "extswsl", "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", - "mfmsr ", "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", - "popcnt ", "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", - "shl ", "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "ffail ", + "darn ", "dcbf ", "dcbst ", "xcbt ", "dcbtst ", "dcbz ", "icbi ", "fpcmp ", + "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", "extswsl", + "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", "mfmsr ", + "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "popcnt ", + "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", + "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "wait ", "ffail ", }; const char *spr_names[13] = From a88fa9c4599e0583b56b08579f61ec63a8db6ca9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 20 Sep 2023 20:38:22 +1000 Subject: [PATCH 07/21] Implement DSCR The DSCR (Data Stream Control Register) is a user-accessible SPR that controls aspects of data prefetching. It has 25 bits of state defined in the ISA. 
This implements the register as 25 read/write bits that do nothing, since we don't have any prefetching. The DSCR is accessible at two SPR numbers, 3 (unprivileged) and 17 (privileged). Access via these SPR numbers is controlled by an FSCR bit and an HFSCR bit. The FSCR bit controls access via SPR 3 in user mode. The HFSCR bit controls access via SPR 3 in user mode and either SPR number in privileged non-hypervisor mode, but since we don't implement privileged non-hypervisor mode, it does essentially the same thing as the FSCR bit. Signed-off-by: Paul Mackerras --- common.vhdl | 16 ++++++++++++---- decode1.vhdl | 4 ++++ decode2.vhdl | 4 ++++ execute1.vhdl | 25 +++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/common.vhdl b/common.vhdl index 58033d1..033b004 100644 --- a/common.vhdl +++ b/common.vhdl @@ -60,6 +60,8 @@ package common is constant SPR_HEIR : spr_num_t := 339; constant SPR_CTRL : spr_num_t := 136; constant SPR_CTRLW : spr_num_t := 152; + constant SPR_UDSCR : spr_num_t := 3; + constant SPR_DSCR : spr_num_t := 17; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -166,13 +168,14 @@ package common is constant SPRSEL_HFSCR : spr_selector := 4x"8"; constant SPRSEL_HEIR : spr_selector := 4x"9"; constant SPRSEL_CTRL : spr_selector := 4x"a"; + constant SPRSEL_DSCR : spr_selector := 4x"b"; constant SPRSEL_XER : spr_selector := 4x"f"; -- FSCR and HFSCR bit numbers constant FSCR_PREFIX : integer := 63 - 50; constant FSCR_SCV : integer := 63 - 51; constant FSCR_TAR : integer := 63 - 55; - constant FSCR_DSCR3 : integer := 63 - 61; + constant FSCR_DSCR : integer := 63 - 61; constant HFSCR_PREFIX : integer := 63 - 50; constant HFSCR_MSG : integer := 63 - 53; constant HFSCR_TAR : integer := 63 - 55; @@ -258,16 +261,20 @@ package common is fscr_ic: std_ulogic_vector(3 downto 0); fscr_pref: std_ulogic; fscr_tar: std_ulogic; + fscr_dscr: std_ulogic; hfscr_ic: std_ulogic_vector(3 downto 0); hfscr_pref: std_ulogic; 
hfscr_tar: std_ulogic; + hfscr_dscr: std_ulogic; hfscr_fp: std_ulogic; heir: std_ulogic_vector(63 downto 0); + dscr: std_ulogic_vector(24 downto 0); end record; constant ctrl_t_init : ctrl_t := (wait_state => '0', run => '1', xer_low => 18x"0", - fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', - hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_fp => '1', + fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', fscr_dscr => '1', + hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_dscr => '1', hfscr_fp => '1', + dscr => (others => '0'), others => (others => '0')); type Fetch1ToIcacheType is record @@ -415,6 +422,7 @@ package common is illegal_suffix : std_ulogic; misaligned_prefix : std_ulogic; uses_tar : std_ulogic; + uses_dscr : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -435,7 +443,7 @@ package common is dbg_spr_access => '0', dec_ctr => '0', prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', - misaligned_prefix => '0', uses_tar => '0', + misaligned_prefix => '0', uses_tar => '0', uses_dscr => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/decode1.vhdl b/decode1.vhdl index 252a21f..7fca54b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -466,6 +466,10 @@ architecture behaviour of decode1 is when SPR_CTRLW => i.sel := SPRSEL_CTRL; i.wonly := '1'; + when SPR_UDSCR => + i.sel := SPRSEL_DSCR; + when SPR_DSCR => + i.sel := SPRSEL_DSCR; when others => i.valid := '0'; end case; diff --git a/decode2.vhdl b/decode2.vhdl index 1c3f324..b27f563 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -454,6 +454,8 @@ begin unit := LDST; when SPR_TAR => v.e.uses_tar := '1'; + when SPR_UDSCR => + v.e.uses_dscr := '1'; when others => end case; if d_in.spr_info.wonly = '1' then @@ -477,6 +479,8 @@ begin end if; when SPR_TAR => v.e.uses_tar := '1'; + when SPR_UDSCR => + v.e.uses_dscr 
:= '1'; when others => end case; if d_in.spr_info.ronly = '1' then diff --git a/execute1.vhdl b/execute1.vhdl index b1087ba..7d714fb 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -94,6 +94,7 @@ architecture behaviour of execute1 is write_heir : std_ulogic; set_heir : std_ulogic; write_ctrl : std_ulogic; + write_dscr : std_ulogic; enter_wait : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -393,6 +394,7 @@ architecture behaviour of execute1 is ret(59 downto 56) := c.fscr_ic; ret(FSCR_PREFIX) := c.fscr_pref; ret(FSCR_TAR) := c.fscr_tar; + ret(FSCR_DSCR) := c.fscr_dscr; return ret; end; @@ -403,6 +405,7 @@ architecture behaviour of execute1 is ret(59 downto 56) := c.hfscr_ic; ret(HFSCR_PREFIX) := c.hfscr_pref; ret(HFSCR_TAR) := c.hfscr_tar; + ret(HFSCR_DSCR) := c.hfscr_dscr; ret(HFSCR_FP) := c.hfscr_fp; return ret; end; @@ -1348,6 +1351,8 @@ begin v.se.write_heir := '1'; when SPRSEL_CTRL => v.se.write_ctrl := '1'; + when SPRSEL_DSCR => + v.se.write_dscr := '1'; when others => end case; end if; @@ -1480,6 +1485,20 @@ begin v.se.write_ic := '1'; end if; + elsif ex1.msr(MSR_PR) = '1' and e_in.uses_dscr = '1' and + (ctrl.hfscr_dscr = '0' or ctrl.fscr_dscr = '0') then + -- [Hypervisor] facility unavailable for DSCR access + v.exception := '1'; + v.ic := x"2"; + if ctrl.hfscr_dscr = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + elsif HAS_FPU and ex1.msr(MSR_PR) = '1' and e_in.fac = FPU and ctrl.hfscr_fp = '0' then -- Hypervisor facility unavailable for FP instructions @@ -1793,6 +1812,7 @@ begin assemble_hfscr(ctrl) when SPRSEL_HFSCR, ctrl.heir when SPRSEL_HEIR, assemble_ctrl(ctrl, ex1.msr(MSR_PR)) when SPRSEL_CTRL, + 39x"0" & ctrl.dscr when SPRSEL_DSCR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; @@ -1942,6 +1962,7 @@ begin ctrl_tmp.hfscr_ic <= ex1.e.write_data(59 
downto 56); ctrl_tmp.hfscr_pref <= ex1.e.write_data(HFSCR_PREFIX); ctrl_tmp.hfscr_tar <= ex1.e.write_data(HFSCR_TAR); + ctrl_tmp.hfscr_dscr <= ex1.e.write_data(HFSCR_DSCR); ctrl_tmp.hfscr_fp <= ex1.e.write_data(HFSCR_FP); elsif ex1.se.write_hic = '1' then ctrl_tmp.hfscr_ic <= ex1.ic; @@ -1950,6 +1971,7 @@ begin ctrl_tmp.fscr_ic <= ex1.e.write_data(59 downto 56); ctrl_tmp.fscr_pref <= ex1.e.write_data(FSCR_PREFIX); ctrl_tmp.fscr_tar <= ex1.e.write_data(FSCR_TAR); + ctrl_tmp.fscr_dscr <= ex1.e.write_data(FSCR_DSCR); elsif ex1.se.write_ic = '1' then ctrl_tmp.fscr_ic <= ex1.ic; end if; @@ -1967,6 +1989,9 @@ begin if ex1.se.write_ctrl = '1' then ctrl_tmp.run <= ex1.e.write_data(0); end if; + if ex1.se.write_dscr = '1' then + ctrl_tmp.dscr <= ex1.e.write_data(24 downto 0); + end if; if ex1.se.enter_wait = '1' then ctrl_tmp.wait_state <= '1'; end if; From d112a7ad94dfd05861401fb66f5592137ba10c08 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 Sep 2023 08:56:31 +1000 Subject: [PATCH 08/21] Implement scv and rfscv The main quirk here is that scv sets LR and CTR instead of SRR0 and SRR1, and likewise rfscv uses LR and CTR. Also, scv uses a set of 128 interrupt vectors starting at 0x17000. Fortunately, the layout of the SPR RAM was already such that LR and CTR were in the even and odd halves respectively at the same index, so reading or writing LR and CTR instead of SRR0 and SRR1 is quite easy. Use of scv is subject to an FSCR bit but not an HFSCR bit. 
Signed-off-by: Paul Mackerras --- common.vhdl | 11 +++++++---- decode1.vhdl | 1 + decode2.vhdl | 6 +++++- decode_types.vhdl | 11 ++++++----- execute1.vhdl | 43 +++++++++++++++++++++++++++++++++---------- fetch1.vhdl | 2 +- predecode.vhdl | 1 + writeback.vhdl | 11 ++++++++++- 8 files changed, 64 insertions(+), 22 deletions(-) diff --git a/common.vhdl b/common.vhdl index 033b004..790d98c 100644 --- a/common.vhdl +++ b/common.vhdl @@ -260,6 +260,7 @@ package common is xer_low: std_ulogic_vector(17 downto 0); fscr_ic: std_ulogic_vector(3 downto 0); fscr_pref: std_ulogic; + fscr_scv: std_ulogic; fscr_tar: std_ulogic; fscr_dscr: std_ulogic; hfscr_ic: std_ulogic_vector(3 downto 0); @@ -272,7 +273,7 @@ package common is end record; constant ctrl_t_init : ctrl_t := (wait_state => '0', run => '1', xer_low => 18x"0", - fscr_ic => x"0", fscr_pref => '1', fscr_tar => '1', fscr_dscr => '1', + fscr_ic => x"0", fscr_pref => '1', fscr_scv => '1', fscr_tar => '1', fscr_dscr => '1', hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_dscr => '1', hfscr_fp => '1', dscr => (others => '0'), others => (others => '0')); @@ -711,6 +712,7 @@ package common is xerc : xer_common_t; interrupt : std_ulogic; hv_intr : std_ulogic; + is_scv : std_ulogic; intr_vec : intr_vector_t; redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); @@ -727,7 +729,7 @@ package common is write_xerc_enable => '0', xerc => xerc_init, write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), - interrupt => '0', hv_intr => '0', intr_vec => 0, + interrupt => '0', hv_intr => '0', is_scv => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', @@ -816,13 +818,13 @@ package common is br_last : std_ulogic; br_taken : std_ulogic; interrupt : std_ulogic; - intr_vec : std_ulogic_vector(11 downto 0); + intr_vec : std_ulogic_vector(16 downto 0); end record; 
constant WritebackToFetch1Init : WritebackToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', mode_32bit => '0', redirect_nia => (others => '0'), br_last => '0', br_taken => '0', br_nia => (others => '0'), - interrupt => '0', intr_vec => x"000"); + interrupt => '0', intr_vec => 17x"0"); type WritebackToRegisterFileType is record write_reg : gspr_index_t; @@ -847,6 +849,7 @@ package common is type WritebackToExecute1Type is record intr : std_ulogic; hv_intr : std_ulogic; + scv_int : std_ulogic; srr1 : std_ulogic_vector(15 downto 0); end record; diff --git a/decode1.vhdl b/decode1.vhdl index 7fca54b..ccfdf9f 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -302,6 +302,7 @@ architecture behaviour of decode1 is INSN_prtyd => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyw => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rfid => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_rfscv => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rldcl => (ALU, NONE, OP_RLCL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_rldcr => (ALU, NONE, OP_RLCR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_rldic => (ALU, NONE, OP_RLC, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), diff --git a/decode2.vhdl b/decode2.vhdl index b27f563..d094681 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -553,7 +553,11 @@ begin v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd; v.e.spr_is_ram := 
d_in.ram_spr.valid; when OP_RFID => - if d_in.insn(9) = '0' then + if d_in.insn(7) = '1' then + -- rfscv + v.e.ramspr_even_rdaddr := RAMSPR_LR; + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + elsif d_in.insn(9) = '0' then -- rfid v.e.ramspr_even_rdaddr := RAMSPR_SRR0; v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; diff --git a/decode_types.vhdl b/decode_types.vhdl index 8cb732a..4f81a36 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -107,6 +107,7 @@ package decode_types is INSN_prtyw, INSN_prtyd, -- 70 INSN_rfid, + INSN_rfscv, INSN_rldic, INSN_rldicl, INSN_rldicr, @@ -114,8 +115,8 @@ package decode_types is INSN_rlwimi, INSN_rlwinm, INSN_rnop, - INSN_sc, - INSN_setb, -- 80 + INSN_sc, -- 80 + INSN_setb, INSN_slbia, INSN_sradi, INSN_srawi, @@ -124,8 +125,8 @@ package decode_types is INSN_stdu, INSN_sthu, INSN_stwu, - INSN_subfic, - INSN_subfme, -- 90 + INSN_subfic, -- 90 + INSN_subfme, INSN_subfze, INSN_sync, INSN_tdi, @@ -135,7 +136,7 @@ package decode_types is INSN_xori, INSN_xoris, -- pad to 104 - INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, + INSN_064, INSN_065, INSN_066, INSN_067, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. 
diff --git a/execute1.vhdl b/execute1.vhdl index 7d714fb..dad8d72 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -96,6 +96,7 @@ architecture behaviour of execute1 is write_ctrl : std_ulogic; write_dscr : std_ulogic; enter_wait : std_ulogic; + scv_trap : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -393,6 +394,7 @@ architecture behaviour of execute1 is ret := (others => '0'); ret(59 downto 56) := c.fscr_ic; ret(FSCR_PREFIX) := c.fscr_pref; + ret(FSCR_SCV) := c.fscr_scv; ret(FSCR_TAR) := c.fscr_tar; ret(FSCR_DSCR) := c.fscr_dscr; return ret; @@ -587,10 +589,12 @@ begin even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr; odd_wr_enab := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr; if interrupt_in.intr = '1' then - if interrupt_in.hv_intr = '0' then - wr_addr := RAMSPR_SRR0; - else + if interrupt_in.hv_intr = '1' then wr_addr := RAMSPR_HSRR0; + elsif interrupt_in.scv_int = '1' then + wr_addr := RAMSPR_LR; + else + wr_addr := RAMSPR_SRR0; end if; else wr_addr := ex1.ramspr_wraddr; @@ -1127,18 +1131,20 @@ begin when OP_ILLEGAL => illegal := '1'; when OP_SC => - -- check bit 1 of the instruction is 1 so we know this is sc; - -- 0 would mean scv, so generate an illegal instruction interrupt + -- check bit 1 of the instruction to distinguish sc from scv if e_in.insn(1) = '1' then - v.trap := '1'; - v.advance_nia := '1'; + -- sc v.e.intr_vec := 16#C00#; if e_in.valid = '1' then report "sc"; end if; else - illegal := '1'; + -- scv + v.se.scv_trap := '1'; + v.e.intr_vec := to_integer(unsigned(e_in.insn(11 downto 5))) * 32; end if; + v.trap := '1'; + v.advance_nia := '1'; when OP_ATTN => -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal @@ -1230,6 +1236,9 @@ begin v.se.set_cfar := v.take_branch; when OP_RFID => + -- rfid, hrfid and rfscv. + -- These all act the same given that we don't have + -- privileged non-hypervisor mode or ultravisor mode. 
srr1 := ramspr_odd; v.e.redir_mode := (srr1(MSR_IR) or srr1(MSR_PR)) & not srr1(MSR_PR) & not srr1(MSR_LE) & not srr1(MSR_SF); @@ -1471,6 +1480,14 @@ begin report "illegal instruction"; end if; + elsif ex1.msr(MSR_PR) = '1' and v.se.scv_trap = '1' and + ctrl.fscr_scv = '0' then + -- Facility unavailable for scv instruction + v.exception := '1'; + v.ic := x"c"; + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + elsif ex1.msr(MSR_PR) = '1' and e_in.uses_tar = '1' and (ctrl.hfscr_tar = '0' or ctrl.fscr_tar = '0') then -- [Hypervisor] facility unavailable for TAR access @@ -1536,6 +1553,7 @@ begin variable fv : Execute1ToFPUType; variable go : std_ulogic; variable bypass_valid : std_ulogic; + variable is_scv : std_ulogic; begin v := ex1; if busy_out = '0' then @@ -1670,6 +1688,7 @@ begin fv.valid := '1'; end if; end if; + is_scv := go and actions.se.scv_trap; if not HAS_FPU and ex1.div_in_progress = '1' then v.div_in_progress := not divider_to_x.valid; @@ -1710,6 +1729,7 @@ begin if (ex1.busy or l_in.busy or fp_in.busy) = '0' then v.e.interrupt := exception; + v.e.is_scv := is_scv; end if; if v.e.valid = '0' then v.e.redirect := '0'; @@ -1970,6 +1990,7 @@ begin if ex1.se.write_fscr = '1' then ctrl_tmp.fscr_ic <= ex1.e.write_data(59 downto 56); ctrl_tmp.fscr_pref <= ex1.e.write_data(FSCR_PREFIX); + ctrl_tmp.fscr_scv <= ex1.e.write_data(FSCR_SCV); ctrl_tmp.fscr_tar <= ex1.e.write_data(FSCR_TAR); ctrl_tmp.fscr_dscr <= ex1.e.write_data(FSCR_DSCR); elsif ex1.se.write_ic = '1' then @@ -2007,7 +2028,6 @@ begin if interrupt_in.intr = '1' then ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; ctrl_tmp.msr(MSR_PR) <= '0'; ctrl_tmp.msr(MSR_SE) <= '0'; ctrl_tmp.msr(MSR_BE) <= '0'; @@ -2016,8 +2036,11 @@ begin ctrl_tmp.msr(MSR_FE1) <= '0'; ctrl_tmp.msr(MSR_IR) <= '0'; ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; ctrl_tmp.msr(MSR_LE) <= '1'; + if interrupt_in.scv_int = '0' then + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + end if; end if; 
bypass_valid := ex1.e.valid; diff --git a/fetch1.vhdl b/fetch1.vhdl index 96c16fb..f07188d 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -391,7 +391,7 @@ begin v_int.next_nia := RESET_ADDRESS; end if; elsif w_in.interrupt = '1' then - v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00"; + v_int.next_nia := 47x"0" & w_in.intr_vec(16 downto 2) & "00"; end if; if rst /= '0' or w_in.interrupt = '1' then v.req := '0'; diff --git a/predecode.vhdl b/predecode.vhdl index 858910c..1846e3c 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -447,6 +447,7 @@ architecture behaviour of predecoder is 2#1_00100_11110# => INSN_isync, 2#1_00000_10000# => INSN_mcrf, 2#1_00000_11010# => INSN_rfid, + 2#1_00010_11010# => INSN_rfscv, 2#1_01000_11010# => INSN_rfid, -- hrfid -- Major opcode 59 diff --git a/writeback.vhdl b/writeback.vhdl index c479c20..d7690a5 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -73,6 +73,8 @@ begin variable srr1 : std_ulogic_vector(15 downto 0); variable intr : std_ulogic; variable hvi : std_ulogic; + variable scv : std_ulogic; + variable intr_page : std_ulogic_vector(4 downto 0); begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; @@ -95,10 +97,16 @@ begin interrupt_out.intr <= intr; srr1 := (others => '0'); + intr_page := 5x"0"; + scv := '0'; if e_in.interrupt = '1' then vec := e_in.intr_vec; srr1 := e_in.srr1; hvi := e_in.hv_intr; + scv := e_in.is_scv; + if e_in.is_scv = '1' then + intr_page := 5x"17"; + end if; elsif l_in.interrupt = '1' then vec := l_in.intr_vec; srr1 := l_in.srr1; @@ -108,6 +116,7 @@ begin end if; interrupt_out.hv_intr <= hvi; interrupt_out.srr1 <= srr1; + interrupt_out.scv_int <= scv; if intr = '0' then if e_in.write_enable = '1' then @@ -165,7 +174,7 @@ begin -- Outputs to fetch1 f.interrupt := intr; - f.intr_vec := std_ulogic_vector(to_unsigned(vec, 12)); + f.intr_vec := intr_page & std_ulogic_vector(to_unsigned(vec, 12)); f.redirect := e_in.redirect; f.redirect_nia := e_in.write_data; f.br_nia := 
e_in.last_nia; From d7d7a3afd4789875ce253369d1fe631f8b1c6192 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 Sep 2023 12:30:13 +1000 Subject: [PATCH 09/21] Implement VRSAVE SPR VRSAVE is a 32-bit software-use SPR accessible in user mode. It is stored in the SPR RAM. The value read from the RAM is trimmed to 32 bits at the ramspr_read process. Signed-off-by: Paul Mackerras --- common.vhdl | 5 +++++ decode1.vhdl | 6 +++++- decode2.vhdl | 1 + execute1.vhdl | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/common.vhdl b/common.vhdl index 790d98c..4f7fced 100644 --- a/common.vhdl +++ b/common.vhdl @@ -62,6 +62,7 @@ package common is constant SPR_CTRLW : spr_num_t := 152; constant SPR_UDSCR : spr_num_t := 3; constant SPR_DSCR : spr_num_t := 17; + constant SPR_VRSAVE : spr_num_t := 256; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -139,10 +140,12 @@ package common is constant RAMSPR_SPRG3 : ramspr_index := to_unsigned(3,3); constant RAMSPR_HSPRG1 : ramspr_index := to_unsigned(4,3); constant RAMSPR_CTR : ramspr_index := to_unsigned(5,3); -- must equal RAMSPR_LR + constant RAMSPR_VRSAVE : ramspr_index := to_unsigned(6,3); type ram_spr_info is record index : ramspr_index; isodd : std_ulogic; + is32b : std_ulogic; valid : std_ulogic; end record; constant ram_spr_info_init: ram_spr_info := (index => to_unsigned(0,3), others => '0'); @@ -416,6 +419,7 @@ package common is ramspr_wraddr : ramspr_index; ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; + ramspr_32bit : std_ulogic; dbg_spr_access : std_ulogic; dec_ctr : std_ulogic; prefixed : std_ulogic; @@ -441,6 +445,7 @@ package common is spr_is_ram => '0', ramspr_even_rdaddr => (others => '0'), ramspr_odd_rdaddr => (others => '0'), ramspr_rd_odd => '0', ramspr_wraddr => (others => '0'), ramspr_write_even => '0', ramspr_write_odd => '0', + ramspr_32bit => '0', dbg_spr_access => '0', dec_ctr => '0', prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', diff 
--git a/decode1.vhdl b/decode1.vhdl index ccfdf9f..75bb9c3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -385,7 +385,7 @@ architecture behaviour of decode1 is function decode_ram_spr(sprn : spr_num_t) return ram_spr_info is variable ret : ram_spr_info; begin - ret := (index => (others => '0'), isodd => '0', valid => '1'); + ret := (index => (others => '0'), isodd => '0', is32b => '0', valid => '1'); case sprn is when SPR_LR => ret.index := RAMSPR_LR; @@ -419,6 +419,10 @@ architecture behaviour of decode1 is when SPR_HSPRG1 => ret.index := RAMSPR_HSPRG1; ret.isodd := '1'; + when SPR_VRSAVE => + ret.index := RAMSPR_VRSAVE; + ret.isodd := '1'; + ret.is32b := '1'; when others => ret.valid := '0'; end case; diff --git a/decode2.vhdl b/decode2.vhdl index d094681..94fb6a7 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -545,6 +545,7 @@ begin v.e.ramspr_even_rdaddr := d_in.ram_spr.index; v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; v.e.ramspr_rd_odd := d_in.ram_spr.isodd; + v.e.ramspr_32bit := d_in.ram_spr.is32b; v.e.spr_is_ram := d_in.ram_spr.valid; sprs_busy := d_in.ram_spr.valid; when OP_MTSPR => diff --git a/execute1.vhdl b/execute1.vhdl index dad8d72..9b55195 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -635,6 +635,9 @@ begin else ramspr_result <= ramspr_odd; end if; + if e_in.ramspr_32bit = '1' then + ramspr_result(63 downto 32) <= 32x"0"; + end if; end process; ramspr_write: process(clk) From fa9df33f7ea750de5d11078b0bc7586dbfac86a4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 Sep 2023 21:58:15 +1000 Subject: [PATCH 10/21] Implement cfuged, pdepd and pextd This implements the cfuged, pdepd and pextd instructions in a new unit called bit_sorter (so called because cfuged and pextd can be viewed as sorting the bits of the mask). The cnt* instructions and the popcnt* instructions now use the same OP_COUNTB insn_type so as to free up an insn_type value to use for the new instructions. 
The new instructions are implemented using a slow and simple algorithm that takes 64 cycles to compute the result. The ex1 stage is stalled while this happens, as for a 64-bit multiply, or for a divide when there is no FPU. Signed-off-by: Paul Mackerras --- Makefile | 2 +- bitsort.vhdl | 102 ++++++++++++++++++++++++++++++++++++++ decode1.vhdl | 17 ++++--- decode2.vhdl | 13 ++--- decode_types.vhdl | 57 +++++++++++---------- execute1.vhdl | 76 +++++++++++++++++++++------- microwatt.core | 1 + predecode.vhdl | 3 ++ scripts/fmt_log/fmt_log.c | 4 +- 9 files changed, 213 insertions(+), 62 deletions(-) create mode 100644 bitsort.vhdl diff --git a/Makefile b/Makefile index fb591a4..01eab73 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \ - core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl + core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl bitsort.vhdl soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \ wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \ diff --git a/bitsort.vhdl b/bitsort.vhdl new file mode 100644 index 0000000..f2aeddb --- /dev/null +++ b/bitsort.vhdl @@ -0,0 +1,102 @@ +-- Implements instructions that involve sorting bits, +-- that is, cfuged, pextd and pdepd. 
+-- +-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right +-- and move the bits in RS in the same fashion to give the result +-- pextd: Like cfuged but only use the bits of RS where the +-- corresponding bit in RB is 1 +-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out +-- to the bit positions which have a 1 in RB + +-- NB opc is bits 7-6 of the instruction: +-- 00 = pdepd, 01 = pextd, 10 = cfuged + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.helpers.all; + +entity bit_sorter is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + rs : in std_ulogic_vector(63 downto 0); + rb : in std_ulogic_vector(63 downto 0); + go : in std_ulogic; + opc : in std_ulogic_vector(1 downto 0); + done : out std_ulogic; + result : out std_ulogic_vector(63 downto 0) + ); +end entity bit_sorter; + +architecture behaviour of bit_sorter is + + signal val : std_ulogic_vector(63 downto 0); + signal st : std_ulogic; + signal sd : std_ulogic; + signal opr : std_ulogic_vector(1 downto 0); + signal bc : unsigned(5 downto 0); + signal jl : unsigned(5 downto 0); + signal jr : unsigned(5 downto 0); + signal sr_ml : std_ulogic_vector(63 downto 0); + signal sr_mr : std_ulogic_vector(63 downto 0); + signal sr_vl : std_ulogic_vector(63 downto 0); + signal sr_vr : std_ulogic_vector(63 downto 0); + +begin + bsort_r: process(clk) + begin + if rising_edge(clk) then + sd <= '0'; + if rst = '1' then + st <= '0'; + opr <= "00"; + val <= (others => '0'); + elsif go = '1' then + st <= '1'; + sr_ml <= rb; + sr_mr <= rb; + sr_vl <= rs; + sr_vr <= rs; + opr <= opc; + val <= (others => '0'); + bc <= to_unsigned(0, 6); + jl <= to_unsigned(63, 6); + jr <= to_unsigned(0, 6); + elsif st = '1' then + if bc = 6x"3f" then + st <= '0'; + sd <= '1'; + end if; + bc <= bc + 1; + if sr_ml(63) = '0' and opr(1) = '1' then + -- cfuged + val(to_integer(jl)) <= sr_vl(63); + jl <= jl - 1; + end if; + if sr_mr(0) = '1' 
then + if opr = "00" then + -- pdepd + val(to_integer(bc)) <= sr_vr(0); + else + -- cfuged or pextd + val(to_integer(jr)) <= sr_vr(0); + end if; + jr <= jr + 1; + end if; + sr_vl <= sr_vl(62 downto 0) & '0'; + if opr /= "00" or sr_mr(0) = '1' then + sr_vr <= '0' & sr_vr(63 downto 1); + end if; + sr_ml <= sr_ml(62 downto 0) & '0'; + sr_mr <= '0' & sr_mr(63 downto 1); + end if; + end if; + end process; + + done <= sd; + result <= val; + +end behaviour; diff --git a/decode1.vhdl b/decode1.vhdl index 75bb9c3..86fb5cf 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -106,6 +106,7 @@ architecture behaviour of decode1 is INSN_brd => (ALU, NONE, OP_BREV, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cbcdtd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cdtbcd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_cfuged => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmp => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_cmpb => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpeqb => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -113,10 +114,10 @@ architecture behaviour of decode1 is INSN_cmpl => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpli => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmprb => 
(ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_cntlzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cntlzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), - INSN_cnttzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cnttzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cntlzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cntlzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cnttzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cnttzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_crand => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crandc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_creqv => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -281,6 +282,8 @@ architecture behaviour of decode1 is INSN_ori => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_oris => (ALU, NONE, OP_LOGIC, 
NONE, CONST_UI_HI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_paddi => (ALU, NONE, OP_ADD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pdepd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pextd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plbz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pld => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plfd => (LDST, FPU, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -296,9 +299,9 @@ architecture behaviour of decode1 is INSN_pstfs => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_psth => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pstw => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntb => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntd => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntw => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', 
'0', NONE, '0', '0', NONE), + INSN_popcntb => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyd => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyw => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rfid => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), diff --git a/decode2.vhdl b/decode2.vhdl index 94fb6a7..a747495 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -232,12 +232,13 @@ architecture behaviour of decode2 is ); constant subresult_select : mux_select_array_t := ( - OP_MUL_L64 => "000", -- muldiv_result - OP_MUL_H64 => "001", - OP_MUL_H32 => "010", - OP_DIV => "011", - OP_DIVE => "011", - OP_MOD => "011", + OP_MUL_L64 => "000", -- multicyc_result + OP_MUL_H64 => "010", + OP_MUL_H32 => "001", + OP_DIV => "101", + OP_DIVE => "101", + OP_MOD => "101", + OP_BSORT => "100", OP_ADDG6S => "001", -- misc_result OP_ISEL => "010", OP_DARN => "011", diff --git a/decode_types.vhdl b/decode_types.vhdl index 4f81a36..dc104cd 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -6,7 +6,7 @@ package decode_types is OP_ATTN, OP_B, OP_BC, OP_BCREG, OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, - OP_CNTZ, OP_CROP, + OP_COUNTB, OP_CROP, OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, OP_DCBZ, OP_ICBI, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, @@ -18,7 +18,8 @@ package decode_types is OP_MCRXRX, 
OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, - OP_POPCNT, OP_PRTY, OP_RFID, + OP_BSORT, + OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TLBIE, OP_TRAP, @@ -179,11 +180,12 @@ package decode_types is INSN_and, INSN_andc, INSN_bperm, + INSN_cfuged, INSN_cmp, INSN_cmpb, INSN_cmpeqb, - INSN_cmpl, - INSN_cmprb, -- 140 + INSN_cmpl, -- 140 + INSN_cmprb, INSN_dcbf, INSN_dcbst, INSN_dcbt, @@ -192,8 +194,8 @@ package decode_types is INSN_divd, INSN_divdu, INSN_divde, - INSN_divdeu, - INSN_divw, -- 150 + INSN_divdeu, -- 150 + INSN_divw, INSN_divwu, INSN_divwe, INSN_divweu, @@ -202,8 +204,8 @@ package decode_types is INSN_icbt, INSN_isel, INSN_lbarx, - INSN_lbzcix, - INSN_lbzux, -- 160 + INSN_lbzcix, -- 160 + INSN_lbzux, INSN_lbzx, INSN_ldarx, INSN_ldbrx, @@ -212,8 +214,8 @@ package decode_types is INSN_ldux, INSN_lharx, INSN_lhax, - INSN_lhaux, - INSN_lhbrx, -- 170 + INSN_lhaux, -- 170 + INSN_lhbrx, INSN_lhzcix, INSN_lhzx, INSN_lhzux, @@ -222,8 +224,8 @@ package decode_types is INSN_lwaux, INSN_lwbrx, INSN_lwzcix, - INSN_lwzx, - INSN_lwzux, -- 180 + INSN_lwzx, -- 180 + INSN_lwzux, INSN_modsd, INSN_modsw, INSN_moduw, @@ -232,51 +234,54 @@ package decode_types is INSN_mulhwu, INSN_mulhd, INSN_mulhdu, - INSN_mullw, - INSN_mulld, -- 190 + INSN_mullw, -- 190 + INSN_mulld, INSN_nand, INSN_nor, INSN_or, INSN_orc, + INSN_pdepd, + INSN_pextd, INSN_rldcl, INSN_rldcr, - INSN_rlwnm, + INSN_rlwnm, -- 200 INSN_slw, INSN_sld, - INSN_sraw, -- 200 + INSN_sraw, INSN_srad, INSN_srw, INSN_srd, INSN_stbcix, INSN_stbcx, INSN_stbx, - INSN_stbux, + INSN_stbux, -- 210 INSN_stdbrx, INSN_stdcix, - INSN_stdcx, -- 210 + INSN_stdcx, INSN_stdx, INSN_stdux, INSN_sthbrx, INSN_sthcix, INSN_sthcx, INSN_sthx, - INSN_sthux, + INSN_sthux, -- 220 INSN_stwbrx, INSN_stwcix, - INSN_stwcx, -- 220 + INSN_stwcx, INSN_stwx, INSN_stwux, INSN_subf, INSN_subfc, INSN_subfe, INSN_td, - INSN_tlbie, + INSN_tlbie, -- 230 INSN_tlbiel, 
INSN_tw, - INSN_xor, -- 230 + INSN_xor, - -- pad to 232 to simplify comparison logic - INSN_231, + -- pad to 240 to simplify comparison logic + INSN_234, INSN_235, + INSN_236, INSN_237, INSN_238, INSN_239, -- The following instructions have a third input addressed by RC INSN_maddld, @@ -284,9 +289,7 @@ package decode_types is INSN_maddhdu, -- pad to 256 to simplify comparison logic - INSN_235, - INSN_236, INSN_237, INSN_238, INSN_239, - INSN_240, INSN_241, INSN_242, INSN_243, + INSN_243, INSN_244, INSN_245, INSN_246, INSN_247, INSN_248, INSN_249, INSN_250, INSN_251, INSN_252, INSN_253, INSN_254, INSN_255, diff --git a/execute1.vhdl b/execute1.vhdl index 9b55195..2cc9c35 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -113,6 +113,7 @@ architecture behaviour of execute1 is direct_branch : std_ulogic; start_mul : std_ulogic; start_div : std_ulogic; + start_bsort : std_ulogic; do_trace : std_ulogic; fp_intr : std_ulogic; res2_sel : std_ulogic_vector(1 downto 0); @@ -134,7 +135,7 @@ architecture behaviour of execute1 is prev_op : insn_type_t; prev_prefixed : std_ulogic; oe : std_ulogic; - mul_select : std_ulogic_vector(1 downto 0); + mul_select : std_ulogic_vector(2 downto 0); res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); @@ -144,6 +145,7 @@ architecture behaviour of execute1 is mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; + bsort_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; @@ -164,10 +166,11 @@ architecture behaviour of execute1 is busy => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, prev_prefixed => '0', - oe => '0', mul_select => "00", res2_sel => "00", + oe => '0', mul_select => "000", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', 
div_in_progress => '0', + bsort_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", @@ -209,7 +212,8 @@ architecture behaviour of execute1 is signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); - signal muldiv_result: std_ulogic_vector(63 downto 0); + signal multicyc_result: std_ulogic_vector(63 downto 0); + signal bsort_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); signal s1_sel : std_ulogic_vector(2 downto 0); @@ -234,6 +238,10 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init; + -- bit-sort unit signals + signal bsort_start : std_ulogic; + signal bsort_done : std_ulogic; + -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); signal random_cond : std_ulogic_vector(63 downto 0); @@ -493,6 +501,18 @@ begin ); end generate; + bsort_0: entity work.bit_sorter + port map ( + clk => clk, + rst => rst, + rs => c_in, + rb => b_in, + go => bsort_start, + opc => e_in.insn(7 downto 6), + done => bsort_done, + result => bsort_result + ); + random_0: entity work.random port map ( clk => clk, @@ -664,7 +684,7 @@ begin adder_result when "000", logical_result when "001", rotator_result when "010", - muldiv_result when "100", + multicyc_result when "100", ramspr_result when "101", misc_result when others; @@ -845,17 +865,21 @@ begin x_to_mult_32s.subtract <= '0'; x_to_mult_32s.addend <= (others => '0'); - case ex1.mul_select is - when "00" => - muldiv_result <= multiply_to_x.result(63 downto 0); - when "01" => - muldiv_result <= multiply_to_x.result(127 downto 64); - when "10" => - muldiv_result <= multiply_to_x.result(63 downto 32) & - 
multiply_to_x.result(63 downto 32); - when others => - muldiv_result <= divider_to_x.write_reg_data; - end case; + if ex1.mul_select(2) = '0' then + case ex1.mul_select(1 downto 0) is + when "00" => + multicyc_result <= multiply_to_x.result(63 downto 0); + when "01" => + multicyc_result <= multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when others => + multicyc_result <= multiply_to_x.result(127 downto 64); + end case; + elsif ex1.mul_select(0) = '1' and not HAS_FPU then + multicyc_result <= divider_to_x.write_reg_data; + else + multicyc_result <= bsort_result; + end if; -- Compute misc_result case e_in.sub_select is @@ -1266,7 +1290,7 @@ begin end if; v.do_trace := '0'; - when OP_CNTZ | OP_POPCNT => + when OP_COUNTB => v.res2_sel := "01"; slow_op := '1'; when OP_ISEL => @@ -1388,6 +1412,11 @@ begin when OP_ICBI => v.se.icache_inval := '1'; + when OP_BSORT => + v.start_bsort := '1'; + slow_op := '1'; + owait := '1'; + when OP_MUL_L64 => if e_in.is_32bit = '1' then v.se.mult_32s := '1'; @@ -1565,7 +1594,7 @@ begin v.oe := e_in.oe; v.spr_select := e_in.spr_select; v.pmu_spr_num := e_in.insn(20 downto 16); - v.mul_select := e_in.sub_select(1 downto 0); + v.mul_select := e_in.sub_select; v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; v.lr_from_next := e_in.lr; @@ -1596,7 +1625,7 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0'; if valid_in = '1' then v.prev_op := e_in.insn_type; @@ -1671,6 +1700,7 @@ begin v.mul_in_progress := actions.start_mul; x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; + v.bsort_in_progress := actions.start_bsort; v.br_mispredict := v.e.redirect and actions.direct_branch; v.advance_nia := 
actions.advance_nia; v.redir_to_next := actions.redir_to_next; @@ -1681,7 +1711,7 @@ begin -- multiply is happening in order to stop following -- instructions from using the wrong XER value -- (and for simplicity in the OE=0 case). - v.busy := actions.start_div or actions.start_mul; + v.busy := actions.start_div or actions.start_mul or actions.start_bsort; -- instruction for other units, i.e. LDST if e_in.unit = LDST then @@ -1692,6 +1722,7 @@ begin end if; end if; is_scv := go and actions.se.scv_trap; + bsort_start <= go and actions.start_bsort; if not HAS_FPU and ex1.div_in_progress = '1' then v.div_in_progress := not divider_to_x.valid; @@ -1724,6 +1755,13 @@ begin end if; v.e.valid := '1'; end if; + if ex1.bsort_in_progress = '1' then + v.bsort_in_progress := not bsort_done; + v.e.valid := bsort_done; + v.busy := not bsort_done; + v.e.write_data := alu_result; + bypass_valid := bsort_done; + end if; if v.e.write_xerc_enable = '1' and v.e.valid = '1' then v.xerc := v.e.xerc; diff --git a/microwatt.core b/microwatt.core index dad180f..f56bee0 100644 --- a/microwatt.core +++ b/microwatt.core @@ -20,6 +20,7 @@ filesets: - sim_console.vhdl - logical.vhdl - countbits.vhdl + - bitsort.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl diff --git a/predecode.vhdl b/predecode.vhdl index 1846e3c..65cb751 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -219,6 +219,7 @@ architecture behaviour of predecoder is 2#0_00101_11011# => INSN_brd, 2#0_01001_11010# => INSN_cbcdtd, 2#0_01000_11010# => INSN_cdtbcd, + 2#0_00110_11100# => INSN_cfuged, 2#0_00000_00000# => INSN_cmp, 2#0_01111_11100# => INSN_cmpb, 2#0_00111_00000# => INSN_cmpeqb, @@ -363,6 +364,8 @@ architecture behaviour of predecoder is 2#0_00011_11100# => INSN_nor, 2#0_01101_11100# => INSN_or, 2#0_01100_11100# => INSN_orc, + 2#0_00100_11100# => INSN_pdepd, + 2#0_00101_11100# => INSN_pextd, 2#0_00011_11010# => INSN_popcntb, 2#0_01111_11010# => INSN_popcntd, 2#0_01011_11010# => INSN_popcntw, diff --git 
a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 226cfbe..aa0573a 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -87,11 +87,11 @@ const char *units[4] = { "al", "ls", "fp", "3?" }; const char *ops[64] = { "illegal", "nop ", "add ", "attn ", "b ", "bc ", "bcreg ", "bcd ", - "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", + "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "countb ", "crop ", "darn ", "dcbf ", "dcbst ", "xcbt ", "dcbtst ", "dcbz ", "icbi ", "fpcmp ", "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", "extswsl", "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", "mfmsr ", - "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "popcnt ", + "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "bsort ", "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "wait ", "ffail ", }; From d358981d4340ce1f3f4008e43852a9da5e1be217 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Dec 2024 22:07:07 +1100 Subject: [PATCH 11/21] Generate doubled instructions in decode1 rather than decode2 This will allow us to read different source registers for the two pieces, which will be needed for instructions like stq. 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 +- decode1.vhdl | 78 ++++++++++++++++++++++++++++++++-------------------- decode2.vhdl | 27 +++++++++++------- 3 files changed, 67 insertions(+), 41 deletions(-) diff --git a/common.vhdl b/common.vhdl index 4f7fced..7c79ccf 100644 --- a/common.vhdl +++ b/common.vhdl @@ -318,6 +318,7 @@ package common is type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; + second : std_ulogic; nia: std_ulogic_vector(63 downto 0); prefixed: std_ulogic; prefix: std_ulogic_vector(25 downto 0); @@ -334,7 +335,7 @@ package common is reg_c : gspr_index_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := - (valid => '0', stop_mark => '0', nia => (others => '0'), + (valid => '0', stop_mark => '0', second => '0', nia => (others => '0'), prefixed => '0', prefix => (others => '0'), insn => (others => '0'), illegal_suffix => '0', misaligned_prefix => '0', decode => decode_rom_init, br_pred => '0', big_endian => '0', diff --git a/decode1.vhdl b/decode1.vhdl index 86fb5cf..ebc5993 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -44,6 +44,8 @@ architecture behaviour of decode1 is signal decode_rom_addr : insn_code; signal decode : decode_rom_t; + signal double : std_ulogic; + type prefix_state_t is record prefixed : std_ulogic; prefix : std_ulogic_vector(25 downto 0); @@ -485,6 +487,8 @@ architecture behaviour of decode1 is end; begin + double <= not r.second when (r.valid = '1' and decode.repeat /= NONE) else '0'; + decode1_0: process(clk) begin if rising_edge(clk) then @@ -497,10 +501,14 @@ begin fetch_failed <= '0'; pr <= prefix_state_init; elsif stall_in = '0' then - r <= rin; - fetch_failed <= f_in.fetch_failed; - if f_in.valid = '1' then - pr <= pr_in; + if double = '0' then + r <= rin; + fetch_failed <= f_in.fetch_failed; + if f_in.valid = '1' then + pr <= pr_in; + end if; + else + r.second <= '1'; end if; end if; if rst = '1' then @@ -511,12 +519,12 @@ begin end if; end process; - busy_out <= 
stall_in; + busy_out <= stall_in or double; decode1_rom: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then + if stall_in = '0' and double = '0' then decode <= decode_rom(decode_rom_addr); end if; end if; @@ -646,33 +654,43 @@ begin -- Work out GPR/FPR read addresses -- Note that for prefixed instructions we are working this out based -- only on the suffix. - maybe_rb := '0'; - vr.reg_1_addr := '0' & insn_ra(f_in.insn); - vr.reg_2_addr := '0' & insn_rb(f_in.insn); - vr.reg_3_addr := '0' & insn_rs(f_in.insn); - if icode >= INSN_first_rb then - maybe_rb := '1'; - if icode < INSN_first_frs then - if icode >= INSN_first_rc then - vr.reg_3_addr := '0' & insn_rcreg(f_in.insn); - end if; - else - -- access FRS operand - vr.reg_3_addr(5) := '1'; - if icode >= INSN_first_frab then - -- access FRA and/or FRB operands - vr.reg_1_addr(5) := '1'; - vr.reg_2_addr(5) := '1'; - end if; - if icode >= INSN_first_frabc then - -- access FRC operand - vr.reg_3_addr := '1' & insn_rcreg(f_in.insn); + if double = '0' then + maybe_rb := '0'; + vr.reg_1_addr := '0' & insn_ra(f_in.insn); + vr.reg_2_addr := '0' & insn_rb(f_in.insn); + vr.reg_3_addr := '0' & insn_rs(f_in.insn); + if icode >= INSN_first_rb then + maybe_rb := '1'; + if icode < INSN_first_frs then + if icode >= INSN_first_rc then + vr.reg_3_addr := '0' & insn_rcreg(f_in.insn); + end if; + else + -- access FRS operand + vr.reg_3_addr(5) := '1'; + if icode >= INSN_first_frab then + -- access FRA and/or FRB operands + vr.reg_1_addr(5) := '1'; + vr.reg_2_addr(5) := '1'; + end if; + if icode >= INSN_first_frabc then + -- access FRC operand + vr.reg_3_addr := '1' & insn_rcreg(f_in.insn); + end if; end if; end if; + vr.read_1_enable := f_in.valid; + vr.read_2_enable := f_in.valid and maybe_rb; + vr.read_3_enable := f_in.valid; + else + -- second instance of a doubled instruction + vr.reg_1_addr := r.reg_a; + vr.reg_2_addr := r.reg_b; + vr.reg_3_addr := r.reg_c; + vr.read_1_enable := '0'; -- (not actually used) + 
vr.read_2_enable := '0'; + vr.read_3_enable := '1'; -- (not actually used) end if; - vr.read_1_enable := f_in.valid; - vr.read_2_enable := f_in.valid and maybe_rb; - vr.read_3_enable := f_in.valid; v.reg_a := vr.reg_1_addr; v.reg_b := vr.reg_2_addr; diff --git a/decode2.vhdl b/decode2.vhdl index a747495..4a020da 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -377,6 +377,21 @@ begin dec_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.prefix); dec_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); dec_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn); + case d_in.decode.repeat is + when DUPD => + if d_in.second = '1' then + -- update-form loads, 2nd instruction writes RA + dec_o.reg := dec_a.reg; + end if; + when others => + end case; + -- For the second instance of a doubled instruction, we ignore the RA + -- and RB operands, in order to avoid false dependencies on the output + -- of the first instance. + if d_in.second = '1' then + dec_a.reg_valid := '0'; + dec_b.reg_valid := '0'; + end if; if d_in.valid = '0' or d_in.illegal_suffix = '1' then dec_a.reg_valid := '0'; dec_b.reg_valid := '0'; @@ -512,10 +527,10 @@ begin end if; v.e.dec_ctr := decctr; - v.repeat := d_in.decode.repeat; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; end if; + v.e.second := d_in.second; if decctr = '1' then -- read and write CTR @@ -627,14 +642,6 @@ begin v.e.prefix := d_in.prefix; v.e.illegal_suffix := d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; - - elsif dc2.e.valid = '1' then - -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction. 
- -- Set up for the second iteration (if deferred = 1 this will all be ignored) - v.e.second := '1'; - -- DUPD is the only possibility here: - -- update-form loads, 2nd instruction writes RA - v.e.write_reg := dc2.e.read_reg1; end if; -- issue control @@ -723,7 +730,7 @@ begin v.e.valid := control_valid_out; v.e.instr_tag := instr_tag; - v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second)); + v.busy := valid_in and not control_valid_out; stall_out <= dc2.busy or deferred; From 722f239c025e55bb45e477ca70f8f6500d7801b8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Dec 2024 22:09:51 +1100 Subject: [PATCH 12/21] Reimplement quadword loads and stores This adds implementations of lq, plq, stq, pstq, lqarx and stqcx. Because register file addresses are now computed in decode1 before we have the decode table entry for the instruction, we have to check the icode directly to know when to read register RS|1 before RS (i.e. for stq and stqcx in LE mode, but not pstq). For the second instance of the instruction, loadstore1 uses the EA from the first instance + 8. It generates an alignment interrupt for unaligned lqarx and stqcx and for lq in LE mode with an unaligned address. (The reason for the latter case is that it writes RT|1 before RT, and if we have RA = RT|1 and the second instance traps, we will have overwritten RA.) 
Signed-off-by: Paul Mackerras --- common.vhdl | 5 +++- dcache.vhdl | 4 ++-- decode1.vhdl | 19 +++++++++++++++ decode2.vhdl | 25 +++++++++++++++++++- decode_types.vhdl | 60 ++++++++++++++++++++++++++++------------------- execute1.vhdl | 5 ++-- loadstore1.vhdl | 36 +++++++++++++++++++++++++--- predecode.vhdl | 15 +++++++++++- 8 files changed, 135 insertions(+), 34 deletions(-) diff --git a/common.vhdl b/common.vhdl index 7c79ccf..3af1d7b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -427,6 +427,7 @@ package common is prefix : std_ulogic_vector(25 downto 0); illegal_suffix : std_ulogic; misaligned_prefix : std_ulogic; + illegal_form : std_ulogic; uses_tar : std_ulogic; uses_dscr : std_ulogic; end record; @@ -450,7 +451,7 @@ package common is dbg_spr_access => '0', dec_ctr => '0', prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', - misaligned_prefix => '0', uses_tar => '0', uses_dscr => '0', + misaligned_prefix => '0', illegal_form => '0', uses_tar => '0', uses_dscr => '0', others => (others => '0')); type MultiplyInputType is record @@ -604,6 +605,8 @@ package common is dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + atomic_qw : std_ulogic; -- part of a quadword atomic op + atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); diff --git a/dcache.vhdl b/dcache.vhdl index c9541e5..807a2dc 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1112,10 +1112,10 @@ begin -- XXX or if r0.req.nc = '1' if r0.req.load = '1' then -- load with reservation - set_rsrv <= '1'; + set_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; else -- store conditional - clear_rsrv <= '1'; + clear_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; if reservation.valid = '0' or r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; diff --git a/decode1.vhdl b/decode1.vhdl index ebc5993..643523b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -237,6 +237,8 @@ architecture behaviour of 
decode1 is INSN_lhzu => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), INSN_lhzux => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), INSN_lhzx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_lq => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTP), + INSN_lqarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTP), INSN_lwa => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lwarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), INSN_lwaux => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), @@ -292,6 +294,7 @@ architecture behaviour of decode1 is INSN_plfs => (LDST, FPU, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_plha => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plhz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_plq => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTP), INSN_plwa => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, 
'0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plwz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pnop => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -300,6 +303,7 @@ architecture behaviour of decode1 is INSN_pstfd => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pstfs => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_psth => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pstq => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSP), INSN_pstw => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_popcntb => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_popcntd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -358,6 +362,8 @@ architecture behaviour of decode1 is INSN_sthu => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), INSN_sthux => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), INSN_sthx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', 
ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_stq => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSP), + INSN_stqcx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSP), INSN_stw => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_stwbrx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_stwcix => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '1', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -509,6 +515,7 @@ begin end if; else r.second <= '1'; + r.reg_c <= rin.reg_c; end if; end if; if rst = '1' then @@ -679,6 +686,12 @@ begin end if; end if; end if; + -- See if this is an instruction where repeat_t = DRSP and we need + -- to read RS|1 followed by RS, i.e. stq or stqcx. in LE mode + -- (note we don't have access to the decode for the current instruction) + if (icode = INSN_stq or icode = INSN_stqcx) and f_in.big_endian = '0' then + vr.reg_3_addr(0) := '1'; + end if; vr.read_1_enable := f_in.valid; vr.read_2_enable := f_in.valid and maybe_rb; vr.read_3_enable := f_in.valid; @@ -690,6 +703,12 @@ begin vr.read_1_enable := '0'; -- (not actually used) vr.read_2_enable := '0'; vr.read_3_enable := '1'; -- (not actually used) + -- For pstq, and for stq and stqcx in BE mode, + -- we need to read register RS|1 in the cycle after we read RS; + -- stq and stqcx in LE mode read RS. 
+ if decode.repeat = DRSP then + vr.reg_3_addr(0) := r.prefixed or f_in.big_endian; + end if; end if; v.reg_a := vr.reg_1_addr; diff --git a/decode2.vhdl b/decode2.vhdl index 4a020da..7e993d5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -348,7 +348,8 @@ begin elsif deferred = '0' then if dc2in.e.valid = '1' then report "execute " & to_hstring(dc2in.e.nia) & - " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); + " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid) & + " rpt=" & std_ulogic'image(dc2in.e.repeat) & " 2nd=" & std_ulogic'image(dc2in.e.second) & " wr=" & to_hstring(dc2in.e.write_reg); end if; dc2 <= dc2in; elsif dc2.read_rspr = '0' then @@ -383,6 +384,16 @@ begin -- update-form loads, 2nd instruction writes RA dec_o.reg := dec_a.reg; end if; + when DRSP => + -- non-prefixed stq, stqcx do RS|1, RS in LE mode; others do RS, RS|1 + if d_in.second = (d_in.big_endian or d_in.prefixed) then + dec_c.reg(0) := '1'; -- do RS, RS|1 + end if; + when DRTP => + -- non-prefixed lq, lqarx do RT|1, RT in LE mode; others do RT, RT|1 + if d_in.second = (d_in.big_endian or d_in.prefixed) then + dec_o.reg(0) := '1'; + end if; when others => end case; -- For the second instance of a doubled instruction, we ignore the RA @@ -642,6 +653,18 @@ begin v.e.prefix := d_in.prefix; v.e.illegal_suffix := d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; + + -- check for invalid forms that cause an illegal instruction interrupt + -- Does RA = RT for a load quadword instr, or RB = RT for lqarx? + if d_in.decode.repeat = DRTP and + (insn_ra(d_in.insn) = insn_rt(d_in.insn) or + (d_in.decode.reserve = '1' and insn_rb(d_in.insn) = insn_rt(d_in.insn))) then + v.e.illegal_form := '1'; + end if; + -- Is RS/RT odd for a load/store quadword instruction? 
+ if (d_in.decode.repeat = DRSP or d_in.decode.repeat = DRTP) and d_in.insn(21) = '1' then + v.e.illegal_form := '1'; + end if; end if; -- issue control diff --git a/decode_types.vhdl b/decode_types.vhdl index dc104cd..03e958b 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -125,8 +125,9 @@ package decode_types is INSN_std, INSN_stdu, INSN_sthu, - INSN_stwu, - INSN_subfic, -- 90 + INSN_stq, + INSN_stwu, -- 90 + INSN_subfic, INSN_subfme, INSN_subfze, INSN_sync, @@ -135,23 +136,23 @@ package decode_types is INSN_twi, INSN_wait, INSN_xori, - INSN_xoris, - -- pad to 104 - INSN_064, INSN_065, INSN_066, INSN_067, + INSN_xoris, -- 100 + -- pad to 102 + INSN_065, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. -- The non-prefixed versions have even indexes so that we can -- convert them to the prefixed version by setting bit 0 - INSN_addi, -- 104 + INSN_addi, -- 102 INSN_paddi, INSN_lbz, INSN_plbz, INSN_lha, INSN_plha, - INSN_lhz, -- 110 + INSN_lhz, INSN_plhz, - INSN_lwz, + INSN_lwz, -- 110 INSN_plwz, INSN_stb, INSN_pstb, @@ -161,15 +162,18 @@ package decode_types is INSN_pstw, -- Slots for non-prefixed opcodes that are 8LS:D when prefixed - INSN_lhzu, -- 120 + INSN_lhzu, INSN_plwa, + INSN_lq, -- 120 + INSN_plq, INSN_op57, INSN_pld, + INSN_op60, + INSN_pstq, INSN_op61, INSN_pstd, -- pad to 128 to simplify comparison logic - INSN_07e, INSN_07f, -- The following instructions have an RB operand but don't access FPRs INSN_add, @@ -219,12 +223,13 @@ package decode_types is INSN_lhzcix, INSN_lhzx, INSN_lhzux, + INSN_lqarx, INSN_lwarx, INSN_lwax, INSN_lwaux, INSN_lwbrx, - INSN_lwzcix, - INSN_lwzx, -- 180 + INSN_lwzcix, -- 180 + INSN_lwzx, INSN_lwzux, INSN_modsd, INSN_modsw, @@ -233,8 +238,8 @@ package decode_types is INSN_mulhw, INSN_mulhwu, INSN_mulhd, - INSN_mulhdu, - INSN_mullw, -- 190 + INSN_mulhdu, -- 190 + INSN_mullw, INSN_mulld, INSN_nand, INSN_nor, @@ -243,8 +248,8 @@ package decode_types is 
INSN_pdepd, INSN_pextd, INSN_rldcl, - INSN_rldcr, - INSN_rlwnm, -- 200 + INSN_rldcr, -- 200 + INSN_rlwnm, INSN_slw, INSN_sld, INSN_sraw, @@ -253,8 +258,8 @@ package decode_types is INSN_srd, INSN_stbcix, INSN_stbcx, - INSN_stbx, - INSN_stbux, -- 210 + INSN_stbx, -- 210 + INSN_stbux, INSN_stdbrx, INSN_stdcix, INSN_stdcx, @@ -263,8 +268,9 @@ package decode_types is INSN_sthbrx, INSN_sthcix, INSN_sthcx, - INSN_sthx, - INSN_sthux, -- 220 + INSN_sthx, -- 220 + INSN_sthux, + INSN_stqcx, INSN_stwbrx, INSN_stwcix, INSN_stwcx, @@ -272,15 +278,14 @@ package decode_types is INSN_stwux, INSN_subf, INSN_subfc, - INSN_subfe, + INSN_subfe, -- 230 INSN_td, - INSN_tlbie, -- 230 + INSN_tlbie, INSN_tlbiel, INSN_tw, INSN_xor, -- pad to 240 to simplify comparison logic - INSN_234, INSN_235, INSN_236, INSN_237, INSN_238, INSN_239, -- The following instructions have a third input addressed by RC @@ -439,7 +444,9 @@ package decode_types is type length_t is (NONE, is1B, is2B, is4B, is8B); type repeat_t is (NONE, -- instruction is not repeated - DUPD); -- update-form load + DUPD, -- update-form load + DRSP, -- double RS (RS, RS+1) + DRTP); -- double RT (RT, RT+1, or RT+1, RT) type decode_rom_t is record unit : unit_t; @@ -523,6 +530,7 @@ package body decode_types is when INSN_lhau => return "101011"; when INSN_lhz => return "101000"; when INSN_lhzu => return "101001"; + when INSN_lq => return "111000"; when INSN_lwz => return "100000"; when INSN_lwzu => return "100001"; when INSN_mulli => return "000111"; @@ -542,6 +550,7 @@ package body decode_types is when INSN_sth => return "101100"; when INSN_sthu => return "101101"; when INSN_stw => return "100100"; + when INSN_stq => return "111110"; when INSN_stwu => return "100101"; when INSN_subfic => return "001000"; when INSN_tdi => return "000010"; @@ -587,6 +596,7 @@ package body decode_types is when INSN_fnmadd => return "111111"; when INSN_prefix => return "000001"; when INSN_op57 => return "111001"; + when INSN_op60 => return "111100"; when 
INSN_op61 => return "111101"; when INSN_add => return "011111"; when INSN_addc => return "011111"; @@ -654,6 +664,7 @@ package body decode_types is when INSN_lhzcix => return "011111"; when INSN_lhzux => return "011111"; when INSN_lhzx => return "011111"; + when INSN_lqarx => return "011111"; when INSN_lwarx => return "011111"; when INSN_lwaux => return "011111"; when INSN_lwax => return "011111"; @@ -719,6 +730,7 @@ package body decode_types is when INSN_sthcx => return "011111"; when INSN_sthux => return "011111"; when INSN_sthx => return "011111"; + when INSN_stqcx => return "011111"; when INSN_stwbrx => return "011111"; when INSN_stwcix => return "011111"; when INSN_stwcx => return "011111"; diff --git a/execute1.vhdl b/execute1.vhdl index 2cc9c35..ecb1e63 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -704,7 +704,8 @@ begin if valid_in = '1' then report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & - " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid); + " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid) & + " 2nd=" & std_ulogic'image(e_in.second); end if; -- We mustn't get stalled on a cycle where execute2 is -- completing an instruction or generating an interrupt @@ -1147,7 +1148,7 @@ begin slow_op := '0'; owait := '0'; - if e_in.illegal_suffix = '1' then + if e_in.illegal_suffix = '1' or e_in.illegal_form = '1' then illegal := '1'; elsif ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then privileged := '1'; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index fc8c158..dcacc75 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -84,6 +84,8 @@ architecture behave of loadstore1 is update : std_ulogic; xerc : xer_common_t; reserve : std_ulogic; + atomic_qw : std_ulogic; + atomic_last : std_ulogic; rc : std_ulogic; nc : 
std_ulogic; -- non-cacheable access virt_mode : std_ulogic; @@ -108,6 +110,7 @@ architecture behave of loadstore1 is elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', + atomic_qw => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', @@ -447,7 +450,10 @@ begin if l_in.second = '1' then -- for an update-form load, use the previous address -- as the value to write back to RA. - addr := r1.addr0; + -- for a quadword load or store, use with the previous + -- address + 8. + addr := std_ulogic_vector(unsigned(r1.addr0(63 downto 3)) + not l_in.update) & + r1.addr0(2 downto 0); end if; if l_in.mode_32bit = '1' then addr(63 downto 32) := (others => '0'); @@ -474,12 +480,32 @@ begin misaligned := or (addr_mask and addr(2 downto 0)); v.align_intr := l_in.reserve and misaligned; + -- is this a quadword load or store? i.e. lq plq stq pstq lqarx stqcx. + if l_in.repeat = '1' and l_in.update = '0' then + -- is the access aligned? + if misaligned = '0' and addr(3) = l_in.second then + -- Since the access is aligned we have to do it atomically + v.atomic_qw := '1'; + v.atomic_last := l_in.second; + else + -- lqarx/stqcx have to be aligned + if l_in.reserve = '1' then + v.align_intr := '1'; + end if; + -- We require non-prefixed lq in LE mode to be aligned in order + -- to avoid the case where RA = RT+1 and the second access faults + -- after the first has overwritten RA. 
+ if l_in.op = OP_LOAD and l_in.byte_reverse = '0' and l_in.prefixed = '0' then + v.align_intr := '1'; + end if; + end if; + end if; + case l_in.op is when OP_STORE => v.store := '1'; when OP_LOAD => - -- Note: only RA updates have l_in.second = 1 - if l_in.second = '0' then + if l_in.update = '0' or l_in.second = '0' then v.load := '1'; if HAS_FPU and l_in.is_32bit = '1' then -- Allow an extra cycle for SP->DP precision conversion @@ -952,6 +978,8 @@ begin d_out.dcbz <= stage1_req.dcbz; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; + d_out.atomic_qw <= stage1_req.atomic_qw; + d_out.atomic_last <= stage1_req.atomic_last; d_out.addr <= stage1_req.addr; d_out.byte_sel <= stage1_req.byte_sel; d_out.virt_mode <= stage1_req.virt_mode; @@ -962,6 +990,8 @@ begin d_out.dcbz <= r2.req.dcbz; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; + d_out.atomic_qw <= r2.req.atomic_qw; + d_out.atomic_last <= r2.req.atomic_last; d_out.addr <= r2.req.addr; d_out.byte_sel <= r2.req.byte_sel; d_out.virt_mode <= r2.req.virt_mode; diff --git a/predecode.vhdl b/predecode.vhdl index 65cb751..e8689ef 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -121,6 +121,8 @@ architecture behaviour of predecoder is 2#011110_01110# to 2#011110_01111# => INSN_rldimi, 2#011110_10000# to 2#011110_10001# => INSN_rldcl, 2#011110_10010# to 2#011110_10011# => INSN_rldcr, + -- major opcode 56 + 2#111000_00000# to 2#111000_11111# => INSN_lq, -- major opcode 58 2#111010_00000# => INSN_ld, 2#111010_00001# => INSN_ldu, @@ -161,20 +163,28 @@ architecture behaviour of predecoder is -- major opcode 62 2#111110_00000# => INSN_std, 2#111110_00001# => INSN_stdu, + 2#111110_00010# => INSN_stq, 2#111110_00100# => INSN_std, 2#111110_00101# => INSN_stdu, + 2#111110_00110# => INSN_stq, 2#111110_01000# => INSN_std, 2#111110_01001# => INSN_stdu, + 2#111110_01010# => INSN_stq, 2#111110_01100# => INSN_std, 2#111110_01101# => INSN_stdu, + 2#111110_01110# => INSN_stq, 2#111110_10000# => INSN_std, 
2#111110_10001# => INSN_stdu, + 2#111110_10010# => INSN_stq, 2#111110_10100# => INSN_std, 2#111110_10101# => INSN_stdu, + 2#111110_10110# => INSN_stq, 2#111110_11000# => INSN_std, 2#111110_11001# => INSN_stdu, + 2#111110_11010# => INSN_stq, 2#111110_11100# => INSN_std, 2#111110_11101# => INSN_stdu, + 2#111110_11110# => INSN_stq, -- major opcode 63 2#111111_00100# to 2#111111_00101# => INSN_fdiv, 2#111111_01000# to 2#111111_01001# => INSN_fsub, @@ -190,8 +200,9 @@ architecture behaviour of predecoder is 2#111111_11110# to 2#111111_11111# => INSN_fnmadd, -- prefix word, PO1 2#000001_00000# to 2#000001_11111# => INSN_prefix, - -- Major opcodes 57 and 61 are SFFS load/store instructions when prefixed + -- Major opcodes 57, 60 and 61 are SFFS load/store instructions when prefixed 2#111001_00000# to 2#111001_11111# => INSN_op57, + 2#111100_00000# to 2#111100_11111# => INSN_op60, 2#111101_00000# to 2#111101_11111# => INSN_op61, others => INSN_illegal ); @@ -317,6 +328,7 @@ architecture behaviour of predecoder is 2#0_11001_10101# => INSN_lhzcix, 2#0_01001_10111# => INSN_lhzux, 2#0_01000_10111# => INSN_lhzx, + 2#0_01000_10100# => INSN_lqarx, 2#0_00000_10100# => INSN_lwarx, 2#0_01011_10101# => INSN_lwaux, 2#0_01010_10101# => INSN_lwax, @@ -405,6 +417,7 @@ architecture behaviour of predecoder is 2#0_10110_10110# => INSN_sthcx, 2#0_01101_10111# => INSN_sthux, 2#0_01100_10111# => INSN_sthx, + 2#0_00101_10110# => INSN_stqcx, 2#0_10100_10110# => INSN_stwbrx, 2#0_11100_10101# => INSN_stwcix, 2#0_00100_10110# => INSN_stwcx, From 140b930ad3ba337ec4d80b400ca5aa16846909df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 14 Sep 2020 18:21:27 +1000 Subject: [PATCH 13/21] tests: Add tests for lq/stq, plq/pstq and lqarx/stqcx. Lq and stq are tested in both BE and LE modes (though only 64-bit mode) by the 'modes' test. Lqarx and stqcx. are tested by the 'reservation' test in LE mode (64-bit). Plq and pstq are tested in 64-bit LE mode by the 'prefix' test. 
Signed-off-by: Paul Mackerras --- tests/modes/head.S | 60 ++++++++++ tests/modes/modes.c | 172 +++++++++++++++++++++++++++++ tests/prefix/head.S | 20 ++++ tests/prefix/prefix.c | 36 ++++++ tests/reservation/head.S | 28 +++++ tests/reservation/reservation.c | 62 +++++++++++ tests/test_modes.bin | Bin 20520 -> 20520 bytes tests/test_modes.console_out | 2 + tests/test_prefix.bin | Bin 12320 -> 12320 bytes tests/test_prefix.console_out | 1 + tests/test_reservation.bin | Bin 10888 -> 11588 bytes tests/test_reservation.console_out | 1 + 12 files changed, 382 insertions(+) diff --git a/tests/modes/head.S b/tests/modes/head.S index d9e69dc..8b00bdd 100644 --- a/tests/modes/head.S +++ b/tests/modes/head.S @@ -230,3 +230,63 @@ restore: ld %r0,16(%r1) mtlr %r0 blr + + .global do_lq +do_lq: + lq %r6,0(%r3) + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_lq_np /* "non-preferred" form of lq */ +do_lq_np: + mr %r7,%r3 + lq %r6,0(%r7) + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_lq_bad /* illegal form of lq */ +do_lq_bad: + mr %r6,%r3 + .long 0xe0c60000 /* lq %r6,0(%r6) */ + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_stq +do_stq: + ld %r8,0(%r4) + ld %r9,8(%r4) + stq %r8,0(%r3) + li %r3,0 + blr + + /* big-endian versions of the above */ + .global do_lq_be +do_lq_be: + .long 0x0000c3e0 + .long 0x0000c4f8 + .long 0x0800e4f8 + .long 0x00006038 + .long 0x2000804e + + .global do_lq_np_be /* "non-preferred" form of lq */ +do_lq_np_be: + .long 0x781b677c + .long 0x0000c7e0 + .long 0x0000c4f8 + .long 0x0800e4f8 + .long 0x00006038 + .long 0x2000804e + + .global do_stq_be +do_stq_be: + .long 0x000004e9 + .long 0x080024e9 + .long 0x020003f9 + .long 0x00006038 + .long 0x2000804e diff --git a/tests/modes/modes.c b/tests/modes/modes.c index fa4872c..f37e70b 100644 --- a/tests/modes/modes.c +++ b/tests/modes/modes.c @@ -12,6 +12,14 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long fn, unsigned long 
msr); +extern void do_lq(void *src, unsigned long *regs); +extern void do_lq_np(void *src, unsigned long *regs); +extern void do_lq_bad(void *src, unsigned long *regs); +extern void do_stq(void *dst, unsigned long *regs); +extern void do_lq_be(void *src, unsigned long *regs); +extern void do_lq_np_be(void *src, unsigned long *regs); +extern void do_stq_be(void *dst, unsigned long *regs); + static inline void do_tlbie(unsigned long rb, unsigned long rs) { __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); @@ -25,6 +33,8 @@ static inline void do_tlbie(unsigned long rb, unsigned long rs) #define SPRG0 272 #define SPRG1 273 #define SPRG3 275 +#define HSRR0 314 +#define HSRR1 315 #define PTCR 464 static inline unsigned long mfspr(int sprnum) @@ -294,6 +304,166 @@ int mode_test_6(void) return 0; } +int mode_test_7(void) +{ + unsigned long quad[4] __attribute__((__aligned__(16))); + unsigned long regs[2]; + unsigned long ret, msr; + + /* + * Test lq/stq in LE mode + */ + msr = MSR_SF | MSR_LE; + quad[0] = 0x123456789abcdef0ul; + quad[1] = 0xfafa5959bcbc3434ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq, msr); + if (ret) + return ret | 1; + if (regs[0] != quad[1] || regs[1] != quad[0]) + return 2; + /* unaligned may give alignment interrupt */ + quad[2] = 0x0011223344556677ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq, msr); + if (ret == 0) { + if (regs[0] != quad[2] || regs[1] != quad[1]) + return 3; + } else if (ret == 0x600) { + if (mfspr(SPRG0) != (unsigned long) &do_lq || + mfspr(DAR) != (unsigned long) &quad[1]) + return ret | 4; + } else + return ret | 5; + + /* try stq */ + regs[0] = 0x5238523852385238ul; + regs[1] = 0x5239523952395239ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_stq, msr); + if (ret) + return ret | 5; + if (quad[0] != regs[1] || quad[1] != regs[0]) + return 6; + regs[0] = 0x0172686966746564ul; + regs[1] = 
0xfe8d0badd00dabcdul; + ret = callit((unsigned long)quad + 1, (unsigned long)regs, + (unsigned long)&do_stq, msr); + if (ret) + return ret | 7; + if (((quad[0] >> 8) | (quad[1] << 56)) != regs[1] || + ((quad[1] >> 8) | (quad[2] << 56)) != regs[0]) + return 8; + + /* try lq non-preferred form */ + quad[0] = 0x56789abcdef01234ul; + quad[1] = 0x5959bcbc3434fafaul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_np, msr); + if (ret) + return ret | 9; + if (regs[0] != quad[1] || regs[1] != quad[0]) + return 10; + /* unaligned should give alignment interrupt in uW implementation */ + quad[2] = 0x6677001122334455ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_np, msr); + if (ret == 0x600) { + if (mfspr(SPRG0) != (unsigned long) &do_lq_np + 4 || + mfspr(DAR) != (unsigned long) &quad[1]) + return ret | 11; + } else + return 12; + + /* make sure lq with rt = ra causes a HEAI interrupt */ + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_bad, msr); + if (ret != 0xe40) + return 13; + if (mfspr(HSRR0) != (unsigned long)&do_lq_bad + 4) + return 14; + return 0; +} + +int mode_test_8(void) +{ + unsigned long quad[4] __attribute__((__aligned__(16))); + unsigned long regs[2]; + unsigned long ret, msr; + + /* + * Test lq/stq in BE mode + */ + msr = MSR_SF; + quad[0] = 0x123456789abcdef0ul; + quad[1] = 0xfafa5959bcbc3434ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_be, msr); + if (ret) + return ret | 1; + if (regs[0] != quad[0] || regs[1] != quad[1]) { + print_hex(regs[0], 16); + print_string(" "); + print_hex(regs[1], 16); + print_string(" "); + return 2; + } + /* don't expect alignment interrupt */ + quad[2] = 0x0011223344556677ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_be, msr); + if (ret == 0) { + if (regs[0] != quad[1] || regs[1] != quad[2]) + return 3; + } else + return ret | 5; + + /* try 
stq */ + regs[0] = 0x5238523852385238ul; + regs[1] = 0x5239523952395239ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_stq_be, msr); + if (ret) + return ret | 5; + if (quad[0] != regs[0] || quad[1] != regs[1]) + return 6; + regs[0] = 0x0172686966746564ul; + regs[1] = 0xfe8d0badd00dabcdul; + ret = callit((unsigned long)quad + 1, (unsigned long)regs, + (unsigned long)&do_stq_be, msr); + if (ret) + return ret | 7; + if (((quad[0] >> 8) | (quad[1] << 56)) != regs[0] || + ((quad[1] >> 8) | (quad[2] << 56)) != regs[1]) { + print_hex(quad[0], 16); + print_string(" "); + print_hex(quad[1], 16); + print_string(" "); + print_hex(quad[2], 16); + print_string(" "); + return 8; + } + + /* try lq non-preferred form */ + quad[0] = 0x56789abcdef01234ul; + quad[1] = 0x5959bcbc3434fafaul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_np_be, msr); + if (ret) + return ret | 9; + if (regs[0] != quad[0] || regs[1] != quad[1]) + return 10; + /* unaligned should not give alignment interrupt in uW implementation */ + quad[2] = 0x6677001122334455ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_np_be, msr); + if (ret) + return ret | 11; + if (regs[0] != quad[1] || regs[1] != quad[2]) + return 12; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -338,6 +508,8 @@ int main(void) do_test(4, mode_test_4); do_test(5, mode_test_5); do_test(6, mode_test_6); + do_test(7, mode_test_7); + do_test(8, mode_test_8); return fail; } diff --git a/tests/prefix/head.S b/tests/prefix/head.S index 961c2a9..9a46e4b 100644 --- a/tests/prefix/head.S +++ b/tests/prefix/head.S @@ -245,3 +245,23 @@ test_pstw: pstw %r3,wvar(0) li %r3,0 blr + + .globl test_plq +test_plq: + nop + nop + plq %r4,qvar(0) + std %r4,0(%r3) + std %r5,8(%r3) + li %r3,0 + blr + + .globl test_pstq +test_pstq: + nop + nop + ld %r4,0(%r3) + ld %r5,8(%r3) + pstq %r4,qvar(0) + li %r3,0 + blr diff --git 
a/tests/prefix/prefix.c b/tests/prefix/prefix.c index d594037..8cc117d 100644 --- a/tests/prefix/prefix.c +++ b/tests/prefix/prefix.c @@ -33,6 +33,8 @@ extern long test_pstd(long arg); extern long test_psth(long arg); extern long test_pstw(long arg); extern long test_plfd(long arg); +extern long test_plq(long arg); +extern long test_pstq(long arg); static inline unsigned long mfspr(int sprnum) { @@ -183,6 +185,39 @@ long int prefix_test_3(void) return 0; } +unsigned long qvar[2] __attribute__((__aligned__(16))); +#define V1 0x678912345a5a2b2bull +#define V2 0xa0549922bbccddeeull + +/* test plq and pstq */ +long int prefix_test_4(void) +{ + long int ret; + unsigned long x[2]; + + qvar[0] = V1; + qvar[1] = V2; + ret = trapit((long)&x, test_plq); + if (ret) + return ret | 1; + if (x[0] != V1 || x[1] != V2) { + print_hex(x[0], 16, " "); + print_hex(x[1], 16, " "); + return 2; + } + x[0] = ~V2; + x[1] = ~V1; + ret = trapit((long)&x, test_pstq); + if (ret) + return ret | 3; + if (qvar[0] != ~V2 || qvar[1] != ~V1) { + print_hex(qvar[0], 16, " "); + print_hex(qvar[1], 16, " "); + return 4; + } + return 0; +} + int fail = 0; void do_test(int num, long int (*test)(void)) @@ -210,6 +245,7 @@ int main(void) do_test(1, prefix_test_1); do_test(2, prefix_test_2); do_test(3, prefix_test_3); + do_test(4, prefix_test_4); return fail; } diff --git a/tests/reservation/head.S b/tests/reservation/head.S index ce258b5..4ff85ce 100644 --- a/tests/reservation/head.S +++ b/tests/reservation/head.S @@ -155,3 +155,31 @@ call_ret: ld %r31,248(%r1) addi %r1,%r1,256 blr + + .global do_lqarx +do_lqarx: + /* r3 = src, r4 = regs */ + lqarx %r10,0,%r3 + std %r10,0(%r4) + std %r11,8(%r4) + li %r3,0 + blr + + .global do_lqarx_bad +do_lqarx_bad: + /* r3 = src, r4 = regs */ + .long 0x7d405228 /* lqarx %r10,0,%r10 */ + std %r10,0(%r4) + std %r11,8(%r4) + li %r3,0 + blr + + .global do_stqcx +do_stqcx: + /* r3 = dest, r4 = regs, return CR */ + ld %r10,0(%r4) + ld %r11,8(%r4) + stqcx. 
%r10,0,%r3 + mfcr %r3 + oris %r3,%r3,1 /* to distinguish from trap number */ + blr diff --git a/tests/reservation/reservation.c b/tests/reservation/reservation.c index 79bbc1f..502b285 100644 --- a/tests/reservation/reservation.c +++ b/tests/reservation/reservation.c @@ -7,6 +7,10 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long (*fn)(unsigned long, unsigned long)); +extern unsigned long do_lqarx(unsigned long src, unsigned long regs); +extern unsigned long do_lqarx_bad(unsigned long src, unsigned long regs); +extern unsigned long do_stqcx(unsigned long dst, unsigned long regs); + #define DSISR 18 #define DAR 19 #define SRR0 26 @@ -180,6 +184,63 @@ int resv_test_2(void) return 0; } +/* test lqarx/stqcx */ +int resv_test_3(void) +{ + unsigned long x[4] __attribute__((__aligned__(16))); + unsigned long y[2], regs[2]; + unsigned long ret, offset; + int count; + + x[0] = 0x7766554433221100ul; + x[1] = 0xffeeddccbbaa9988ul; + y[0] = 0x0badcafef00dd00dul; + y[1] = 0xdeadbeef07070707ul; + for (count = 0; count < 1000; ++count) { + ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx); + if (ret) + return ret | 1; + ret = callit((unsigned long)x, (unsigned long)y, do_stqcx); + if (ret < 0x10000) + return ret | 2; + if (ret & 0x20000000) + break; + } + if (count == 1000) + return 3; + if (x[0] != y[1] || x[1] != y[0]) + return 4; + if (regs[1] != 0x7766554433221100ul || regs[0] != 0xffeeddccbbaa9988ul) + return 5; + ret = callit((unsigned long)x, (unsigned long)regs, do_stqcx); + if (ret < 0x10000 || (ret & 0x20000000)) + return ret | 12; + /* test alignment interrupts */ + for (offset = 0; offset < 16; ++offset) { + ret = callit((unsigned long)x + offset, (unsigned long)regs, do_lqarx); + if (ret == 0 && (offset & 15) != 0) + return 6; + if (ret == 0x600) { + if ((offset & 15) == 0) + return ret + 7; + } else if (ret) + return ret; + ret = callit((unsigned long)x + offset, (unsigned long)y, do_stqcx); + if (ret >= 0x10000 && 
(offset & 15) != 0) + return 8; + if (ret == 0x600) { + if ((offset & 15) == 0) + return ret + 9; + } else if (ret < 0x10000) + return ret; + } + /* test illegal interrupt for bad lqarx case */ + ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx_bad); + if (ret != 0xe40) + return ret + 10; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -204,6 +265,7 @@ int main(void) do_test(1, resv_test_1); do_test(2, resv_test_2); + do_test(3, resv_test_3); return fail; } diff --git a/tests/test_modes.bin b/tests/test_modes.bin index 24e39813fa353af0d03f534c01389d4c3f97d9f5..d0c24d70a6026017cb9837afa378b6144a1da476 100755 GIT binary patch literal 20520 zcmeHPeQ*@#d4KlqIIRSn&o-G+4GpUgw|AX4;|5l--$S?|t59pO4@B?C*VdwF{9FAxfk!R3y?kZqvIGDWW7~yCB;I z*{*H0VOOf_b5Ca4Xjk($-rSX5M%if2yEAij3uy`^nkNm45lO8?HOUJVH7bt1^h`~X zh~#UAFqvz=Mn|0d;e>%0BPfBFv2 zzXJJB-NE^b$ba$<&c71*|Ni#nufxB18~sxi^8elK%m3CB+)X?zM*hFOgY&-$Jpb>z zmty>XeFu+!0Qvvwlk=C*zhi-Z1^w{_`jhAnEzqx{UtFO7me=21$p6{``J;cw0{sg5 z;|ugB(H~l%Uq`>VK>sbzA2<8(k<58Q8G-5=SK!{>V;{XZUcM_EMYk*w?$fC^&qrU7 z;=T!EoiCV;`X);KInNF)=M7~n-T#BgT^B9_E&?tBE&?tBE&?tBE&?tBE&?tBE&?tB zE&?}2U<2;qa~@F(k}xS4HIsitl<6hmnH_X;YB!ymQfXvLCt-R&)l46v==2dvO`o8B z)8D3()2Hd&^!I3FdXR*fi&QgniJ~(@l$sf)eKUDFIWs}$W+rK5W>_LQ*B;bz?W?eF z)N&+ca_tXf4oOOL&Ku9c|v5k1IU>I@yN<)78 zq-%F&kYfQk7La2BIgTU8apX9T9LJI4IC30Ej^oI2961(|V-YzPkz)}#7Lj8SITn#) zk%Sr<80K?cQL~ zO}>VGj6^v}^oUN9_NC?O@Q*$&FZhTG^(0Pt$>ZpfySWIs2)GEi2)GEi2)GEi2)GEi z2)GEi2;8v<2&A;qrI^gm0Sx0?t)#pp_t2mWK$4!x3o7YZS-{`_`5D0oo)NtHOY>cc zB=?gZE8u%veABSAYF_KgaMUVpu;XhS~yEhc$k0q1=Cy?SKEh6hC@Y!nX%X z^U)aK0;2~b0N^JX!{u`k>BLoGm+Y4)`&aa?0xAN<$+Ii51Ico(9sk#qELOvv|$ zG5hX_pyNA+^1Nu`gv@p&dI{e|=(4vA#{;$x#){=m@$0yU;?Dpx#?tG^LKMZb356WS4N0D2PQiz61l9pCUrStSISV#jw0 z?Y-Nj$X^LcWao}Vs&|`(!QXzQME+&5HC7zg(o0f=vH0mfV+;{52 zSb{h!5a-^HEdPu}oJR5@*Uxz|9T<$KNe*e<@ZE%1Jm(XTRl%N6Af-WTg?!?Q6j2{j zOx$(=x3kYCQmgN=Vtf2?C35zx6&u&c>eW_k)6;WfbL!;v;@sFO5!(>HWpL`F%Esok z59ij$8MGg$l_Fo=uAoK|sm*IG|2&3_1><0jAL@eZx67^mIBVYADqSC 
zW=YNLgX@It?FR=lXEApzYj{~(^)X*chR;#J(I@8NuY=|MC44+T!+5{N$pbZckjUGJ zIz)Zd$W4f|P(^rTOUbbX>^S^x7-KE!@~>;34H{bIo^_1^xT4$Ss}$ait0_>tjyV&27zIai zy8++GWuqKJ5OcurQ%1`NxPJK>I)-DMQhAoYW6KWU$h?}g`Sy(^6rYbvIew0rW91k_ zh>x$8Rb{a;k9`lk~k&bNj4Wc{MZB;^CRQJJ|(ml;zy2weK9|B940>^28SPOkT3INw9Iz}d|w2| zVXX3uJ?HH?|D+liY)CWzVU2(~!@PAq3|{QOJixJpway#HXFWKK$LB+QmjqnR@oBbY z1<0Cg+5NWcZOE8|J27@8ejRNa1eWvhE9Zt795wce>3--t~7Nm-?;9vHRDf0jS^+oU!rz=tD4w@d0lCv`cNZyB>U#m zDzDda-j2-GNBO(eu=(9;um6DBB^}U~Vcj(dE>XJbHMQ;l*0K8(byEl4i7j9)%zvvY zXhVh)`_2Y0WQpeAlz7Ksd_tI;2XjSl4L$%|MkhMU#)M279q63l^!u{WV)7@Nc5 zTx`l?W4l4v`a|f9e(Gzg$#=^$TK&q;_o;O+;O}tX*G9kCr#4&)XGUMrux<@Eg1;0e z9ue{U1M1wdBO<1IIi^W5z3gAZ87C$YJ}4M&#&OEh9W935GWy5bjlTMoeSMzuM&GiZ zKhfVX)P`KcjdGa}wY@sSe*P4G*v`Gk%`=%utDC}^(AyIFO149n9h=WvHb+-x8it>M z&4xzsaFlTw^S#p7F!DsEcU;Mg#a`<(3|=n{sOnD!w7^7tI`p8fG2b>-l;I|#sLSq+ zI0wVJejh*q#XFAqmT@gmTPz;kr=czv? z?Qet=vRd~IZI6v3uOCvF)7dVsQ=EIQZKKZh&ua@_H(TJ)D-m@?@fSR@$69>I^vn$`;?4izL(DKtgU>^ zI8U|ixcT>OzUFp$CBgXF{Gf; zvIujOGSQ~x4IOKdY%8xBDLG)SJGjkbx#BMdgD*C@~w|!_gZ5xpP$!;$~ zI}iP%cH573S871!^$VY0f%a*%({@`#`_lFSc_-S8A!D4yHJG zAw34%E14s3JvS7V;d(mdBkko!B=BW;&sB3>%X(Y$ru@@MZH>RDo9of4j|OnBbSog+wn&yc|niEfL!Hnn8}}E@BJOoryIcw=2{x&<&oB#YV@Fr;|3v|vzX#8=ezmP{81~pXxC~N6ArU@A%f~~MdA~wT zvmLfsveS{V?s+K5*Dl8IDB;{Cl9gwf3t@A3N!*jfI-Y4;sl{43OWup~$E!J>c-*t_x;y|}r_cT9C!?TyCljU((_Uic= zRMbN~N#Jk#8lSB(SSxMlx3<08p4!X5yP}&mEv+JYDB97jP9A9y1uAl)7P`nmwM`vDGxu=7MYi2flQ>A}h< zz2G@6Bmzy)OH>6Y_TCkLbTPWvRxkE1Up0&*j+8KMwFl^7(-s3ifc+HubuWH&!dCQ{ zm(9UeG_ZSvW$qW2y?Use!G8UhhB1sIJp}#Z!o#NLb*8GR$ab0AC zYoQl?&(z~wd;-y1&@*#&a(U3yY&SLST-at4bfs&Cp>eFNdtB%YBoXP>xxPOKy}ekV zYEYZS=ONQ(7dwYZV(fM3U4)*_df3>-xW}w#5phi&M@AF{>-RP$7Itwn`t>}x{wT+e35#BY&TjnXjbrcP`v5%Ye2WF;nDf}6eBMJd(nw*0-4)Pw6m1ER+pUFr*L9oS2AFJw7qp{D2AK&9(R6Z4 zFQY`|L4m^q3kfU6Efe=J!b~7BrifYa!DJ+06j4URq|>M%poIa)@4LN);e$6hx#xU; ze&6~1&Ufw|$_PUl;ZVYql}^8NO*|8~pRyhCIjTU*2FEO82>^)sVJrq(uOvWBvV)Mn z!K?D$Fu!kq>XkovGezh_b4`NNqgFt#0-6k(g%E)HuDVrx` zjxu45fBq8hY*dhx+f@cl?Uq9Eo9HWNBkQcJYmH;xsD-BhDlt;iBb{aJgEh}DAVnT> 
zUi*u=kaUsrW*>M0IZ}yju5q*ub;b3gyo5)&2)*AoIvkd(r`kjaYB80{))}EPQ?OD)&u;F0LG+}18&`Wkr z>&=6GX`tBotR>`oVTVzVdV#JA?M1&XcujnGU8sp)on$9Mo8p}J20>0soX4;^bG1KSJ`bFGoJ=^K-bSW~O^V4Go}S&OpM1zE5%msw6#Pp7Nhc`z!IJlrRi)Z`oacJVQ>ut+aK1G-g3TZsXnB6)x4{yt z+V+4q(=ib$#hk^M_cYhx5ZV({KVuC|`tl^-^A^R$_?}e;R<9PMdr>Pl?niQ1wlimA}K-MIb4)Xfx?)Wxy$+DB-kZNYh0KyF&6 zPH;^jkx6rirKKBqHnb+OyxY^#C1YKJYZd+HjddxmM?2Ee>B4$Na0VJ4V!}xyz>MpS z7*K33Qv4ux%xDQTF}cRU+=`oJE1twYq~OtI~P(Rc29|<m}l)FF%M~` z`xO8ovK19rRa?S}9#|NNVUW&Q&Z6tu}G4vbWg=VKaWz!Hol@fYLjhU&c2?W-4&P z)GtJBK8h=|fpYyPvoZ@^xLeqUV;E!U4b3`M*anR{)>1U3aoj@xHS#wgR=?1QNb{Aw zEjpBrM1jLNKEBIdi?~)&-LLL)PQSh<^&@)y`swv44FmxDJ$5zXinwvQ?H=bEneibv zwj_+BEklNSol0H5(4b>&HmDHYZ5Y?|H|H`_8pKq+?)mZB4_Ckc4w(?o3i~fh`C%&iyX|ymS`V zQDO>mngBZQbAA!$I)i>8tix}&LEUw;!kmp2o;(KVl`)2@d2w_;cq#>*s*K|@+tRpv xg>{N^vd}}(8jcFId2v7VknJdrRrm+%Jkw!6KIiaA+mVfP=YtU-76S|Ke*tSH&Z7VT diff --git a/tests/test_modes.console_out b/tests/test_modes.console_out index a49bb9b..25e791c 100644 --- a/tests/test_modes.console_out +++ b/tests/test_modes.console_out @@ -4,3 +4,5 @@ test 03:PASS test 04:PASS test 05:PASS test 06:PASS +test 07:PASS +test 08:PASS diff --git a/tests/test_prefix.bin b/tests/test_prefix.bin index 8690be78d4cf9fe71c2d425ac617fee2fa92e122..e84e70e7870bf5e2cd13d9368b89c732dd83c46f 100755 GIT binary patch delta 2097 zcmZ9Me@s(X6vxjiwX_bKR$*cW+oP}0K3ZcPx;(U9TLN0_A|{x{7>utr`w?R@*vtY_ z)+clcBxXUcf8ZaE>Y_pBA2a&TGXGHjS~B+sjx1)AUyda-iwh=ft1EbSPWymnFS+#I z?>(P$zVF=oNULde_?)5Bb?b|atx*Ml)nOd=X|n(WSu=cmkt|@Nu z_9^=z>jlx96d1v!WjxSYmSToVTjZxo}kWozkg6oDUXb*(mKndg@lsVfZSc& zUu8;=d1REmV>**YMueDBQN-Ag8Ppk@)-)tSO)_KGi*@SYT?EHbo0?fKwi&@gYoKFj z3+|7D#p~CZMZMUKB1S}}N@t8((-kb1snKg0b#78k7F>1}fer-cq3~uB0tnwAm^$(h zb?Qi23{6U^(pbbfk-MpLM$K`hmP|?(?}*NXkfDxjC}Mm`gt~PIw5GFEXs7N{)j_8Y z{y4qfkvEYG>0ATG8?>g(s!6xbbrekc+m_<$`69Lw`op`B_d$)q6tPSMxe|R@0p5#2reA4U3`~84jt|?I!8wlA6UX!iXa|$2^S|m9(KOpyvpA5zz zDF#~vNO)Ngl=}rxwT3JspqefLew(nd+%VR59CJGB=xXa zr-tM6ie@^i@b?-iQZ&-VfJDe5+ 
zj!SaP3c>t7y~l|5H23+h>nBYgXfry;y#-j55;mN>={ja*>SCDlr^%oVG$qu__jc!g zPtLvMQ8xzDob(A?(-%#zU2cL%L-W1h*T=;GnH)irQ}4?$yJI>|Ca+za4SqTz1|}zY z{D)_9j0}lzKSu`LFqeDw%et{LEM9`<#`wDhaTxz~z@nTe&-WSB5xkR-txpW%RYI4+ z?x@s9Cdm|;YeZwzl@aa4n%P>^jA-^S9fn$4$(EhvHId%<+NHa+!lOUuUM78zR5k*C z<4$QjQO$PNZrNrr6i#Rl;wwU7P8VvC9jk3m9xCKmz+?L3dWku)w9KP;aU011=O_F} za9`6L=ciET{HmW!s=e3H=x<3o*1T>wlUUV1M12M0t{Zw2RDtwP)QG|3U90gJV=zBG zY^&4GLUrbr3f(1jUqGko$0=nSzAUZjl+2%`6pOy5>kgeM#J+9 zjA@P`%&ex_w{aaxHZvc$P+z{!5A&mR8FFtE`IO{Q@86_l!Q z_;IBE=6no)p*0^fDjc&yc*YbKQR=`^_7CT?_zTUJFpLWOtRc*FvJm4$S;O%?#?Z&F z$FQI3D=d_A0rhk2dA+pTjjP8LkmbBnz{x($_N8G|i01PhLu28YX;a=AWi?{VAglIB zCNIvOykPr0_2nHnJ^e4|8U{d%RYW80Eah>^_fQ-r5g!{4YKlL{`5+1SQ1uXwZ`g## kvwa?SNTG+KH5^&gLgWh{(k|j~u5f65BxZ^wg3PC delta 1629 zcmZvcUuau(6vxj^(l)7WP1g=88}=qQZIkqdrM4TlTdGNDX_zaWh&Z7SH&$_o!u<2F z);sTAjgW$ly5D>gY-KI1n^0LF3Mx!N(HF6!sK}=)fBp0NzB)E1^^|HfFf|Dt$NoAZL zoaJzSD?Kh#fL)b+=4prf0^BTeee?T&>+J*AyUPW~8xS)~J~i%eYfioQ%QzWajl+4Y z^c3n90?rxNckDAR={d91>{DfM5b%#Xb|VTPB)B3Hhyg7)a7`8sE4sG7j17UkzbdW;tCHGw{P1CC6asklt}i@JCTvumqk*=(8->8BbXqV#s7&N zf60#RqKEC^FoG2XPqW3K;=%be_Dh@0{E95~unr|2n~;0N0)AT+8KHROMTta$s4-M3 z_Qw;abbd_C`t0T%bLAGhptQAssJc(c4+t*^Y`K<1jc_IVRXJLJw}LYBRl~-ri!wO8 zzRUulX4iBDONWlc-^6Mi*}7srUKM%9hjlzi=abJ;qMYUQ7Jh^5X*a%J7HYB@)qiAr|d5@#fB@bno z+}gzETQ#$2C;qF*SN@)HdyN;c9k_^Y(K_ePCunq7v#j6;1F&=cNy1N-<{h2JZ2mOg zg>A+)ic7Wae%N0t+1^4E0ZrJOU=s{3d$-03#2~+Tm|Ll5Mw|C=g>wM0AQBqRy`Iw^ z!9j&CJ~D|na+)}}(8b-+nVh!4zHjUFO<2KkWYY7jBHSkBSuEU%-)y*jkLSaZ=gaUR zcvi!MQj+yVKCaDM=?$V}>8um15b1}vBYHrZU<1(ueR+JQu6f9q9Za4R@2;Ov8uCBf9AXfNZAVg|s&QfGL6gn~~Tr*13uu4vs}H_~8r{8xHbH zoj4~2B)aHt?{XQO@8&6i*`FgZUbTX8DNt}Fiw`M*zkZ0s>NRXwtx~{h&1^J6SMMFw z6x!oeANFad$Nn!PPLA?zi)P9vCtHySW9eW~tlM9owO8+aq$RT)Wl~P$Q%W&`)6Y*U zHMNeRdIIh8kK~lNJ5dQO2faOo&7$eu=vy}MjV8^PbrLVxmhgf^6m)k&@&HCK*gTiD zVN7}Y7aNqb4Qn~o(7aV2tbKLymsFUMnou^D17-rKp(NtXEc4 zEi_gmA`W@dOo_Xt(L}!z zqX7lb1Bfe8->wyUU`eeek>aD?$X3Jq>q?T}FS119)a%M%>!qYbt``1hDtzxL+&t-u zlSID0`H;QhR^B!-Wtv(<6d*~A?W>I>Q{ZtN53bLia&+axKE=2nk^9#dR&E!IrHo&F 
zZ{?DeH6Lf(I7W$V{9O%Ii){^N+yb@a!vs(B#D-{szxhYzx2Z_Y!1akruXu`GoIcBL zPwQ@{lPh7cM6v^!MTA#Ac0Qq8TQn}aw0xc>XKtCr9_@_^P! zw371wWvtl*EyA5VLw)-R6|8cY^<#$hmtFEJ6Ii`Fl}&36*gWEzLJxx0f)%`K!#U?U z=Q-#7{qlu34>c>Uw_{2POX`i_xWwn{xUBe6_C@^5hBYA0bwQ)?icMt@LokQTRz% z4Y;Fl@{tne_(UQa84Mek23F8~e;ZA{BVLT=t(I3}qq~rCx|s=Np4hx<@AbwH^--TO zLdUJ=D4uS4!;70zW_G%05-m$t&_4xST#u$)T1EKBMpKvAR&MrJyOyH)?jd~K!=A(* zPW61Xf)zJSid)e@x+8CIXv8{~aZ4O0TSjBpy8$Y6K743-G3a(?;ICJA*fm_NCFM)m zKxN*v#AjnWqAKEu$O)Y|^0%C>Z2V-q$uUpCS2-QIw)kx<9KGEeE@eH-uozlsnENqH z#V*8W%MCLp_Ql3UKpz%oV*Tu>I2TI}9$oigE1$Zl_VZ||c1_%iC7B@}(|59j$mmHn zC64L&*j_{H9j&^oIUG_yjq2*Sw<9iMj(Pw&@r^z{M{O&|Y3c~H+U@(SwMcEABhOUT zM{(fsEqsPA!Y>E#L{#4TU(Vf$SE)_8)j?S91J73#rT8#!C<4~O@US%}^qwjjK!GE9 zx?9wYFAS)T0lqa&_!47Jg_a5Tt$02sbT`uN5jJ#xqvC_ls8Mp`zSV+1IpE+boEf5s zp>i&oFJYl+@p8fxXA`rj~-hC0?`TM Date: Mon, 17 May 2021 13:52:19 +1000 Subject: [PATCH 14/21] dcache: Cancel reservation on snooped store This restructures the reservation machinery so that the reservation is cleared when a snooped store by another agent is done to the reservation address. The reservation address is now a real address rather than an effective address. For store-conditional, it is possible that a snooped store to the reservation address could come in even after we have asserted cyc and stb on the wishbone to do the store, and that should cause the store not to be performed. To achieve this, store-conditional now uses a separate state in the r1 state machine, which is set up so that losing the reservation due to a snooped store causes cyc and stb to be dropped immediately, and the store-conditional fails. For load-reserve, the reservation address is set at the end of cycle 1 and the reservation is made valid when the data is available. For lqarx, the reservation is made valid when the first doubleword of data is available.
For the case where a snooped write comes in on cycle 0 of a larx and hits the same cache line, we detect that the index and way of the snooped write are the same as the index and way of the larx; it is done this way because reservation.addr is not set until the real address is available at the end of cycle 1. A hit on the same index and way causes reservation.valid to be set to 0 at the end of cycle 1. For a write in cycle 1, we compare the latched address in cycle 2 with the reservation address and clear reservation.valid at the end of cycle 2 if they match. In other words we compare the reservation address with both the address being written this cycle and the address being written in the previous cycle. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 228 +++++++++++++++++++++++++++++++----------------- loadstore1.vhdl | 19 ++-- 3 files changed, 159 insertions(+), 89 deletions(-) diff --git a/common.vhdl b/common.vhdl index 3af1d7b..6df5b6b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -606,6 +606,7 @@ package common is nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op + atomic_first : std_ulogic; atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 807a2dc..68f3b60 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -184,7 +184,6 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_BAD, -- NC cache hit, TLB miss, prot/RC failure - OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load @@ -195,8 +194,8 @@ architecture rtl of dcache is type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack - NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack - + NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack + DO_STCX); -- Check for stcx. 
validity -- -- Dcache operations: @@ -290,6 +289,9 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -365,10 +367,12 @@ architecture rtl of dcache is -- type reservation_t is record valid : std_ulogic; - addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end record; signal reservation : reservation_t; + signal kill_rsrv : std_ulogic; + signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request signal req_index : index_t; @@ -382,10 +386,6 @@ architecture rtl of dcache is signal early_req_row : row_t; signal early_rd_valid : std_ulogic; - signal cancel_store : std_ulogic; - signal set_rsrv : std_ulogic; - signal clear_rsrv : std_ulogic; - signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; @@ -427,10 +427,13 @@ architecture rtl of dcache is -- TLB PLRU output interface signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal snoop_active : std_ulogic; signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; - signal snoop_wrtag : cache_tag_t; - signal snoop_index : index_t; + signal snoop_paddr : real_addr_t; + signal snoop_addr : real_addr_t; + signal snoop_hits : cache_way_valids_t; + signal req_snoop_hit : std_ulogic; -- -- Helper functions to decode incoming requests @@ -861,26 +864,45 @@ begin end if; end process; + -- Snoop logic + -- Don't snoop our own cycles + snoop_addr <= addr_to_real(wb_to_addr(snoop_in.adr)); + snoop_active <= snoop_in.cyc and snoop_in.stb and snoop_in.we and + not (r1.wb.cyc and not wishbone_in.stall); + kill_rsrv <= '1' when (snoop_active = '1' and reservation.valid = '1' and + snoop_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + -- Cache tag RAM second read port, for 
snooping cache_tag_read_2 : process(clk) - variable addr : real_addr_t; begin if rising_edge(clk) then - -- Don't snoop our own cycles - snoop_valid <= '0'; - if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then - if (snoop_in.cyc and snoop_in.stb and snoop_in.we) = '1' then - snoop_valid <= '1'; - addr := addr_to_real(wb_to_addr(snoop_in.adr)); - assert not is_X(addr); - snoop_tag_set <= cache_tags(to_integer(get_index(addr))); - snoop_wrtag <= get_tag(addr); - snoop_index <= get_index(addr); - end if; + if is_X(snoop_addr) then + snoop_tag_set <= (others => 'X'); + else + snoop_tag_set <= cache_tags(to_integer(get_index(snoop_addr))); end if; + snoop_paddr <= snoop_addr; + snoop_valid <= snoop_active; end if; end process; + -- Compare the previous cycle's snooped store address to the reservation, + -- to catch the case where a write happens on cycle 1 of a cached larx + kill_rsrv2 <= '1' when (snoop_valid = '1' and reservation.valid = '1' and + snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + + snoop_tag_match : process(all) + begin + snoop_hits <= (others => '0'); + for i in 0 to NUM_WAYS-1 loop + if snoop_valid = '1' and read_tag(i, snoop_tag_set) = get_tag(snoop_paddr) then + snoop_hits(i) <= '1'; + end if; + end loop; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) variable req_row : row_t; @@ -901,6 +923,8 @@ begin variable rel_match : std_ulogic; variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable fwd_match : std_ulogic; + variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable snoop_match : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -924,9 +948,11 @@ begin is_hit := '0'; rel_match := '0'; fwd_match := '0'; + snoop_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); fwd_matches := (others => '0'); + snp_matches := (others => '0'); for j in 
tlb_way_t loop hit_way_set(j) := to_unsigned(0, WAY_BITS); s_hit := '0'; @@ -943,6 +969,9 @@ begin tlb_valid_way(j) = '1' then hit_way_set(j) := to_unsigned(i, WAY_BITS); s_hit := '1'; + if snoop_hits(i) = '1' then + snp_matches(j) := '1'; + end if; end if; end loop; hit_set(j) := s_hit; @@ -959,6 +988,7 @@ begin hit_way := hit_way_set(to_integer(tlb_hit_way)); rel_match := rel_matches(to_integer(tlb_hit_way)); fwd_match := fwd_matches(to_integer(tlb_hit_way)); + snoop_match := snp_matches(to_integer(tlb_hit_way)); end if; else s_tag := get_tag(r0.req.addr); @@ -970,6 +1000,9 @@ begin read_tag(i, cache_tag_set) = s_tag then hit_way := to_unsigned(i, WAY_BITS); is_hit := '1'; + if snoop_hits(i) = '1' then + snoop_match := '1'; + end if; end if; end loop; if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then @@ -982,6 +1015,13 @@ begin req_same_tag <= rel_match; fwd_same_tag <= fwd_match; + -- This is 1 if the snooped write from the previous cycle hits the same + -- cache line that is being accessed in this cycle. 
+ req_snoop_hit <= '0'; + if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then + req_snoop_hit <= '1'; + end if; + -- Whether to use forwarded data for a load or not use_forward_st <= '0'; use_forward_rl <= '0'; @@ -1060,8 +1100,6 @@ begin if go = '1' then if access_ok = '0' then op := OP_BAD; - elsif cancel_store = '1' then - op := OP_STCX_FAIL; else opsel := r0.req.load & nc & is_hit; case opsel is @@ -1101,45 +1139,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Handle load-with-reservation and store-conditional instructions - reservation_comb: process(all) - begin - cancel_store <= '0'; - set_rsrv <= '0'; - clear_rsrv <= '0'; - if r0_valid = '1' and r0.req.reserve = '1' then - -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.req.nc = '1' - if r0.req.load = '1' then - -- load with reservation - set_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - else - -- store conditional - clear_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - if reservation.valid = '0' or - r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then - cancel_store <= '1'; - end if; - end if; - end if; - end process; - - reservation_reg: process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - reservation.valid <= '0'; - elsif r0_valid = '1' and access_ok = '1' then - if clear_rsrv = '1' then - reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); - end if; - end if; - end if; - end process; - -- Return data for loads & completion control logic -- writeback_control: process(all) @@ -1367,12 +1366,6 @@ begin r1.cache_paradox <= '0'; end if; - if req_op = OP_STCX_FAIL then - r1.stcx_fail <= '1'; - else - r1.stcx_fail <= '0'; - end if; - -- Record TLB hit information for updating TLB PLRU r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; @@ -1423,6 +1416,8 @@ begin r1.acks_pending <= 
to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + reservation.valid <= '0'; + reservation.addr <= (others => '0'); -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1430,27 +1425,40 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.stcx_fail <= '0'; r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then + if req_op = OP_LOAD_HIT then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else r1.mmu_done <= '1'; end if; end if; + -- The kill_rsrv2 term covers the case where the reservation + -- address was set at the beginning of this cycle, and a store + -- to that address happened in the previous cycle. + if kill_rsrv = '1' or kill_rsrv2 = '1' then + reservation.valid <= '0'; + end if; + if req_go = '1' and access_ok = '1' and r0.req.load = '1' and + r0.req.reserve = '1' and r0.req.atomic_first = '1' then + reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + if req_op = OP_LOAD_HIT then + reservation.valid <= not req_snoop_hit; + end if; + end if; -- Do invalidations from snooped stores to memory if snoop_valid = '1' then - assert not is_X(snoop_tag_set); - assert not is_X(snoop_wrtag); + assert not is_X(snoop_paddr); + assert not is_X(snoop_hits); end if; for i in 0 to NUM_WAYS-1 loop - if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then - assert not is_X(snoop_index); - cache_valids(to_integer(snoop_index))(i) <= '0'; + if snoop_hits(i) = '1' then + cache_valids(to_integer(get_index(snoop_paddr)))(i) <= '0'; end if; end loop; @@ -1477,6 +1485,9 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.reserve := r0.req.reserve; + req.first_dw := r0.req.atomic_first; + req.last_dw := r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1581,7 +1592,11 
@@ begin r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if req.dcbz = '0' then + if req.reserve = '1' then + -- stcx needs to wait until next cycle + -- for the reservation address check + r1.state <= DO_STCX; + elsif req.dcbz = '0' then r1.state <= STORE_WAIT_ACK; r1.full <= '0'; r1.slow_valid <= '1'; @@ -1593,6 +1608,9 @@ begin if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading @@ -1600,19 +1618,18 @@ begin if req.op = OP_STORE_MISS then r1.write_tag <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; end if; - r1.wb.we <= '1'; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; if req.op = OP_STORE_MISS then ev.store_miss <= '1'; end if; -- OP_NONE and OP_BAD do nothing - -- OP_BAD & OP_STCX_FAIL were handled above already + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => - when OP_STCX_FAIL => end case; when RELOAD_WAIT_ACK => @@ -1652,6 +1669,13 @@ begin else r1.mmu_done <= '1'; end if; + -- NB: for lqarx, set the reservation on the first + -- dword so that a snooped store between the two + -- dwords will kill the reservation. + if req.reserve = '1' and req.first_dw = '1' then + reservation.valid <= '1'; + reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + end if; end if; -- Check for completion @@ -1736,6 +1760,48 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; + + when DO_STCX => + if reservation.valid = '0' or kill_rsrv = '1' or + r1.req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) /= reservation.addr then + -- Wrong address, didn't have reservation, or lost reservation + -- Abandon the wishbone cycle if started and fail the stcx. 
+ r1.stcx_fail <= '1'; + r1.full <= '0'; + r1.ls_valid <= '1'; + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + reservation.valid <= '0'; + elsif r1.wb.cyc = '0' then + -- Right address and have reservation, so start the + -- wishbone cycle + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + else + if wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM + if r1.req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.wb.stb <= '0'; + end if; + if wishbone_in.ack = '1' then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + -- For stqcx., kill the reservation on the last dword + if r1.req.last_dw = '1' then + reservation.valid <= '0'; + end if; + end if; + end if; + end case; end if; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index dcacc75..e69a27e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -85,6 +85,7 @@ architecture behave of loadstore1 is xerc : xer_common_t; reserve : std_ulogic; atomic_qw : std_ulogic; + atomic_first : std_ulogic; atomic_last : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access @@ -110,7 +111,7 @@ architecture behave of loadstore1 is elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', - atomic_qw => '0', atomic_last => '0', + atomic_qw => '0', atomic_first => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', @@ -478,20 +479,20 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); + if l_in.repeat = '1' and l_in.update = '0' and addr(3) /= l_in.second then + misaligned := '1'; + end if; v.align_intr := l_in.reserve and misaligned; + v.atomic_first := not misaligned and not l_in.second; + v.atomic_last := not misaligned and 
(l_in.second or not l_in.repeat); + -- is this a quadword load or store? i.e. lq plq stq pstq lqarx stqcx. if l_in.repeat = '1' and l_in.update = '0' then - -- is the access aligned? - if misaligned = '0' and addr(3) = l_in.second then + if misaligned = '0' then -- Since the access is aligned we have to do it atomically v.atomic_qw := '1'; - v.atomic_last := l_in.second; else - -- lqarx/stqcx have to be aligned - if l_in.reserve = '1' then - v.align_intr := '1'; - end if; -- We require non-prefixed lq in LE mode to be aligned in order -- to avoid the case where RA = RT+1 and the second access faults -- after the first has overwritten RA. @@ -979,6 +980,7 @@ begin d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; + d_out.atomic_first <= stage1_req.atomic_first; d_out.atomic_last <= stage1_req.atomic_last; d_out.addr <= stage1_req.addr; d_out.byte_sel <= stage1_req.byte_sel; @@ -991,6 +993,7 @@ begin d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw; + d_out.atomic_first <= r2.req.atomic_first; d_out.atomic_last <= r2.req.atomic_last; d_out.addr <= r2.req.addr; d_out.byte_sel <= r2.req.byte_sel; From ba4614c5f4cd6fa56079151a6be44f92790e0b2b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 May 2021 20:26:09 +1000 Subject: [PATCH 15/21] dcache: Implement data cache touch and flush instructions This implements dcbf, dcbt and dcbtst in the dcache. The dcbst (data cache block store) instruction remains a no-op because our dcache is write-through and therefore never has modified data that could need to be written back. Dcbt (data cache block touch) and dcbtst (data cache block touch for store) behave similarly except that dcbtst is a no-op on a readonly page. Neither instruction ever causes an interrupt. 
If they miss in the cache and the page is cacheable, they are handled like a load miss except that they complete as soon as the state machine starts handling the load miss rather than waiting for any data. Dcbf (data cache block flush) can cause a data storage interrupt. If it hits in the cache, the state machine goes to a new FLUSH_CYCLE state in which the cache line valid bit is cleared. In order to avoid having more than 8 values in op_t, this combines OP_STORE_MISS and OP_STORE_HIT into a single state. A new OP_NOP state is used for operations which can complete immediately without changing any dcache state (now used for dcbt/dcbtst causing an access exception or on a non-cachable page, or dcbf that misses the cache). Signed-off-by: Paul Mackerras --- common.vhdl | 5 +++ dcache.vhdl | 109 ++++++++++++++++++++++++++++++---------------- decode1.vhdl | 8 ++-- decode_types.vhdl | 5 ++- execute1.vhdl | 11 +++-- loadstore1.vhdl | 25 +++++++++-- 6 files changed, 110 insertions(+), 53 deletions(-) diff --git a/common.vhdl b/common.vhdl index 6df5b6b..b1a2c8e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -603,6 +603,8 @@ package common is hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op @@ -614,6 +616,9 @@ package common is data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; + constant Loadstore1ToDcacheInit : Loadstore1ToDcacheType := + (addr => (others => '0'), data => (others => '0'), byte_sel => x"00", + others => '0'); type DcacheToLoadstore1Type is record valid : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 68f3b60..82ae791 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -187,15 +187,17 @@ architecture rtl of dcache is OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable
load - OP_STORE_HIT, -- Store hitting cache - OP_STORE_MISS); -- Store missing cache - + OP_STORE, -- Store, whether hitting or missing cache + OP_NOP, -- nothing to do, just complete the op + OP_MISC); -- Flush + -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack - DO_STCX); -- Check for stcx. validity + DO_STCX, -- Check for stcx. validity + FLUSH_CYCLE); -- Cycle for invalidating cache line -- -- Dcache operations: @@ -289,12 +291,15 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; reserve : std_ulogic; first_dw : std_ulogic; last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; hit_way : way_t; same_tag : std_ulogic; mmu_req : std_ulogic; @@ -377,6 +382,7 @@ architecture rtl of dcache is -- Async signals on incoming request signal req_index : index_t; signal req_hit_way : way_t; + signal req_is_hit : std_ulogic; signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); @@ -568,12 +574,9 @@ begin assert (d_in.valid and m_in.valid) = '0' report "request collision loadstore vs MMU"; if m_in.valid = '1' then + r.req := Loadstore1ToDcacheInit; r.req.valid := '1'; r.req.load := not (m_in.tlbie or m_in.tlbld); - r.req.dcbz := '0'; - r.req.nc := '0'; - r.req.reserve := '0'; - r.req.virt_mode := '0'; r.req.priv_mode := '1'; r.req.addr := m_in.addr; r.req.data := m_in.pte; @@ -1077,13 +1080,17 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, -- if use_forward_rl is 1 then we can consider this a hit. 
- is_hit := not r0.req.load or r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or + -- For a touch, since the line we want is being reloaded already, + -- consider this a hit. + is_hit := not r0.req.load or r0.req.touch or + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; + req_is_hit <= is_hit; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1098,17 +1105,32 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if access_ok = '0' then + if r0.req.touch = '1' then + if access_ok = '1' and is_hit = '0' and nc = '0' then + op := OP_LOAD_MISS; + elsif access_ok = '1' and is_hit = '1' and nc = '0' then + -- Make this OP_LOAD_HIT so the PLRU gets updated + op := OP_LOAD_HIT; + else + op := OP_NOP; + end if; + elsif access_ok = '0' then op := OP_BAD; + elsif r0.req.flush = '1' then + if is_hit = '0' then + op := OP_NOP; + else + op := OP_MISC; + end if; else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; when "110" => op := OP_LOAD_NC; - when "001" => op := OP_STORE_HIT; - when "000" => op := OP_STORE_MISS; - when "010" => op := OP_STORE_MISS; + when "001" => op := OP_STORE; + when "000" => op := OP_STORE; + when "010" => op := OP_STORE; when "011" => op := OP_BAD; when "111" => op := OP_BAD; when others => op := OP_NONE; @@ -1348,8 +1370,8 @@ begin end if; -- The cache hit indication is used for PLRU updates - if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then - r1.cache_hit <= '1'; + if req_op = OP_LOAD_HIT or req_op = OP_STORE then + r1.cache_hit <= req_is_hit; else r1.cache_hit <= '0'; end if; @@ -1430,7 +1452,7 @@ begin r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT then + if 
req_op = OP_LOAD_HIT or req_op = OP_NOP then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else @@ -1446,7 +1468,7 @@ begin if req_go = '1' and access_ok = '1' and r0.req.load = '1' and r0.req.reserve = '1' and r0.req.atomic_first = '1' then reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); - if req_op = OP_LOAD_HIT then + if req_is_hit = '1' then reservation.valid <= not req_snoop_hit; end if; end if; @@ -1485,6 +1507,8 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.flush := r0.req.flush; + req.touch := r0.req.touch; req.reserve := r0.req.reserve; req.first_dw := r0.req.atomic_first; req.last_dw := r0.req.atomic_last; @@ -1504,12 +1528,13 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.is_hit := req_is_hit; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies req_op = OP_NONE if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or - req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + req_op = OP_STORE or req_op = OP_MISC then r1.req <= req; r1.full <= '1'; end if; @@ -1523,7 +1548,7 @@ begin r1.victim_way <= plru_victim; report "victim way:" & to_hstring(plru_victim); end if; - if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + if req_op = OP_LOAD_MISS or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; @@ -1555,7 +1580,7 @@ begin r1.reload_tag <= get_tag(req.real_addr); r1.req.same_tag <= '1'; - if req.op = OP_STORE_HIT then + if req.is_hit = '1' then r1.store_way <= req.hit_way; end if; @@ -1585,13 +1610,20 @@ begin r1.write_tag <= '1'; ev.load_miss <= '1'; + -- If this is a touch, complete the instruction + if req.touch = '1' then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + when OP_LOAD_NC => r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; - when OP_STORE_HIT | OP_STORE_MISS 
=> + when OP_STORE => if req.reserve = '1' then -- stcx needs to wait until next cycle -- for the reservation address check @@ -1605,9 +1637,7 @@ begin else r1.mmu_done <= '1'; end if; - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1615,21 +1645,24 @@ begin -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading r1.state <= RELOAD_WAIT_ACK; - if req.op = OP_STORE_MISS then - r1.write_tag <= '1'; - end if; + r1.write_tag <= not req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; end if; - if req.op = OP_STORE_MISS then - ev.store_miss <= '1'; + if req.op = OP_STORE then + ev.store_miss <= not req.is_hit; end if; + when OP_MISC => + r1.state <= FLUSH_CYCLE; + -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_NOP were handled above already when OP_NONE => when OP_BAD => + when OP_NOP => + end case; when RELOAD_WAIT_ACK => @@ -1712,14 +1745,12 @@ begin end if; assert not is_X(acks); if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then + req.op = OP_STORE then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.full <= '0'; r1.slow_valid <= '1'; -- Store requests never come from the MMU @@ -1783,9 +1814,7 @@ begin if wishbone_in.stall = '0' then -- Store has been accepted, so now we can write the -- cache data RAM - if r1.req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.stb <= '0'; end if; if wishbone_in.ack = '1' then @@ -1802,6 +1831,12 @@ begin end if; end if; + when FLUSH_CYCLE => + cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + 
r1.ls_valid <= '1'; + r1.state <= IDLE; end case; end if; end if; diff --git a/decode1.vhdl b/decode1.vhdl index 643523b..9047cf8 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -129,10 +129,10 @@ architecture behaviour of decode1 is INSN_crorc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crxor => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbf => (LDST, NONE, OP_DCBF, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbt => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbtst => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbz => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', 
'0', '0', '0', '0', '1', RCOE, '0', '0', NONE), INSN_divde => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), @@ -200,7 +200,7 @@ architecture behaviour of decode1 is INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_icbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), diff --git a/decode_types.vhdl b/decode_types.vhdl index 03e958b..5695643 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -7,8 +7,9 @@ package decode_types is OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_COUNTB, OP_CROP, - OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, - OP_DCBZ, OP_ICBI, + OP_DARN, OP_DCBF, OP_DCBST, OP_DCBZ, + OP_SPARE, + OP_ICBI, OP_ICBT, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, OP_DIV, OP_DIVE, OP_MOD, OP_EXTS, OP_EXTSWSLI, diff --git a/execute1.vhdl b/execute1.vhdl index 
ecb1e63..84a6fbe 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1184,8 +1184,8 @@ begin else illegal := '1'; end if; - when OP_NOP | OP_DCBF | OP_DCBST | OP_XCBT | OP_DCBTST => - -- Do nothing + when OP_NOP | OP_DCBST | OP_ICBT => + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -1653,11 +1653,10 @@ begin v.e.srr1 := (others => '0'); v.e.srr1(47 - 33) := '1'; v.e.srr1(47 - 34) := ex1.prev_prefixed; - if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or - ex1.prev_op = OP_XCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or + ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; - elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or - ex1.prev_op = OP_DCBTST then + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ then v.e.srr1(47 - 36) := '1'; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e69a27e..69d053d 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -61,6 +61,8 @@ architecture behave of loadstore1 is dc_req : std_ulogic; load : std_ulogic; store : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; read_spr : std_ulogic; @@ -100,7 +102,8 @@ architecture behave of loadstore1 is two_dwords : std_ulogic; incomplete : std_ulogic; end record; - constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', + flush => '0', touch => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', instr_fault => '0', do_update => '0', mode_32bit => '0', prefixed => '0', @@ -470,7 +473,7 @@ begin addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + long_sel := 
xfer_data_sel(l_in.length, addr(2 downto 0)); v.byte_sel := long_sel(7 downto 0); v.second_bytes := long_sel(15 downto 8); if long_sel(15 downto 8) /= "00000000" then @@ -505,6 +508,9 @@ begin case l_in.op is when OP_STORE => v.store := '1'; + if l_in.length = "0000" then + v.touch := '1'; + end if; when OP_LOAD => if l_in.update = '0' or l_in.second = '0' then v.load := '1'; @@ -512,10 +518,16 @@ begin -- Allow an extra cycle for SP->DP precision conversion v.load_sp := '1'; end if; + if l_in.length = "0000" then + v.touch := '1'; + end if; else -- write back address to RA v.do_update := '1'; end if; + when OP_DCBF => + v.load := '1'; + v.flush := '1'; when OP_DCBZ => v.dcbz := '1'; v.align_intr := v.nc; @@ -541,7 +553,7 @@ begin -- Work out controls for load and store formatting brev_lenm1 := "000"; if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; v.brev_mask := brev_lenm1; @@ -882,7 +894,8 @@ begin if d_in.valid = '1' then if r2.req.incomplete = '0' then - write_enable := r2.req.load and not r2.req.load_sp; + write_enable := r2.req.load and not r2.req.load_sp and + not r2.req.flush and not r2.req.touch; -- stores write back rA update do_update := r2.req.update and r2.req.store; end if; @@ -977,6 +990,8 @@ begin d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; + d_out.flush <= stage1_req.flush; + d_out.touch <= stage1_req.touch; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; @@ -990,6 +1005,8 @@ begin d_out.valid <= req; d_out.load <= r2.req.load; d_out.dcbz <= r2.req.dcbz; + d_out.flush <= r2.req.flush; + d_out.touch <= r2.req.touch; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw; From 0fbeaa2a015b3477c4e900e3ad46906cf3de49b7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Dec 2024 13:01:08 +1100 Subject: [PATCH 
16/21] dcache: Use discrete req_op_* signals instead of an encoded req_op Hopefully this will improve timing by reducing unnecessary dependencies and giving more opportunities for routing. Signed-off-by: Paul Mackerras --- dcache.vhdl | 206 ++++++++++++++++++++++------------------------------ 1 file changed, 85 insertions(+), 121 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 82ae791..1a1087c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -181,16 +181,6 @@ architecture rtl of dcache is constant real_mode_perm_attr : perm_attr_t := (nocache => '0', others => '1'); - -- Type of operation on a "valid" input - type op_t is (OP_NONE, - OP_BAD, -- NC cache hit, TLB miss, prot/RC failure - OP_LOAD_HIT, -- Cache hit on load - OP_LOAD_MISS, -- Load missing cache - OP_LOAD_NC, -- Non-cachable load - OP_STORE, -- Store, whether hitting or missing cache - OP_NOP, -- nothing to do, just complete the op - OP_MISC); -- Flush - -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack @@ -231,8 +221,9 @@ architecture rtl of dcache is -- Clock edge between cycle 1 and cycle 2: -- Request is stored in r1 (assuming r1.full was 0) -- The state machine transitions out of IDLE state for a load miss, - -- a store, a dcbz, or a non-cacheable load. r1.full is set to 1 - -- for a load miss, dcbz or non-cacheable load but not a store. + -- a store, a dcbz, a flush (dcbf) or a non-cacheable load. + -- r1.full is set to 1 for a load miss, dcbz, flush or + -- non-cacheable load but not a store. 
-- -- Cycle 2: Completion signals are asserted for a load hit, -- a store (excluding dcbz), a TLB operation, a conditional @@ -288,7 +279,10 @@ architecture rtl of dcache is signal r0_full : std_ulogic; type mem_access_request_t is record - op : op_t; + op_lmiss : std_ulogic; + op_store : std_ulogic; + op_flush : std_ulogic; + nc : std_ulogic; valid : std_ulogic; dcbz : std_ulogic; flush : std_ulogic; @@ -380,14 +374,20 @@ architecture rtl of dcache is signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request - signal req_index : index_t; - signal req_hit_way : way_t; - signal req_is_hit : std_ulogic; - signal req_tag : cache_tag_t; - signal req_op : op_t; - signal req_data : std_ulogic_vector(63 downto 0); - signal req_same_tag : std_ulogic; - signal req_go : std_ulogic; + signal req_index : index_t; + signal req_hit_way : way_t; + signal req_is_hit : std_ulogic; + signal req_tag : cache_tag_t; + signal req_op_load_hit : std_ulogic; + signal req_op_load_miss : std_ulogic; + signal req_op_store : std_ulogic; + signal req_op_flush : std_ulogic; + signal req_op_bad : std_ulogic; + signal req_op_nop : std_ulogic; + signal req_data : std_ulogic_vector(63 downto 0); + signal req_same_tag : std_ulogic; + signal req_go : std_ulogic; + signal req_nc : std_ulogic; signal early_req_row : row_t; signal early_rd_valid : std_ulogic; @@ -912,8 +912,6 @@ begin variable rindex : index_t; variable is_hit : std_ulogic; variable hit_way : way_t; - variable op : op_t; - variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; variable nc : std_ulogic; variable s_hit : std_ulogic; @@ -1103,42 +1101,41 @@ begin -- operation needs to be done -- nc := r0.req.nc or perm_attr.nocache; - op := OP_NONE; + req_op_bad <= '0'; + req_op_load_hit <= '0'; + req_op_load_miss <= '0'; + req_op_store <= '0'; + req_op_nop <= '0'; + req_op_flush <= '0'; if go = '1' then if r0.req.touch = '1' then if access_ok = '1' and is_hit = '0' and nc = '0' then - op := OP_LOAD_MISS; + 
req_op_load_miss <= '1'; elsif access_ok = '1' and is_hit = '1' and nc = '0' then -- Make this OP_LOAD_HIT so the PLRU gets updated - op := OP_LOAD_HIT; + req_op_load_hit <= '1'; else - op := OP_NOP; + req_op_nop <= '1'; end if; elsif access_ok = '0' then - op := OP_BAD; + req_op_bad <= '1'; elsif r0.req.flush = '1' then if is_hit = '0' then - op := OP_NOP; + req_op_nop <= '1'; else - op := OP_MISC; + req_op_flush <= '1'; end if; + elsif nc = '1' and is_hit = '1' then + req_op_bad <= '1'; + elsif r0.req.load = '0' then + req_op_store <= '1'; -- includes dcbz else - opsel := r0.req.load & nc & is_hit; - case opsel is - when "101" => op := OP_LOAD_HIT; - when "100" => op := OP_LOAD_MISS; - when "110" => op := OP_LOAD_NC; - when "001" => op := OP_STORE; - when "000" => op := OP_STORE; - when "010" => op := OP_STORE; - when "011" => op := OP_BAD; - when "111" => op := OP_BAD; - when others => op := OP_NONE; - end case; + req_op_load_hit <= is_hit; + req_op_load_miss <= not is_hit; -- includes non-cacheable loads end if; end if; - req_op <= op; req_go <= go; + req_nc <= nc; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. @@ -1309,14 +1306,6 @@ begin variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then - if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & to_hstring(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & to_hstring(req_hit_way); - end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; @@ -1362,21 +1351,10 @@ begin r1.forward_valid <= '1'; end if; - -- Fast path for load/store hits. Set signals for the writeback controls. 
- if req_op = OP_LOAD_HIT then - r1.hit_load_valid <= '1'; - else - r1.hit_load_valid <= '0'; - end if; - - -- The cache hit indication is used for PLRU updates - if req_op = OP_LOAD_HIT or req_op = OP_STORE then - r1.cache_hit <= req_is_hit; - else - r1.cache_hit <= '0'; - end if; + r1.hit_load_valid <= req_op_load_hit; + r1.cache_hit <= req_op_load_hit or (req_op_store and req_is_hit); -- causes PLRU update - if req_op = OP_BAD then + if req_op_bad = '1' then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.ls_error <= not r0.mmu_req; @@ -1449,16 +1427,11 @@ begin r1.write_bram <= '0'; r1.stcx_fail <= '0'; - r1.ls_valid <= '0'; + r1.ls_valid <= (req_op_load_hit or req_op_nop) and not r0.mmu_req; -- complete tlbies and TLB loads in the third cycle - r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_NOP then - if r0.mmu_req = '0' then - r1.ls_valid <= '1'; - else - r1.mmu_done <= '1'; - end if; - end if; + r1.mmu_done <= (r0_valid and (r0.tlbie or r0.tlbld)) or + (req_op_load_hit and r0.mmu_req); + -- The kill_rsrv2 term covers the case where the reservation -- address was set at the beginning of this cycle, and a store -- to that address happened in the previous cycle. @@ -1499,11 +1472,14 @@ begin end if; -- Take request from r1.req if there is one there, - -- else from req_op, ra, etc. + -- else from req_op_*, ra, etc. 
if r1.full = '1' then req := r1.req; else - req.op := req_op; + req.op_lmiss := req_op_load_miss; + req.op_store := req_op_store; + req.op_flush := req_op_flush; + req.nc := req_nc; req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; @@ -1532,9 +1508,8 @@ begin req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request - -- Note that r1.full = 1 implies req_op = OP_NONE - if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or - req_op = OP_STORE or req_op = OP_MISC then + -- Note that r1.full = 1 implies none of the req_op_* are 1 + if req_op_load_miss = '1' or req_op_store = '1' or req_op_flush = '1' then r1.req <= req; r1.full <= '1'; end if; @@ -1548,7 +1523,7 @@ begin r1.victim_way <= plru_victim; report "victim way:" & to_hstring(plru_victim); end if; - if req_op = OP_LOAD_MISS or (r0.req.dcbz = '1' and req_is_hit = '0') then + if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; @@ -1584,46 +1559,43 @@ begin r1.store_way <= req.hit_way; end if; - -- Reset per-row valid bits, ready for handling OP_LOAD_MISS + -- Reset per-row valid bits, ready for handling the next load miss for i in 0 to ROW_PER_LINE - 1 loop r1.rows_valid(i) <= '0'; end loop; - case req.op is - when OP_LOAD_HIT => - -- stay in IDLE state - - when OP_LOAD_MISS => + if req.op_lmiss = '1' then -- Normal load cache miss, start the reload machine - -- - report "cache miss real addr:" & to_hstring(req.real_addr) & - " idx:" & to_hstring(get_index(req.real_addr)) & - " tag:" & to_hstring(get_tag(req.real_addr)); + -- Or non-cacheable load + if req.nc = '0' then + report "cache miss real addr:" & to_hstring(req.real_addr) & + " idx:" & to_hstring(get_index(req.real_addr)) & + " tag:" & to_hstring(get_tag(req.real_addr)); + end if; -- Start the wishbone cycle r1.wb.we <= '0'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; - -- Track that we had one request sent - r1.state <= RELOAD_WAIT_ACK; - r1.write_tag 
<= '1'; - ev.load_miss <= '1'; + if req.nc = '0' then + -- Track that we had one request sent + r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; + ev.load_miss <= '1'; - -- If this is a touch, complete the instruction - if req.touch = '1' then - r1.full <= '0'; - r1.slow_valid <= '1'; - r1.ls_valid <= '1'; + -- If this is a touch, complete the instruction + if req.touch = '1' then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + else + r1.state <= NC_LOAD_WAIT_ACK; end if; + end if; - when OP_LOAD_NC => - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '0'; - r1.state <= NC_LOAD_WAIT_ACK; - - when OP_STORE => + if req.op_store = '1' then if req.reserve = '1' then -- stcx needs to wait until next cycle -- for the reservation address check @@ -1650,20 +1622,12 @@ begin r1.wb.cyc <= '1'; r1.wb.stb <= '1'; end if; - if req.op = OP_STORE then - ev.store_miss <= not req.is_hit; - end if; + ev.store_miss <= not req.is_hit; + end if; - when OP_MISC => + if req.op_flush = '1' then r1.state <= FLUSH_CYCLE; - - -- OP_NONE and OP_BAD do nothing - -- OP_BAD & OP_NOP were handled above already - when OP_NONE => - when OP_BAD => - when OP_NOP => - - end case; + end if; when RELOAD_WAIT_ACK => -- If we are still sending requests, was one accepted ? 
@@ -1693,7 +1657,7 @@ begin assert not is_X(r1.req.real_addr); end if; if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and + ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op_lmiss = '1') and r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; @@ -1745,7 +1709,7 @@ begin end if; assert not is_X(acks); if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - req.op = OP_STORE then + req.op_store = '1' then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; @@ -1854,7 +1818,7 @@ begin r1.wb.stb & r1.wb.cyc & d_out.error & d_out.valid & - std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + req_op_load_miss & req_op_store & req_op_bad & stall_out & std_ulogic_vector(resize(tlb_hit_way, 3)) & valid_ra & From c2dcf4b3348e4b1eb9ef6a2284b411e12c007033 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jan 2025 13:22:49 +1100 Subject: [PATCH 17/21] dcache: Generate a DSI on larx/stcx to non-cacheable memory Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 8 +++++--- loadstore1.vhdl | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/common.vhdl b/common.vhdl index b1a2c8e..425bb79 100644 --- a/common.vhdl +++ b/common.vhdl @@ -626,6 +626,7 @@ package common is store_done : std_ulogic; error : std_ulogic; cache_paradox : std_ulogic; + reserve_nc : std_ulogic; end record; type DcacheEventType is record diff --git a/dcache.vhdl b/dcache.vhdl index 1a1087c..5eb659d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -353,6 +353,7 @@ architecture rtl of dcache is mmu_done : std_ulogic; mmu_error : std_ulogic; cache_paradox : std_ulogic; + reserve_nc : std_ulogic; -- Signal to complete a failed stcx. 
stcx_fail : std_ulogic; @@ -1125,7 +1126,7 @@ begin else req_op_flush <= '1'; end if; - elsif nc = '1' and is_hit = '1' then + elsif nc = '1' and (is_hit = '1' or r0.req.reserve = '1') then req_op_bad <= '1'; elsif r0.req.load = '0' then req_op_store <= '1'; -- includes dcbz @@ -1167,6 +1168,7 @@ begin d_out.store_done <= not r1.stcx_fail; d_out.error <= r1.ls_error; d_out.cache_paradox <= r1.cache_paradox; + d_out.reserve_nc <= r1.reserve_nc; -- Outputs to MMU m_out.done <= r1.mmu_done; @@ -1354,16 +1356,16 @@ begin r1.hit_load_valid <= req_op_load_hit; r1.cache_hit <= req_op_load_hit or (req_op_store and req_is_hit); -- causes PLRU update + r1.cache_paradox <= access_ok and req_nc and req_is_hit; + r1.reserve_nc <= access_ok and r0.req.reserve and req_nc; if req_op_bad = '1' then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.ls_error <= not r0.mmu_req; r1.mmu_error <= r0.mmu_req; - r1.cache_paradox <= access_ok; else r1.ls_error <= '0'; r1.mmu_error <= '0'; - r1.cache_paradox <= '0'; end if; -- Record TLB hit information for updating TLB PLRU diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 69d053d..5e69352 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -738,7 +738,8 @@ begin end if; interrupt := (r2.req.valid and r2.req.align_intr) or - (d_in.error and d_in.cache_paradox) or m_in.err; + (d_in.error and (d_in.cache_paradox or d_in.reserve_nc)) or + m_in.err; if interrupt = '1' then v.req.valid := '0'; v.busy := '0'; @@ -905,6 +906,7 @@ begin -- signal an interrupt straight away exception := '1'; dsisr(63 - 38) := not r2.req.load; + dsisr(63 - 37) := d_in.reserve_nc; -- XXX there is no architected bit for this -- (probably should be a machine check in fact) dsisr(63 - 35) := d_in.cache_paradox; From 00efcc2c3b446bbdd41daca1c70ec826e9d00a5c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jan 2025 13:40:21 +1100 Subject: [PATCH 18/21] 
dcache: Make aligned quadword loads and stores actually be atomic This implements logic in the dcache to make aligned quadword loads and stores atomic with respect to other mechanisms that access memory. Such loads and stores are already marked with the atomic_qw bit in Loadstore1ToDcacheType. For quadword loads where the first dword access hits in the cache, we record the fact of the hit and the cache way used (r1.prev_hit and r1.prev_way). The second dword access then assumes a hit on the same way even if the cache line has been invalidated in the meantime by a snooped store. This has the same effect as loading both dwords at the time of the first dword load. For a lqarx, the reservation is set at the time of the first dword load, so if there is such a snooped store, the reservation will be invalid by the time the lqarx completes. If the first dword load hits on the cache line being refilled, so should the second, unless the refill finishes. In that case we set r1.prev_hit and r1.prev_way so the second load can use the line just refilled (but only if the first dword hit the line being refilled). For stores, the r1.atomic_more flag is set on the first dword store, and that causes the STORE_WAIT_ACK state to wait for the next request without dropping cyc, so it is not possible for another wishbone master to insert an access between the writes of the two dwords to memory. For store-conditionals, DO_STCX state now transitions to STORE_WAIT_ACK state once the store has been accepted (stall is false). This means that the second store for a stqcx can be handled in the same way as the second store for a stq. Once the first store for a stqcx has succeeded, the second store is done unconditionally. 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 141 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 45 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 5eb659d..f4403e4 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -264,6 +264,23 @@ architecture rtl of dcache is -- subsequent load requests to the same line can be completed as -- soon as the necessary data comes in from memory, without -- waiting for the whole line to be read. + -- + -- Aligned loads and stores of a doubleword or less are atomic + -- because they are done in a single wishbone operation. + -- For quadword atomic loads and stores we rely on the wishbone + -- arbiter not interrupting access to a target once it has first + -- given access; i.e. once we have the main wishbone, no other + -- master gets access until we drop cyc. + -- + -- Note on loads potentially hitting the victim line that is + -- currently being replaced: the new tag is available starting + -- with the 3rd cycle of RELOAD_WAIT_ACK state. As long as the + -- first read on the wishbone takes at least one cycle (i.e. the + -- ack doesn't arrive in the same cycle as stb was asserted), + -- r1.full will be true at least until that 3rd cycle and so a load + -- following a load miss can't hit on the old tag of the victim + -- line. As long as ack is not generated combinationally from + -- stb, this will be fine. 
-- Stage 0 register, basically contains just the latched request type reg_stage_0_t is record @@ -307,12 +324,16 @@ architecture rtl of dcache is full : std_ulogic; -- have uncompleted request mmu_req : std_ulogic; -- request is from MMU req : mem_access_request_t; + atomic_more : std_ulogic; -- atomic request isn't finished -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; hit_index : index_t; cache_hit : std_ulogic; + prev_hit : std_ulogic; + prev_way : way_t; + prev_hit_reload : std_ulogic; -- TLB hit state tlb_hit : std_ulogic; @@ -389,6 +410,7 @@ architecture rtl of dcache is signal req_same_tag : std_ulogic; signal req_go : std_ulogic; signal req_nc : std_ulogic; + signal req_hit_reload : std_ulogic; signal early_req_row : row_t; signal early_rd_valid : std_ulogic; @@ -927,6 +949,7 @@ begin variable fwd_match : std_ulogic; variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable snoop_match : std_ulogic; + variable hit_reload : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -1071,6 +1094,7 @@ begin assert not is_X(rindex); assert not is_X(r1.store_index); end if; + hit_reload := '0'; if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and rindex = r1.store_index then -- Ignore is_hit from above, because a load miss writes the new tag @@ -1085,11 +1109,23 @@ begin r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; + hit_reload := is_hit; + elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and + r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then + -- For the second half of an atomic quadword load, just use the + -- same way as the first half, without considering whether the line + -- is valid; it is as if we had read the second dword at the same + -- time as the first dword, and the line was valid back then. 
+ -- (Cases where the line is currently being reloaded are handled above.) + -- NB lq to noncacheable isn't required to be atomic per the ISA. + is_hit := '1'; + hit_way := r1.prev_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; req_is_hit <= is_hit; + req_hit_reload <= hit_reload; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1418,6 +1454,8 @@ begin r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + r1.prev_hit <= '0'; + r1.prev_hit_reload <= '0'; reservation.valid <= '0'; reservation.addr <= (others => '0'); @@ -1443,9 +1481,7 @@ begin if req_go = '1' and access_ok = '1' and r0.req.load = '1' and r0.req.reserve = '1' and r0.req.atomic_first = '1' then reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); - if req_is_hit = '1' then - reservation.valid <= not req_snoop_hit; - end if; + reservation.valid <= req_is_hit and not req_snoop_hit; end if; -- Do invalidations from snooped stores to memory @@ -1488,8 +1524,8 @@ begin req.flush := r0.req.flush; req.touch := r0.req.touch; req.reserve := r0.req.reserve; - req.first_dw := r0.req.atomic_first; - req.last_dw := r0.req.atomic_last; + req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; + req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1528,6 +1564,11 @@ begin if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; + if req_go = '1' then + r1.prev_hit <= req_is_hit; + r1.prev_way <= req_hit_way; + r1.prev_hit_reload <= req_hit_reload; + end if; -- Update count of pending acks acks := r1.acks_pending; @@ -1549,6 +1590,7 @@ begin r1.wb.sel <= req.byte_sel; r1.wb.dat <= req.data; r1.dcbz <= req.dcbz; + r1.atomic_more <= not req.last_dw; -- Keep track of our index and way for subsequent stores. 
r1.store_index <= get_index(req.real_addr); @@ -1659,7 +1701,7 @@ begin assert not is_X(r1.req.real_addr); end if; if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op_lmiss = '1') and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; @@ -1668,12 +1710,9 @@ begin else r1.mmu_done <= '1'; end if; - -- NB: for lqarx, set the reservation on the first - -- dword so that a snooped store between the two - -- dwords will kill the reservation. - if req.reserve = '1' and req.first_dw = '1' then + -- NB: for lqarx, set the reservation on the first dword + if r1.req.reserve = '1' and r1.req.first_dw = '1' then reservation.valid <= '1'; - reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end if; end if; @@ -1690,6 +1729,10 @@ begin cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; ev.dcache_refill <= not r1.dcbz; + -- Second half of a lq/lqarx can assume a hit on this line now + -- if the first half hit this line. + r1.prev_hit <= r1.prev_hit_reload; + r1.prev_way <= r1.store_way; r1.state <= IDLE; end if; @@ -1703,6 +1746,10 @@ begin if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. + -- This could be either in r1.req or in r0. + -- Ignore store-conditionals, they have to go through + -- DO_STCX state, unless they are the second half of a + -- successful stqcx, which is handled here. 
if req.valid = '1' then r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <= req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS); @@ -1710,28 +1757,33 @@ begin r1.wb.sel <= req.byte_sel; end if; assert not is_X(acks); - if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - req.op_store = '1' then - r1.wb.stb <= '1'; - stbs_done := false; - r1.store_way <= req.hit_way; - r1.store_row <= get_row(req.real_addr); - r1.write_bram <= req.is_hit; - r1.full <= '0'; - r1.slow_valid <= '1'; - -- Store requests never come from the MMU - r1.ls_valid <= '1'; - stbs_done := false; + r1.wb.stb <= '0'; + if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and + (req.reserve = '0' or r1.atomic_more = '1') then + if acks < 7 then + r1.wb.stb <= '1'; + stbs_done := false; + r1.store_way <= req.hit_way; + r1.store_row <= get_row(req.real_addr); + r1.write_bram <= req.is_hit; + r1.atomic_more <= not req.last_dw; + r1.full <= '0'; + r1.slow_valid <= '1'; + -- Store requests never come from the MMU + r1.ls_valid <= '1'; + end if; else - r1.wb.stb <= '0'; stbs_done := true; + if req.valid = '1' then + r1.atomic_more <= '0'; + end if; end if; end if; -- Got ack ? See if complete. - if wishbone_in.ack = '1' then + if stbs_done and r1.atomic_more = '0' then assert not is_X(acks); - if stbs_done and acks = 1 then + if acks = 0 or (wishbone_in.ack = '1' and acks = 1) then r1.state <= IDLE; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1770,31 +1822,30 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; reservation.valid <= '0'; + -- If this is the first half of a stqcx., the second half + -- will fail also because the reservation is not valid. 
+ r1.state <= IDLE; elsif r1.wb.cyc = '0' then -- Right address and have reservation, so start the -- wishbone cycle r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; - else - if wishbone_in.stall = '0' then - -- Store has been accepted, so now we can write the - -- cache data RAM - r1.write_bram <= req.is_hit; - r1.wb.stb <= '0'; - end if; - if wishbone_in.ack = '1' then - r1.state <= IDLE; - r1.wb.cyc <= '0'; - r1.wb.stb <= '0'; - r1.full <= '0'; - r1.slow_valid <= '1'; - r1.ls_valid <= '1'; - -- For stqcx., kill the reservation on the last dword - if r1.req.last_dw = '1' then - reservation.valid <= '0'; - end if; - end if; + elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM and complete the request + r1.write_bram <= r1.req.is_hit; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + reservation.valid <= '0'; + -- For a stqcx, STORE_WAIT_ACK will issue the second half + -- without checking the reservation, which is what we want + -- given that the first half has gone out. + -- With r1.atomic_more set, STORE_WAIT_ACK won't exit to + -- IDLE state until it sees the second half. + r1.state <= STORE_WAIT_ACK; end if; when FLUSH_CYCLE => From 5121e0f392ee99513bedb5006d944e5d436b7d2e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 9 May 2021 19:58:59 +1000 Subject: [PATCH 19/21] core: Implement sync instructions This implements all the sync variants (sync, lwsync, ptesync, etc.) as a LSU op that gets sent down to the dcache and completes once the dcache state machine is idle. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 23 ++++++++++++++++++++--- decode1.vhdl | 2 +- loadstore1.vhdl | 9 +++++++-- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/common.vhdl b/common.vhdl index 425bb79..c04bbe4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -605,6 +605,7 @@ package common is dcbz : std_ulogic; flush : std_ulogic; touch : std_ulogic; + sync : std_ulogic; nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op diff --git a/dcache.vhdl b/dcache.vhdl index f4403e4..b921095 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -299,11 +299,13 @@ architecture rtl of dcache is op_lmiss : std_ulogic; op_store : std_ulogic; op_flush : std_ulogic; + op_sync : std_ulogic; nc : std_ulogic; valid : std_ulogic; dcbz : std_ulogic; flush : std_ulogic; touch : std_ulogic; + sync : std_ulogic; reserve : std_ulogic; first_dw : std_ulogic; last_dw : std_ulogic; @@ -404,6 +406,7 @@ architecture rtl of dcache is signal req_op_load_miss : std_ulogic; signal req_op_store : std_ulogic; signal req_op_flush : std_ulogic; + signal req_op_sync : std_ulogic; signal req_op_bad : std_ulogic; signal req_op_nop : std_ulogic; signal req_data : std_ulogic_vector(63 downto 0); @@ -1144,8 +1147,11 @@ begin req_op_store <= '0'; req_op_nop <= '0'; req_op_flush <= '0'; + req_op_sync <= '0'; if go = '1' then - if r0.req.touch = '1' then + if r0.req.sync = '1' then + req_op_sync <= '1'; + elsif r0.req.touch = '1' then if access_ok = '1' and is_hit = '0' and nc = '0' then req_op_load_miss <= '1'; elsif access_ok = '1' and is_hit = '1' and nc = '0' then @@ -1241,7 +1247,7 @@ begin report "completing ld/st with error"; end if; - -- Slow ops (load miss, NC, stores) + -- Slow ops (load miss, NC, stores, sync) if r1.slow_valid = '1' then report "completing store or load miss data=" & to_hstring(r1.data_out); end if; @@ -1517,12 +1523,14 @@ begin req.op_lmiss := req_op_load_miss; req.op_store := req_op_store; req.op_flush := 
req_op_flush; + req.op_sync := req_op_sync; req.nc := req_nc; req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; req.flush := r0.req.flush; req.touch := r0.req.touch; + req.sync := r0.req.sync; req.reserve := r0.req.reserve; req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; @@ -1547,7 +1555,8 @@ begin -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies none of the req_op_* are 1 - if req_op_load_miss = '1' or req_op_store = '1' or req_op_flush = '1' then + if req_op_load_miss = '1' or req_op_store = '1' or req_op_flush = '1' or + req_op_sync = '1' then r1.req <= req; r1.full <= '1'; end if; @@ -1673,6 +1682,14 @@ begin r1.state <= FLUSH_CYCLE; end if; + if req.op_sync = '1' then + -- sync/lwsync can complete now that the state machine + -- is idle. + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + when RELOAD_WAIT_ACK => -- If we are still sending requests, was one accepted ? 
if wishbone_in.stall = '0' and r1.wb.stb = '1' then diff --git a/decode1.vhdl b/decode1.vhdl index 9047cf8..7b480a3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -377,7 +377,7 @@ architecture behaviour of decode1 is INSN_subfic => (ALU, NONE, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_subfme => (ALU, NONE, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RCOE, '0', '0', NONE), INSN_subfze => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RCOE, '0', '0', NONE), - INSN_sync => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_sync => (LDST, NONE, OP_SYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), INSN_td => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_tdi => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_tlbie => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 5e69352..485947b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -63,6 +63,7 @@ architecture behave of loadstore1 is store : std_ulogic; flush : std_ulogic; touch : std_ulogic; + sync : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; read_spr : std_ulogic; @@ -103,7 +104,7 @@ architecture behave of loadstore1 is incomplete : std_ulogic; end record; constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', - flush => '0', touch => '0', tlbie => '0', + flush => '0', touch => '0', sync => '0', tlbie => 
'0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', instr_fault => '0', do_update => '0', mode_32bit => '0', prefixed => '0', @@ -506,6 +507,8 @@ begin end if; case l_in.op is + when OP_SYNC => + v.sync := '1'; when OP_STORE => v.store := '1'; if l_in.length = "0000" then @@ -547,7 +550,7 @@ begin v.mmu_op := '1'; when others => end case; - v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; + v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz) and not v.align_intr; v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting @@ -994,6 +997,7 @@ begin d_out.dcbz <= stage1_req.dcbz; d_out.flush <= stage1_req.flush; d_out.touch <= stage1_req.touch; + d_out.sync <= stage1_req.sync; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; @@ -1009,6 +1013,7 @@ begin d_out.dcbz <= r2.req.dcbz; d_out.flush <= r2.req.flush; d_out.touch <= r2.req.touch; + d_out.sync <= r2.req.sync; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw; From d531e8aa1077f81f0b48d8112b3c5d9af684d453 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jan 2025 22:11:06 +1100 Subject: [PATCH 20/21] dcache: Improve timing Previously we only put slow requests in r1.req, but that caused timing problems because it meant the clock enable for all the registers in r1.req depended on whether we have a TLB and cache hit or not. Now we put any valid request (i.e. with req_go = 1) into r1.req, which has better timing because req_go is a relatively simple function of registered values (r0_full, r0_valid, r0.tlbie, r0.tlbld, r1.full, r1.ls_error, d_in.hold). We still have to work out if we have a slow request, but that is only needed for the D input of one register (r1.full). 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index b921095..ce7b351 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1554,11 +1554,12 @@ begin req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request - -- Note that r1.full = 1 implies none of the req_op_* are 1 - if req_op_load_miss = '1' or req_op_store = '1' or req_op_flush = '1' or - req_op_sync = '1' then + -- Note that r1.full = 1 implies none of the req_op_* are 1. + -- For the sake of timing we put any valid request in r1.req, + -- but only set r1.full if it is a slow request. + if req_go = '1' then r1.req <= req; - r1.full <= '1'; + r1.full <= req_op_load_miss or req_op_store or req_op_flush or req_op_sync; end if; end if; From 7437f699cab9701199ed9a50de20a9ea8ec0be23 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 8 Jan 2025 18:16:26 +1100 Subject: [PATCH 21/21] core: Implement the PIR SPR This reports the CPU core number, currently always 0, but this will be useful in future for distinguishing which CPU is which in a multiprocessor system. 
Signed-off-by: Paul Mackerras --- common.vhdl | 2 ++ core.vhdl | 2 ++ decode1.vhdl | 2 ++ execute1.vhdl | 5 ++++- soc.vhdl | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/common.vhdl b/common.vhdl index c04bbe4..76eaec2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -63,6 +63,7 @@ package common is constant SPR_UDSCR : spr_num_t := 3; constant SPR_DSCR : spr_num_t := 17; constant SPR_VRSAVE : spr_num_t := 256; + constant SPR_PIR : spr_num_t := 1023; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -172,6 +173,7 @@ package common is constant SPRSEL_HEIR : spr_selector := 4x"9"; constant SPRSEL_CTRL : spr_selector := 4x"a"; constant SPRSEL_DSCR : spr_selector := 4x"b"; + constant SPRSEL_PIR : spr_selector := 4x"c"; constant SPRSEL_XER : spr_selector := 4x"f"; -- FSCR and HFSCR bit numbers diff --git a/core.vhdl b/core.vhdl index bba1004..187e176 100644 --- a/core.vhdl +++ b/core.vhdl @@ -9,6 +9,7 @@ use work.wishbone_types.all; entity core is generic ( SIM : boolean := false; + CPU_INDEX : natural := 0; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; @@ -364,6 +365,7 @@ begin execute1_0: entity work.execute1 generic map ( SIM => SIM, + CPU_INDEX => CPU_INDEX, EX1_BYPASS => EX1_BYPASS, HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH diff --git a/decode1.vhdl b/decode1.vhdl index 7b480a3..0ea9ed1 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -486,6 +486,8 @@ architecture behaviour of decode1 is i.sel := SPRSEL_DSCR; when SPR_DSCR => i.sel := SPRSEL_DSCR; + when SPR_PIR => + i.sel := SPRSEL_PIR; when others => i.valid := '0'; end case; diff --git a/execute1.vhdl b/execute1.vhdl index 84a6fbe..3b7ec2f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -15,6 +15,7 @@ entity execute1 is SIM : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; + CPU_INDEX : natural; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -702,7 +703,8 @@ begin ex2 <= ex2in; ctrl 
<= ctrl_tmp; if valid_in = '1' then - report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + report "CPU " & natural'image(CPU_INDEX) & " execute " & to_hstring(e_in.nia) & + " op=" & insn_type_t'image(e_in.insn_type) & " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid) & " 2nd=" & std_ulogic'image(e_in.second); @@ -1874,6 +1876,7 @@ begin ctrl.heir when SPRSEL_HEIR, assemble_ctrl(ctrl, ex1.msr(MSR_PR)) when SPRSEL_CTRL, 39x"0" & ctrl.dscr when SPRSEL_DSCR, + 56x"0" & std_ulogic_vector(to_unsigned(CPU_INDEX, 8)) when SPRSEL_PIR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; diff --git a/soc.vhdl b/soc.vhdl index 71474df..3e3b438 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -351,6 +351,7 @@ begin processor: entity work.core generic map( SIM => SIM, + CPU_INDEX => 0, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,