From f8215561047e3811f255ee2b4fca6e6fc6243091 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Wed, 5 May 2021 13:45:51 -0500 Subject: [PATCH] Add MMA chapter Signed-off-by: Bill Schmidt --- Intrinsics_Reference/bk_main.xml | 1 + Intrinsics_Reference/ch_mma_reference.xml | 1014 +++++++++++++++++++++ Intrinsics_Reference/ch_vec_reference.xml | 2 +- 3 files changed, 1016 insertions(+), 1 deletion(-) create mode 100644 Intrinsics_Reference/ch_mma_reference.xml diff --git a/Intrinsics_Reference/bk_main.xml b/Intrinsics_Reference/bk_main.xml index 2a4ed58..c183b01 100644 --- a/Intrinsics_Reference/bk_main.xml +++ b/Intrinsics_Reference/bk_main.xml @@ -121,6 +121,7 @@ + diff --git a/Intrinsics_Reference/ch_mma_reference.xml b/Intrinsics_Reference/ch_mma_reference.xml new file mode 100644 index 0000000..7b68d4e --- /dev/null +++ b/Intrinsics_Reference/ch_mma_reference.xml @@ -0,0 +1,1014 @@ + + + + + Matrix Multiply Accelerate (MMA) Intrinsic Reference + +
+ Introduction + + Version 3.1 of the Power Instruction Set Architecture + Specification (see ) + introduced instructions to accelerate matrix multiplication + computations. These instructions operate both on the VSRs and + on new 512-bit accumulator registers (ACCs). Intrinsic + functions to access these instructions are described in this + chapter. + +
+ +
+ Type Support + + Many of the MMA instructions operate on aligned pairs of vectors + (that is, an even-numbered vector and the next higher-numbered + vector), or on aligned quads of vectors (that is, a vector whose + number is divisible by four, together with the three next higher-numbered + vectors). Compilers that support the MMA intrinsic functions + must define two types, __vector_pair and + __vector_quad, to represent these concepts. + Pointers and references to these types must also be supported + where these concepts exist in the source language. +
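For illustration, the following sketch assumes a compiler with MMA support (for example, GCC or Clang targeting POWER10); the function name is illustrative only. Objects of the two opaque types, and pointers to them, may be declared and passed as usual, but their contents are manipulated only through the intrinsic functions described later in this chapter.

/* A minimal sketch.  Zero the accumulator addressed by 'acc' (see
   "Accumulator Move Operations" below).  A __vector_pair object or
   parameter would be declared and passed in the same way.  */
void clear_accumulator (__vector_quad *acc)
{
  __builtin_mma_xxsetaccz (acc);
}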
+ +
+ Intrinsic Functions + + The intrinsics in this section are not overloaded. Each is + presented with its prototype and the instruction it represents. + The string "vuc" is used as shorthand for "vector unsigned + char" throughout. + +
+ Memory Access + + These intrinsics load and store vector pairs. + + + + + + + + + + Prototype + + + Instruction + + + + + + + + __vector_pair __builtin_vsx_lxvp (long long int a, const __vector_pair* b) + + + + + lxvp r,a(b) + + + + + + + void __builtin_vsx_stxvp (__vector_pair s, long long int a, const __vector_pair* b) + + + + + stxvp s,a(b) + + + + + + + +
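For example, a 32-byte copy can be written with one vector-pair load and one vector-pair store. This is a sketch only, and the function and parameter names are illustrative. The first argument of each builtin is a byte displacement added to the pointer argument, mirroring the displacement form of the underlying instructions.

/* Copy two adjacent vectors (32 bytes) from 'src' to 'dst' using a
   single vector-pair load and store.  */
void copy_vector_pair (__vector_pair *dst, const __vector_pair *src)
{
  __vector_pair tmp = __builtin_vsx_lxvp (0, src);
  __builtin_vsx_stxvp (tmp, 0, dst);
}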
+
+ Assembly and Disassembly of Large Types + + The following builtins are used to construct + __vector_pair and __vector_quad + objects from smaller vectors, and deconstruct them into such + vectors. The disassembly interfaces place the results into + arrays of vectors. + + + FIXME: Not clear when __builtin_mma versus __builtin_vsx is + used here. Document shows __builtin_vsx for pairs only. We + also have some late-breaking changes around endianness that + need to be properly documented. + + + FIXME: I've chosen not to include sample code generation here, + but I can be persuaded if folks think that's useful. + + + + + + + + + + void __builtin_mma_assemble_acc (__vector_quad*, vuc, vuc, vuc, vuc) + + + + + + + void __builtin_vsx_assemble_pair (__vector_pair*, vuc, vuc) + + + + + + + void __builtin_mma_disassemble_acc (void*, __vector_quad*) + + + + + + + void __builtin_vsx_disassemble_pair (void*, __vector_pair*) + + + + + + + +
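As an illustration of the intended use (a sketch only, with illustrative names, and subject to the endianness caveats noted above), the following routine assembles an accumulator from four vectors and then disassembles it into an array of four vectors:

typedef __vector unsigned char vuc;

/* Build a __vector_quad from four vectors, then unpack it into an
   array of four vectors.  The mapping between the input vectors and
   the unpacked elements is endian-dependent.  */
void assemble_and_disassemble (vuc v0, vuc v1, vuc v2, vuc v3, vuc out[4])
{
  __vector_quad quad;

  __builtin_mma_assemble_acc (&quad, v0, v1, v2, v3);
  __builtin_mma_disassemble_acc (out, &quad);
}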
+
+ Accumulator Move Operations + + These instructions move data from vector quads to accumulators + (a "priming" operation, xxmtacc), move data from accumulators back + to vector quads (a "depriming" operation, xxmfacc), or initialize + an accumulator to zeros (xxsetaccz). + + + + + + + + + + Prototype + + + Instruction + + + + + + + + void __builtin_mma_xxmfacc (__vector_quad* a) + + + + + xxmfacc a + + + + + + + void __builtin_mma_xxmtacc (__vector_quad* a) + + + + + xxmtacc a + + + + + + + void __builtin_mma_xxsetaccz (__vector_quad* a) + + + + + xxsetaccz a + + + + + + + +
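The intended pattern is sketched below with illustrative names: prime the accumulator from its four associated VSRs with xxmtacc, perform outer product operations on it, and copy the result back to the VSRs with xxmfacc. Note that a compiler may insert or omit the priming and depriming moves automatically around the outer product builtins.

/* A minimal sketch of the prime/compute/deprime pattern.  */
void accumulate (__vector_quad *acc)
{
  __builtin_mma_xxmtacc (acc);    /* prime: VSRs -> accumulator   */

  /* ... outer product operations on *acc (see the next section) ... */

  __builtin_mma_xxmfacc (acc);    /* deprime: accumulator -> VSRs */
}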
+
+ Outer Product Operations + + Each of these intrinsics generates an instruction to perform + an outer product operation. + + + + + + + + + + Prototype + + + Instruction + + + + + + + + void __builtin_mma_pmxvbf16ger2 (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvbf16ger2 a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvbf16ger2nn (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvbf16ger2nn a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvbf16ger2np (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvbf16ger2np a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvbf16ger2pn (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvbf16ger2pn a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvbf16ger2pp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvbf16ger2pp a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf16ger2 (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvf16ger2 a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf16ger2nn (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvf16ger2nn a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf16ger2np (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvf16ger2np a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf16ger2pn (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvf16ger2pn a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf16ger2pp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvf16ger2pp a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvf32ger (__vector_quad* a, vuc b, vuc c, + const int d, const int e) + + + + + pmxvf32ger a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf32gernn (__vector_quad* a, vuc b, vuc c, + const int d, const int e) + + + + + pmxvf32gernn a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf32gernp (__vector_quad* a, vuc b, vuc c, + const int d, const int e) + + + + + pmxvf32gernp a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf32gerpn (__vector_quad* a, vuc b, vuc c, + const int d, const int e) + + + + + pmxvf32gerpn a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf32gerpp (__vector_quad* a, vuc b, vuc c, + const int d, const int e) + + + + + pmxvf32gerpp a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf64ger (__vector_quad* a, __vector_pair b, + vuc c, const int d, const int e) + + + + + pmxvf64ger a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf64gernn (__vector_quad* a, __vector_pair b, + vuc c, const int d, const int e) + + + + + pmxvf64gernn a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf64gernp (__vector_quad* a, __vector_pair b, + vuc c, const int d, const int e) + + + + + pmxvf64gernp a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf64gerpn (__vector_quad* a, __vector_pair b, + vuc c, const int d, const int e) + + + + + pmxvf64gerpn a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvf64gerpp (__vector_quad* a, __vector_pair b, + vuc c, const int d, const int e) + + + + + pmxvf64gerpp a,b,c,d,e + + + + + + + + void __builtin_mma_pmxvi64ger2 (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi64ger2 a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi64ger2pp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi64ger2pp a,b,c,d,e,f + + + + + + + + 
void __builtin_mma_pmxvi64ger2s (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi64ger2s a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi64ger2spp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi64ger2spp a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi4ger8 (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi4ger8 a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi4ger8pp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi4ger8pp a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi8ger4 (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi8ger4 a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi8ger4pp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi8ger4pp a,b,c,d,e,f + + + + + + + + void __builtin_mma_pmxvi8ger4spp (__vector_quad* a, vuc b, vuc c, + const int d, const int e, const int f) + + + + + pmxvi8ger4spp a,b,c,d,e,f + + + + + + + + void __builtin_mma_xvbf16ger2 (__vector_quad* a, vuc b, vuc c) + + + + + xvbf16ger2 a,b,c + + + + + + + void __builtin_mma_xvbf16ger2nn (__vector_quad* a, vuc b, vuc c) + + + + + xvbf16ger2nn a,b,c + + + + + + + void __builtin_mma_xvbf16ger2np (__vector_quad* a, vuc b, vuc c) + + + + + xvbf16ger2np a,b,c + + + + + + + void __builtin_mma_xvbf16ger2pn (__vector_quad* a, vuc b, vuc c) + + + + + xvbf16ger2pn a,b,c + + + + + + + void __builtin_mma_xvbf16ger2pp (__vector_quad* a, vuc b, vuc c) + + + + + xvbf16ger2pp a,b,c + + + + + + + void __builtin_mma_xvf16ger2 (__vector_quad* a, vuc b, vuc c) + + + + + xvf16ger2 a,b,c + + + + + + + void __builtin_mma_xvf16ger2nn (__vector_quad* a, vuc b, vuc c) + + + + + xvf16ger2nn a,b,c + + + + + + + void __builtin_mma_xvf16ger2np (__vector_quad* a, vuc b, vuc c) + + + + + xvf16ger2np a,b,c + + + + + + + void __builtin_mma_xvf16ger2pn (__vector_quad* a, vuc b, vuc c) + + + + + xvf16ger2pn a,b,c + + + + + + + void __builtin_mma_xvf16ger2pp (__vector_quad* a, vuc b, vuc c) + + + + + xvf16ger2pp a,b,c + + + + + + + void __builtin_mma_xvf32ger (__vector_quad* a, vuc b, vuc c) + + + + + xvf32ger a,b,c + + + + + + + void __builtin_mma_xvf32gernn (__vector_quad* a, vuc b, vuc c) + + + + + xvf32gernn a,b,c + + + + + + + void __builtin_mma_xvf32gernp (__vector_quad* a, vuc b, vuc c) + + + + + xvf32gernp a,b,c + + + + + + + void __builtin_mma_xvf32gerpn (__vector_quad* a, vuc b, vuc c) + + + + + xvf32gerpn a,b,c + + + + + + + void __builtin_mma_xvf32gerpp (__vector_quad* a, vuc b, vuc c) + + + + + xvf32gerpp a,b,c + + + + + + + void __builtin_mma_xvf64ger (__vector_quad* a, __vector_pair b, vuc c) + + + + + xvf64ger a,b,c + + + + + + + void __builtin_mma_xvf64gernn (__vector_quad* a, __vector_pair b, vuc c) + + + + + xvf64gernn a,b,c + + + + + + + void __builtin_mma_xvf64gernp (__vector_quad* a, __vector_pair b, vuc c) + + + + + xvf64gernp a,b,c + + + + + + + void __builtin_mma_xvf64gerpn (__vector_quad* a, __vector_pair b, vuc c) + + + + + xvf64gerpn a,b,c + + + + + + + void __builtin_mma_xvf64gerpp (__vector_quad* a, __vector_pair b, vuc c) + + + + + xvf64gerpp a,b,c + + + + + + + void __builtin_mma_xvi16ger2 (__vector_quad* a, vuc b, vuc c) + + + + + xvi16ger2 a,b,c + + + + + + + void __builtin_mma_xvi16ger2pp (__vector_quad* a, vuc b, vuc c) + + + + + xvi16ger2pp a,b,c + + + + + + + void __builtin_mma_xvi16ger2s (__vector_quad* a, vuc b, vuc c) + + + + + xvi16ger2s a,b,c + + + + + + + 
void __builtin_mma_xvi16ger2spp (__vector_quad* a, vuc b, vuc c) + + + + + xvi16ger2spp a,b,c + + + + + + + void __builtin_mma_xvi4ger8 (__vector_quad* a, vuc b, vuc c) + + + + + xvi4ger8 a,b,c + + + + + + + void __builtin_mma_xvi4ger8pp (__vector_quad* a, vuc b, vuc c) + + + + + xvi4ger8pp a,b,c + + + + + + + void __builtin_mma_xvi8ger4 (__vector_quad* a, vuc b, vuc c) + + + + + xvi8ger4 a,b,c + + + + + + + void __builtin_mma_xvi8ger4pp (__vector_quad* a, vuc b, vuc c) + + + + + xvi8ger4pp a,b,c + + + + + + + void __builtin_mma_xvi8ger4spp (__vector_quad* a, vuc b, vuc c) + + + + + xvi8ger4spp a,b,c + + + + + + + +
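Putting the pieces together, the following sketch accumulates a rank-k update of a 4x4 block of single-precision values with xvf32gerpp and then writes the result to memory. It is illustrative only: it assumes GCC- or Clang-style MMA and vector support, the function and variable names are invented, and the ordering of the vectors produced by __builtin_mma_disassemble_acc is endian-dependent.

#include <altivec.h>

typedef __vector unsigned char vuc;

/* result[4][4] accumulates the sum over i of the outer products of
   acols[i] and brows[i], where each argument vector holds four
   single-precision values.  */
void f32_outer_product_sum (float result[4][4], const vuc *acols,
                            const vuc *brows, int k)
{
  __vector_quad acc;
  __vector float rows[4];
  int i;

  __builtin_mma_xxsetaccz (&acc);                /* acc = 0             */
  for (i = 0; i < k; i++)
    __builtin_mma_xvf32gerpp (&acc, acols[i], brows[i]);

  __builtin_mma_disassemble_acc (rows, &acc);    /* acc -> four vectors */
  for (i = 0; i < 4; i++)
    vec_xst (rows[i], 0, result[i]);             /* store one row       */
}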
+
+ +
diff --git a/Intrinsics_Reference/ch_vec_reference.xml b/Intrinsics_Reference/ch_vec_reference.xml index b0d7ec7..174dbf4 100644 --- a/Intrinsics_Reference/ch_vec_reference.xml +++ b/Intrinsics_Reference/ch_vec_reference.xml @@ -1,5 +1,5 @@