diff --git a/Intrinsics_Reference/bk_main.xml b/Intrinsics_Reference/bk_main.xml
index 2a4ed58..c183b01 100644
--- a/Intrinsics_Reference/bk_main.xml
+++ b/Intrinsics_Reference/bk_main.xml
@@ -121,6 +121,7 @@
+
diff --git a/Intrinsics_Reference/ch_mma_reference.xml b/Intrinsics_Reference/ch_mma_reference.xml
new file mode 100644
index 0000000..7b68d4e
--- /dev/null
+++ b/Intrinsics_Reference/ch_mma_reference.xml
@@ -0,0 +1,1014 @@
+
+
+
+
+ Matrix-Multiply Assist (MMA) Intrinsic Reference
+
+
+ Introduction
+
+ Version 3.1 of the Power Instruction Set Architecture
+ Specification (see )
+ introduced instructions to accelerate matrix multiplication
+ computations. These instructions operate both on the
+ vector-scalar registers (VSRs) and on new 512-bit accumulator
+ registers (ACCs). Intrinsic functions to access these
+ instructions are described in this chapter.
+
+
+
+
+ Type Support
+
+ Many of the MMA instructions operate on aligned pairs of vectors
+ (that is, an even-numbered vector and the next-higher numbered
+ vector), or on aligned quads of vectors (that is, a vector whose
+ number is divisible by four and the three next-higher numbered
+ vectors). Compilers that support the MMA intrinsic functions
+ must define two types, __vector_pair and
+ __vector_quad, to represent these concepts.
+ Pointers and references to these types must also be supported
+ where these concepts exist in the source language.
+
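+
+ The following minimal sketch (not part of the reference itself)
+ shows how the two opaque types, and pointers to them, might appear
+ in user code. It assumes a compiler with MMA support enabled (for
+ example, gcc -mcpu=power10); the names are illustrative only.
+
+ void declare_example (void)
+ {
+     __vector_pair pair;          /* an aligned even/odd pair of VSRs        */
+     __vector_quad quad;          /* four aligned VSRs backing a 512-bit ACC */
+     __vector_pair *pp = &pair;   /* pointers to the types must also work    */
+     __vector_quad *qp = &quad;
+     (void) pp; (void) qp;
+ }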
+
+
+
+ Intrinsic Functions
+
+ The intrinsics in this section are not overloaded. Each is
+ presented with its prototype and the instruction it represents.
+ The string "vuc" is used as shorthand for "vector unsigned
+ char" throughout.
+
+
+ Memory Access
+
+ Load and store vector pairs.
+
+
+
+
+
+
+
+
+
+ Prototype
+
+
+ Instruction
+
+
+
+
+
+
+
+ __vector_pair __builtin_vsx_lxvp (long long int a, const __vector_pair* b)
+
+
+
+
+ lxvp r,a(b)
+
+
+
+
+
+
+ void __builtin_vsx_stxvp (__vector_pair s, long long int a, const __vector_pair* b)
+
+
+
+
+ stxvp s,a(b)
+
+
+
+
+
+
+
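+
+ As a hedged usage sketch (not part of the reference), the
+ following function copies one vector pair using the load and
+ store builtins above. It assumes an MMA-enabled compiler; the
+ function and parameter names are illustrative.
+
+ void copy_pair (__vector_pair *src, __vector_pair *dst)
+ {
+     __vector_pair tmp = __builtin_vsx_lxvp (0, src);  /* lxvp  */
+     __builtin_vsx_stxvp (tmp, 0, dst);                /* stxvp */
+ }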
+
+
+ Assembly and Disassembly of Large Types
+
+ The following builtins are used to construct
+ __vector_pair and __vector_quad
+ objects from smaller vectors, and to deconstruct them back into
+ such vectors. The disassembly interfaces place the results into
+ arrays of vectors.
+
+
+ FIXME: Not clear when __builtin_mma versus __builtin_vsx is
+ used here. Document shows __builtin_vsx for pairs only. We
+ also have some late-breaking changes around endianness that
+ need to be properly documented.
+
+
+ FIXME: I've chosen not to include sample code generation here,
+ but I can be persuaded if folks think that's useful.
+
+
+
+
+
+
+
+
+
+ void __builtin_mma_assemble_acc (__vector_quad*, vuc, vuc, vuc, vuc)
+
+
+
+
+
+
+ void __builtin_vsx_assemble_pair (__vector_pair*, vuc, vuc)
+
+
+
+
+
+
+ void __builtin_mma_disassemble_acc (void*, __vector_quad*)
+
+
+
+
+
+
+ void __builtin_vsx_disassemble_pair (void*, __vector_pair*)
+
+
+
+
+
+
+
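+
+ A short, hedged example (not part of the reference) of assembling
+ an accumulator from four vectors and disassembling it again.
+ Here "vuc" abbreviates vector unsigned char as in the tables, and
+ the ordering of the disassembled vectors may be subject to the
+ endianness caveats noted in the FIXME above.
+
+ typedef __vector unsigned char vuc;
+
+ void round_trip (vuc r0, vuc r1, vuc r2, vuc r3, vuc out[4])
+ {
+     __vector_quad acc;
+     __builtin_mma_assemble_acc (&acc, r0, r1, r2, r3);
+     __builtin_mma_disassemble_acc (out, &acc);   /* results placed in an array of vectors */
+ }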
+
+
+ Accumulator Move Operations
+
+ These instructions move data from vector quads to accumulators
+ (a "priming" operation) or vice versa (a "depriming"
+ operation), or initialize an accumulator to zeros.
+
+
+
+
+
+
+
+
+
+ Prototype
+
+
+ Instruction
+
+
+
+
+
+
+
+ void __builtin_mma_xxmfacc (__vector_quad* a)
+
+
+
+
+ xxmfacc a
+
+
+
+
+
+
+ void __builtin_mma_xxmtacc (__vector_quad* a)
+
+
+
+
+ xxmtacc a
+
+
+
+
+
+
+ void __builtin_mma_xxsetaccz (__vector_quad* a)
+
+
+
+
+ xxsetaccz a
+
+
+
+
+
+
+
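+
+ As an illustrative sketch (not part of the reference), a typical
+ sequence primes an accumulator, performs outer-product updates on
+ it, and then deprimes it so the associated VSRs hold the result:
+
+ void move_example (__vector_quad *acc)
+ {
+     __builtin_mma_xxmtacc (acc);    /* prime: move VSR contents into the ACC  */
+     /* ... outer-product operations update the accumulator here ... */
+     __builtin_mma_xxmfacc (acc);    /* deprime: move the ACC back to the VSRs */
+ }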
+
+
+ Outer Product Operations
+
+ Each of these intrinsics generates an instruction that performs
+ an outer product operation. The intrinsics whose names begin with
+ "pm" correspond to the prefixed, masked forms of the instructions;
+ their trailing const int arguments supply the immediate mask
+ fields that select which rows, columns, and (where applicable)
+ products participate in the operation.
+
+
+
+
+
+
+
+
+
+ Prototype
+
+
+ Instruction
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvbf16ger2 (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvbf16ger2 a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvbf16ger2nn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvbf16ger2nn a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvbf16ger2np (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvbf16ger2np a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvbf16ger2pn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvbf16ger2pn a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvbf16ger2pp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvbf16ger2pp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf16ger2 (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvf16ger2 a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf16ger2nn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvf16ger2nn a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf16ger2np (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvf16ger2np a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf16ger2pn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvf16ger2pn a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf16ger2pp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvf16ger2pp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf32ger (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e)
+
+
+
+
+ pmxvf32ger a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf32gernn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e)
+
+
+
+
+ pmxvf32gernn a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf32gernp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e)
+
+
+
+
+ pmxvf32gernp a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf32gerpn (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e)
+
+
+
+
+ pmxvf32gerpn a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf32gerpp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e)
+
+
+
+
+ pmxvf32gerpp a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf64ger (__vector_quad* a, __vector_pair b,
+ vuc c, const int d, const int e)
+
+
+
+
+ pmxvf64ger a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf64gernn (__vector_quad* a, __vector_pair b,
+ vuc c, const int d, const int e)
+
+
+
+
+ pmxvf64gernn a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf64gernp (__vector_quad* a, __vector_pair b,
+ vuc c, const int d, const int e)
+
+
+
+
+ pmxvf64gernp a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf64gerpn (__vector_quad* a, __vector_pair b,
+ vuc c, const int d, const int e)
+
+
+
+
+ pmxvf64gerpn a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvf64gerpp (__vector_quad* a, __vector_pair b,
+ vuc c, const int d, const int e)
+
+
+
+
+ pmxvf64gerpp a,b,c,d,e
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi16ger2 (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi16ger2 a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi16ger2pp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi16ger2pp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi16ger2s (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi16ger2s a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi16ger2spp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi16ger2spp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi4ger8 (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi4ger8 a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi4ger8pp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi4ger8pp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi8ger4 (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi8ger4 a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi8ger4pp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi8ger4pp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_pmxvi8ger4spp (__vector_quad* a, vuc b, vuc c,
+ const int d, const int e, const int f)
+
+
+
+
+ pmxvi8ger4spp a,b,c,d,e,f
+
+
+
+
+
+
+
+ void __builtin_mma_xvbf16ger2 (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvbf16ger2 a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvbf16ger2nn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvbf16ger2nn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvbf16ger2np (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvbf16ger2np a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvbf16ger2pn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvbf16ger2pn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvbf16ger2pp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvbf16ger2pp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf16ger2 (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf16ger2 a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf16ger2nn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf16ger2nn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf16ger2np (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf16ger2np a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf16ger2pn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf16ger2pn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf16ger2pp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf16ger2pp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf32ger (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf32ger a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf32gernn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf32gernn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf32gernp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf32gernp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf32gerpn (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf32gerpn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf32gerpp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvf32gerpp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf64ger (__vector_quad* a, __vector_pair b, vuc c)
+
+
+
+
+ xvf64ger a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf64gernn (__vector_quad* a, __vector_pair b, vuc c)
+
+
+
+
+ xvf64gernn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf64gernp (__vector_quad* a, __vector_pair b, vuc c)
+
+
+
+
+ xvf64gernp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf64gerpn (__vector_quad* a, __vector_pair b, vuc c)
+
+
+
+
+ xvf64gerpn a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvf64gerpp (__vector_quad* a, __vector_pair b, vuc c)
+
+
+
+
+ xvf64gerpp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi16ger2 (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi16ger2 a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi16ger2pp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi16ger2pp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi16ger2s (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi16ger2s a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi16ger2spp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi16ger2spp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi4ger8 (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi4ger8 a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi4ger8pp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi4ger8pp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi8ger4 (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi8ger4 a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi8ger4pp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi8ger4pp a,b,c
+
+
+
+
+
+
+ void __builtin_mma_xvi8ger4spp (__vector_quad* a, vuc b, vuc c)
+
+
+
+
+ xvi8ger4spp a,b,c
+
+
+
+
+
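+
+ The following hedged sketch (not part of the reference) accumulates
+ a 4x4 tile of single-precision products with xvf32gerpp and then
+ extracts the result with the disassembly builtin described earlier.
+ It assumes an MMA-enabled compiler and altivec.h; the function and
+ variable names are illustrative, and the row ordering of the
+ disassembled accumulator is simplified (see the endianness FIXME
+ above).
+
+ #include <altivec.h>
+
+ typedef __vector unsigned char vuc;
+
+ /* Accumulate k rank-one (outer-product) updates of 4-element
+    float vectors from a and b into a 4x4 result tile.  */
+ void tile_4x4 (const float *a, const float *b, int k, float result[4][4])
+ {
+     __vector_quad acc;
+     vector float rows[4];
+
+     __builtin_mma_xxsetaccz (&acc);              /* start from a zeroed ACC    */
+     for (int i = 0; i < k; i++) {
+         vuc va = (vuc) vec_xl (0, a + 4 * i);    /* next column of A           */
+         vuc vb = (vuc) vec_xl (0, b + 4 * i);    /* next row of B              */
+         __builtin_mma_xvf32gerpp (&acc, va, vb); /* acc += outer product       */
+     }
+     __builtin_mma_disassemble_acc (rows, &acc);  /* deprime into four vectors  */
+     for (int i = 0; i < 4; i++)
+         vec_xst (rows[i], 0, result[i]);
+ }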
+
+
+
+
+
+
diff --git a/Intrinsics_Reference/ch_vec_reference.xml b/Intrinsics_Reference/ch_vec_reference.xml
index b0d7ec7..174dbf4 100644
--- a/Intrinsics_Reference/ch_vec_reference.xml
+++ b/Intrinsics_Reference/ch_vec_reference.xml
@@ -1,5 +1,5 @@