From a9b5e260cf862fcc5ad023a3449b03a65643f9a6 Mon Sep 17 00:00:00 2001 From: sjmunroe Date: Mon, 30 Oct 2017 10:23:19 -0500 Subject: [PATCH] Updates to describe issues associated with larger vector sizes and proposed solutions. Signed-off-by: Steven Munroe --- Porting_Vector_Intrinsics/bk_main.xml | 13 ++- Porting_Vector_Intrinsics/ch_howto_start.xml | 11 ++- .../sec_handling_avx.xml | 81 ++++++++++++++++- .../sec_handling_mmx.xml | 5 +- .../sec_intel_intrinsic_functions.xml | 5 +- .../sec_intel_intrinsic_types.xml | 87 +++++++++++++++++-- .../sec_powerisa_vector_intrinsics.xml | 8 +- .../sec_simple_examples.xml | 1 + 8 files changed, 191 insertions(+), 20 deletions(-) diff --git a/Porting_Vector_Intrinsics/bk_main.xml b/Porting_Vector_Intrinsics/bk_main.xml index 2854c48..b6aeaf3 100644 --- a/Porting_Vector_Intrinsics/bk_main.xml +++ b/Porting_Vector_Intrinsics/bk_main.xml @@ -39,7 +39,7 @@ OpenPOWER Foundation - Revision 0.2 + Revision 0.3 OpenPOWER @@ -68,6 +68,17 @@ + + 2017-10-30 + + + + Revision 0.3 - Updates to describe issues associated with larger vector sizes + and proposed solutions. + + + + 2017-09-14 diff --git a/Porting_Vector_Intrinsics/ch_howto_start.xml b/Porting_Vector_Intrinsics/ch_howto_start.xml index 3c52da1..d83e12b 100644 --- a/Porting_Vector_Intrinsics/ch_howto_start.xml +++ b/Porting_Vector_Intrinsics/ch_howto_start.xml @@ -72,13 +72,22 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Internal data types for implementing the intrinsics. */ -typedef float __v4sf __attribute__ ((__vector_size__ (16))); +typedef __vector float __v4sf; /* more typedefs. */ /* The intrinsic implmentations go here. */ #endif /* EMMINTRIN_H_ */]]> + + The interface typedef (__m128) uses the GCC vector builtin + extension syntax, while the internal typedef (__v4sf) uses the altivec + vector extension syntax. + This allows the internal typedefs to work correctly with the PowerPC + overloaded vector builtins. Also we use the __vector (vs vector) type prefix + to avoid name space conflicts with C++. + + Then you can start adding small groups of related intrinsic implementations to the header to be compiled and  examine the generated code. Once you have what looks like reasonable code you can grep through diff --git a/Porting_Vector_Intrinsics/sec_handling_avx.xml b/Porting_Vector_Intrinsics/sec_handling_avx.xml index 7bb8d73..26a158d 100644 --- a/Porting_Vector_Intrinsics/sec_handling_avx.xml +++ b/Porting_Vector_Intrinsics/sec_handling_avx.xml @@ -75,14 +75,87 @@ the function prologue). This frees up to 64 vectors (32 x 256-bit or 16 x 512-bit structs) for code optimization. - Based on the specifics of our ISA and ABI we will not not use + Based on the specifics of our ISA and ABI we will not use __vector_size__ (32) or (64) in the PowerPC implementation of __m256 and __m512 - types. Instead we will typedef structs of 2 or 4 vector (__m128) fields. This + types. Instead we will typedef structs of 2 or 4 vector (__vector) fields. This allows efficient handling of these larger data types without requiring new GCC - language extensions. + language extensions or vector builtins. For example: +In the end we should use the same type names and definitions as the +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef struct __m256d +{ + __vector double vd0; + __vector double vd1; +}__attribute__ ((__may_alias__)) __m256d;]]> + + This requires a different syntax for operations + where the 128-bit vector chunks are explicitly referenced. + For example: + + + But this creates a new issue because + the C language does not allow direct casts between structs. + This can be an issue where the intrinsic interface type is not the correct type for the operation. + For example AVX2 integer operations: + + + For the AVX2 intrinsic _mm256_add_epi16 + we need to cast the input vectors + of 64-bit long long (__m256i) into vectors of 16-bit short + (__v16hi) before the overloaded add operations. + Here we need to use a pointer reference cast. + For example: + + As this and related examples are inlined, + we expect the compiler to recognize this + is a "nop cast" and avoid generating any additional instructions. + + In the end we should try + to use the same type names and definitions as the GCC X86 intrinsic headers where possible. Where that is not possible we can define new typedefs that provide the best mapping to the underlying PowerISA hardware. diff --git a/Porting_Vector_Intrinsics/sec_handling_mmx.xml b/Porting_Vector_Intrinsics/sec_handling_mmx.xml index 8b31d02..9b60ded 100644 --- a/Porting_Vector_Intrinsics/sec_handling_mmx.xml +++ b/Porting_Vector_Intrinsics/sec_handling_mmx.xml @@ -35,7 +35,10 @@ implies that MMX integer types can be handled as an internal union of arrays for the supported element types. So a 64-bit unsigned long long is the best type for parameter passing and return values, especially for the 64-bit (_si64) - operations as these normally generate a single PowerISA instruction. + operations as these normally generate a single PowerISA instruction. + So for the PowerPC implementation we will define + __m64 as: + The SSE extensions include some copy / convert operations for _m128 to / diff --git a/Porting_Vector_Intrinsics/sec_intel_intrinsic_functions.xml b/Porting_Vector_Intrinsics/sec_intel_intrinsic_functions.xml index 1539acc..f429ae2 100644 --- a/Porting_Vector_Intrinsics/sec_intel_intrinsic_functions.xml +++ b/Porting_Vector_Intrinsics/sec_intel_intrinsic_functions.xml @@ -85,8 +85,9 @@ v2di __builtin_ia32_paddq (v2di, v2di)]]> A key difference between GCC built-ins for i386 and PowerPC is that the x86 built-ins have different names of each operation and type while the - PowerPC altivec built-ins tend to have a single generic built-in for  each - operation, across a set of compatible operand types. + PowerPC altivec built-ins tend to have a single overloaded + built-in for each operation, + across a set of compatible operand types. In GCC the Intel Intrinsic header (*intrin.h) files are implemented as a set of inline functions using the Intel Intrinsic API names and types. diff --git a/Porting_Vector_Intrinsics/sec_intel_intrinsic_types.xml b/Porting_Vector_Intrinsics/sec_intel_intrinsic_types.xml index f2e43f9..4ddc86c 100644 --- a/Porting_Vector_Intrinsics/sec_intel_intrinsic_types.xml +++ b/Porting_Vector_Intrinsics/sec_intel_intrinsic_types.xml @@ -45,9 +45,17 @@ typedef float __v4sf __attribute__ ((__vector_size__ (16)));]]> There are a couple of issues here: - The API seems to force the compiler to assume + The use of __may_alias__ in the API seems to force the compiler to assume aliasing of any parameter passed by reference. + + + The GCC vector builtin type system (example above) is slightly different + syntax from the original Altivec __vector types. 
Internally the two typedef forms + may represent the same 128-bit vector type, + but for early source parsing and overloaded vector builtins they are + handled differently. + The data type used at the interface may not be the correct type for the implied operation. @@ -60,13 +68,16 @@ typedef float __v4sf __attribute__ ((__vector_size__ (16)));]]> (defined as vector long long). This may not matter when using x86 built-ins but does matter when - the implementation uses C vector extensions or in our case uses PowerPC generic + the implementation uses C vector extensions or in our case using PowerPC + overloaded vector built-ins (). For the latter cases the type must be correct for - the compiler to generate the correct type (char, short, int, long) - () for the generic - builtin operation. There is also concern that excessive use of + the compiler to generate the correct code for the + type (char, short, int, long) + () for + overloaded builtin operations. + There is also concern that excessive use of __may_alias__ will limit compiler optimization. We are not sure how important this attribute is to the correct operation of the API.  So at a later stage we should @@ -83,16 +94,76 @@ typedef float __v4sf __attribute__ ((__vector_size__ (16)));]]> implemented as vector attribute extensions of the appropriate  size (   __vector_size__ ({8 | 16 | 32, and 64}). For the PowerPC target  GCC currently only supports the native __vector_size__ ( 16 ). These we can support directly - in VMX/VSX registers and associated instructions. GCC will compile code with + in VMX/VSX registers and associated instructions. + + GCC will compile code with other   __vector_size__ values, but the resulting types are treated as simple arrays of the element type. This does not allow the compiler to use the vector - registers and vector instructions for these (nonnative) vectors. + registers for parameter passing and return values. + For example this intrinsic from immintrin.h: + + And test case: + + Current GCC generates: + : + 970: 10 00 20 39 li r9,16 + 974: 98 26 80 7d lxvd2x vs12,0,r4 + 978: 98 2e 40 7d lxvd2x vs10,0,r5 + 97c: 20 00 e0 38 li r7,32 + 980: f8 ff e1 fb std r31,-8(r1) + 984: b1 ff 21 f8 stdu r1,-80(r1) + 988: 30 00 00 39 li r8,48 + 98c: 98 4e 04 7c lxvd2x vs0,r4,r9 + 990: 98 4e 65 7d lxvd2x vs11,r5,r9 + 994: 00 53 8c f1 xvadddp vs12,vs12,vs10 + 998: 00 00 c1 e8 ld r6,0(r1) + 99c: 78 0b 3f 7c mr r31,r1 + 9a0: 00 5b 00 f0 xvadddp vs0,vs0,vs11 + 9a4: c1 ff c1 f8 stdu r6,-64(r1) + 9a8: 98 3f 9f 7d stxvd2x vs12,r31,r7 + 9ac: 98 47 1f 7c stxvd2x vs0,r31,r8 + 9b0: 98 3e 9f 7d lxvd2x vs12,r31,r7 + 9b4: 98 46 1f 7c lxvd2x vs0,r31,r8 + 9b8: 50 00 3f 38 addi r1,r31,80 + 9bc: f8 ff e1 eb ld r31,-8(r1) + 9c0: 98 1f 80 7d stxvd2x vs12,0,r3 + 9c4: 98 4f 03 7c stxvd2x vs0,r3,r9 + 9c8: 20 00 80 4e blr]]> + + The compiler treats the parameters and return value + as scalar arrays, which are passed by reference. + The operation is vectorized in this case, + but the 256-bit result is returned through storage. + + This is not what we want to see for a simple 4 by double add. + It would be better if we can pass and return + MMX () and AVX () + values as PowerPC registers and avoid the storage references. 
If we can get the parameter and return values as registers,
    this example will reduce to:
    :
     970:	xvadddp vs34,vs34,vs36
     974:	xvadddp vs35,vs35,vs37
     978:	blr]]>

     So the PowerISA VMX/VSX facilities and GCC compiler support for
    128-bit/16-byte vectors and associated vector built-ins are well matched to
    implementing equivalent X86 SSE intrinsic functions. However implementing
    the older MMX (64-bit) and the latest
-    AVX (256 / 512-bit) extensions requires more thought and some ingenuity.
+    AVX (256 / 512-bit) extensions requires more thought and some
+    ingenuity.
 
diff --git a/Porting_Vector_Intrinsics/sec_powerisa_vector_intrinsics.xml b/Porting_Vector_Intrinsics/sec_powerisa_vector_intrinsics.xml
index 79a08ac..7dd8fdf 100644
--- a/Porting_Vector_Intrinsics/sec_powerisa_vector_intrinsics.xml
+++ b/Porting_Vector_Intrinsics/sec_powerisa_vector_intrinsics.xml
@@ -37,8 +37,9 @@
    Vector Built-in Functions.
 
    Appendix A is organized (sorted) by built-in name, output type, then
-    parameter types. Most built-ins are generic as the named operation (add,
-    sub, mul, cmpeq, ...) applies to multiple types.
+    parameter types. Most built-ins are overloaded
+    as the named operation (vec_add,
+    vec_sub, vec_mul, vec_cmpeq, ...) applies to multiple types.
 
    So the vec_add built-in applies to all the signed and unsigned integer
    types (char, short, int, and long) plus float and double floating-point
@@ -59,7 +60,8 @@ vector float vec_add (vector float, vector float);
 vector double vec_add (vector double, vector double);]]>
 
    This is one key difference between PowerISA built-ins and Intel
-    Intrinsics (Intel Intrinsics are not overloaded and include type information in
+    Intrinsics (Intel Intrinsics are not overloaded
+    and include type information in
    the name). This is why it is so important to understand the vector element
    types and to add the appropriate type casts to get the correct results.
 
diff --git a/Porting_Vector_Intrinsics/sec_simple_examples.xml b/Porting_Vector_Intrinsics/sec_simple_examples.xml
index c4dc656..b7db1c8 100644
--- a/Porting_Vector_Intrinsics/sec_simple_examples.xml
+++ b/Porting_Vector_Intrinsics/sec_simple_examples.xml
@@ -48,6 +48,7 @@ _mm_mullo_epi16 (__m128i __A, __m128i __B)
    Note this requires a cast for the compiler to generate the correct code
    for the intended operation. The parameters and result are the generic
+    interface type __m128i, which is a vector long long with the __may_alias__ attribute.
    But the operation is a vector multiply low on unsigned short elements.
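    To make that last point concrete, here is a minimal sketch of the kind of cast
    being described. The internal typedef name __v8hu and the exact attribute
    spelling are illustrative assumptions, not the final header definitions:

/* Minimal sketch, assuming the interface typedef uses the GCC
   vector_size syntax described earlier and an internal element-view
   typedef (__v8hu) uses the __vector (altivec) syntax.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef __vector unsigned short __v8hu;

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  /* Cast to the element type the operation implies (unsigned short),
     multiply element-wise keeping the low 16 bits of each product,
     then cast the result back to the interface type.  */
  return (__m128i) ((__v8hu) __A * (__v8hu) __B);
}

    With the casts in place the compiler selects the 16-bit element multiply;
    without them it would operate on the 64-bit long long elements implied by
    __m128i.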
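    Similarly, a minimal sketch of the struct-based handling and the pointer
    reference cast discussed for the AVX2 intrinsic _mm256_add_epi16 in
    sec_handling_avx.xml. The struct field names and the use of __vector short
    directly (rather than an internal __v16hi typedef) are illustrative
    assumptions only:

/* Minimal sketch following the __m256d struct pattern shown earlier;
   field names are illustrative.  A real header would also mark the
   vector element typedefs with __may_alias__.  */
typedef struct __m256i
{
  __vector long long vll0;
  __vector long long vll1;
} __attribute__ ((__may_alias__)) __m256i;

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  __m256i result;
  /* C does not allow a direct cast between struct types, so view each
     128-bit chunk through a pointer cast instead.  Once inlined this
     is a "nop cast" and should add no instructions.  */
  __vector short *a = (__vector short *) &__A;
  __vector short *b = (__vector short *) &__B;
  __vector short *r = (__vector short *) &result;

  r[0] = a[0] + b[0];
  r[1] = a[1] + b[1];
  return result;
}

    Each 128-bit chunk is added with a single VMX/VSX instruction, and once the
    larger types can be passed and returned in vector register pairs this should
    reduce to two adds plus a return, as in the 4-by-double add example above.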