summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--alpha_mmintrin.h92
1 files changed, 92 insertions, 0 deletions
diff --git a/alpha_mmintrin.h b/alpha_mmintrin.h
new file mode 100644
index 0000000..34a2b63
--- /dev/null
+++ b/alpha_mmintrin.h
@@ -0,0 +1,92 @@
+#include <stdint.h>
+
+
+#if (defined(__GNUC__) \
+ && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
+ && __alpha_max__)
+
/* Short aliases for the GCC Alpha built-in functions.
   NOTE(review): this header has no include guard — TODO confirm it is
   only ever included once per translation unit, or add one. */

/* Boolean and byte-compare instructions (base ISA). */
#define __bic __builtin_alpha_bic
#define __eqv __builtin_alpha_eqv
#define __ornot __builtin_alpha_ornot
#define __cmpbge __builtin_alpha_cmpbge
/* Byte/word extract and insert instructions. */
#define __extql __builtin_alpha_extql
#define __extqh __builtin_alpha_extqh
#define __extbl __builtin_alpha_extbl
#define __extwl __builtin_alpha_extwl
#define __extwh __builtin_alpha_extwh
#define __inswl __builtin_alpha_inswl
/* Byte zap (clear selected bytes) instructions. */
#define __zap __builtin_alpha_zap
#define __zapnot __builtin_alpha_zapnot
/* CPU feature / identification / cycle-counter instructions. */
#define __amask __builtin_alpha_amask
#define __implver __builtin_alpha_implver
#define __rpcc __builtin_alpha_rpcc
/* MAX (multimedia) extension: packed byte/word min and max. */
#define __minub8 __builtin_alpha_minub8
#define __minsb8 __builtin_alpha_minsb8
#define __minuw4 __builtin_alpha_minuw4
#define __minsw4 __builtin_alpha_minsw4
#define __maxub8 __builtin_alpha_maxub8
#define __maxsb8 __builtin_alpha_maxsb8
#define __maxuw4 __builtin_alpha_maxuw4
#define __maxsw4 __builtin_alpha_maxsw4
/* MAX extension: pixel error, pack and unpack. */
#define __perr __builtin_alpha_perr
#define __pklb __builtin_alpha_pklb
#define __pkwb __builtin_alpha_pkwb
#define __unpkbl __builtin_alpha_unpkbl
#define __unpkbw __builtin_alpha_unpkbw
+
+static __inline uint64_t
+__ctlz(uint64_t val) {
+ uint64_t ret;
+ asm volatile(
+ "ctlz %1,%0"
+ : "=r" (ret)
+ : "r" (val)
+ );
+ return ret;
+}
+
/* Add the 16-bit values in M1 to the 16-bit values in M2, modulo 2^16
   per lane (paddw).
   Standard SWAR technique: bit 15 of every lane is masked off before the
   add so no carry can cross a lane boundary, then bit 15 is reconstructed
   as a15 ^ b15 ^ carry15.
   Bug fixed: the previous version only masked bit 15 in lanes where the
   operands' top bits DIFFERED, so a lane with both top bits set (e.g.
   0x8000 + 0x8000) leaked a carry into the lane above it. */
static __inline uint64_t
__adduw4(uint64_t m1, uint64_t m2) {
	const uint64_t mask = 0x8000800080008000;
	/* Per-lane sum of the low 15 bits; cannot carry across lanes. */
	uint64_t sum = (m1 & ~mask) + (m2 & ~mask);
	/* Bit 15 of sum is the carry out of bits 0-14; fold in a15 ^ b15. */
	return sum ^ ((m1 ^ m2) & mask);
}
+
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 * saturated arithmetic (paddusb).
 * Per lane, ~m1 is the headroom left before the byte overflows; clamping
 * the addend to that headroom makes every lane sum at most 0xff, so the
 * plain 64-bit add can never carry across lanes and overflowing lanes
 * saturate at 0xff. */
static __inline uint64_t
__addusb8(uint64_t m1, uint64_t m2) {
	uint64_t headroom = ~m1;
	uint64_t addend = __minub8(m2, headroom);
	return m1 + addend;
}
+
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 * saturating arithmetic (paddusw).
 * Same trick as __addusb8, with word lanes: the addend is clamped per
 * lane to ~m1 (the distance to 0xffff), so no lane sum exceeds 0xffff,
 * no carry crosses a lane boundary, and overflowing lanes saturate. */
static __inline uint64_t
__addusw4(uint64_t m1, uint64_t m2) {
	uint64_t headroom = ~m1;
	uint64_t addend = __minuw4(m2, headroom);
	return m1 + addend;
}
+
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results (pmullw).
   Only the low 16 bits of each product survive __inswl, and those bits are
   unaffected by the operands' upper bits, so 32-bit multiplies suffice
   (lane 0 can even use the raw low-32-bit halves).
   The multiplies are done in uint32_t: the previous (int) * (int) form
   could overflow a signed int (e.g. 0xffff * 0xffff > INT_MAX), which is
   undefined behavior; unsigned arithmetic wraps and yields the same low
   16 bits. */
static __inline uint64_t
__mulsw4sl(uint64_t m1, uint64_t m2) {
	uint64_t t0, t2, t4, t6;

	t0 = (uint32_t) m1 * (uint32_t) m2;
	t2 = (uint32_t) __extwl(m1, 2) * (uint32_t) __extwl(m2, 2);
	t4 = (uint32_t) __extwl(m1, 4) * (uint32_t) __extwl(m2, 4);
	t6 = (uint32_t) __extwl(m1, 6) * (uint32_t) __extwl(m2, 6);

	/* Deposit the low word of each product at its lane's byte offset. */
	t0 = __inswl(t0, 0);
	t2 = __inswl(t2, 2);
	t4 = __inswl(t4, 4);
	t6 = __inswl(t6, 6);

	return t0 | t2 | t4 | t6;
}
+
+#endif