diff options
-rw-r--r-- | alpha_mmintrin.h | 92 |
1 file changed, 92 insertions, 0 deletions
/* alpha_mmintrin.h -- MMX-style SWAR intrinsics for Alpha CPUs with the
 * MAX (multimedia) extension, built on GCC's __builtin_alpha_* builtins.
 *
 * A 64-bit register is treated as a small vector of eight 8-bit or four
 * 16-bit lanes; each helper below names the MMX instruction it emulates.
 */
#ifndef ALPHA_MMINTRIN_H
#define ALPHA_MMINTRIN_H

#include <stdint.h>

/* The __builtin_alpha_* builtins exist since GCC 3.4; __alpha_max__ is
 * predefined by GCC when the target CPU supports the MAX instructions
 * (an undefined macro evaluates as 0 in #if, so non-Alpha builds simply
 * skip this whole header body). */
#if (defined(__GNUC__) \
    && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
    && __alpha_max__)

/* Short aliases for the raw Alpha builtins. */
#define __bic __builtin_alpha_bic
#define __eqv __builtin_alpha_eqv
#define __ornot __builtin_alpha_ornot
#define __cmpbge __builtin_alpha_cmpbge
#define __extql __builtin_alpha_extql
#define __extqh __builtin_alpha_extqh
#define __extbl __builtin_alpha_extbl
#define __extwl __builtin_alpha_extwl
#define __extwh __builtin_alpha_extwh
#define __inswl __builtin_alpha_inswl
#define __zap __builtin_alpha_zap
#define __zapnot __builtin_alpha_zapnot
#define __amask __builtin_alpha_amask
#define __implver __builtin_alpha_implver
#define __rpcc __builtin_alpha_rpcc
#define __minub8 __builtin_alpha_minub8
#define __minsb8 __builtin_alpha_minsb8
#define __minuw4 __builtin_alpha_minuw4
#define __minsw4 __builtin_alpha_minsw4
#define __maxub8 __builtin_alpha_maxub8
#define __maxsb8 __builtin_alpha_maxsb8
#define __maxuw4 __builtin_alpha_maxuw4
#define __maxsw4 __builtin_alpha_maxsw4
#define __perr __builtin_alpha_perr
#define __pklb __builtin_alpha_pklb
#define __pkwb __builtin_alpha_pkwb
#define __unpkbl __builtin_alpha_unpkbl
#define __unpkbw __builtin_alpha_unpkbw

/* Count leading zeros of VAL via the "ctlz" instruction.
 * NOTE(review): ctlz belongs to the CIX extension (EV67+), not MAX --
 * confirm the build also targets a CIX-capable CPU. */
static __inline uint64_t
__ctlz(uint64_t val)
{
    uint64_t ret;

    /* Deliberately NOT "volatile": ctlz is a pure computation with no
     * side effects, so the compiler is free to CSE or reorder it. */
    __asm__("ctlz %1,%0" : "=r" (ret) : "r" (val));
    return ret;
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 (paddw).
 *
 * Classic SWAR add: clear bit 15 of every lane so no carry can cross a
 * lane boundary, add, then restore each lane's bit 15 with XOR (addition
 * without carry).  The previous variant cleared sign bits only where M1
 * and M2 differed, which leaked a carry into the next lane whenever BOTH
 * operands had the sign bit set (0x8000 + 0x8000 must yield 0x0000 in
 * its lane, not spill a 1 into the lane above). */
static __inline uint64_t
__adduw4(uint64_t m1, uint64_t m2)
{
    const uint64_t hi = 0x8000800080008000ULL;
    uint64_t sum = (m1 & ~hi) + (m2 & ~hi); /* ??? gcc doesn't use bic here */

    return sum ^ ((m1 ^ m2) & hi);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 * saturated arithmetic (paddusb).
 *
 * Per lane: min(m2, 0xff - m1) is the largest addend that cannot
 * overflow, so a lane that would overflow clamps to 0xff, and no lane
 * sum ever carries into its neighbour. */
static __inline uint64_t
__addusb8(uint64_t m1, uint64_t m2)
{
    return m1 + __minub8(m2, ~m1);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 * saturating arithmetic (paddusw) -- same clamping trick as __addusb8,
 * on 16-bit lanes. */
static __inline uint64_t
__addusw4(uint64_t m1, uint64_t m2)
{
    return m1 + __minuw4(m2, ~m1);
}

/* Multiply four 16-bit lanes of M1 by the corresponding lanes of M2 and
 * keep the low 16 bits of each product (pmullw).
 *
 * The products are formed in unsigned 32-bit arithmetic: the low 16 bits
 * of a product depend only on the low 16 bits of its factors, and
 * unsigned multiplication wraps instead of invoking the signed-overflow
 * undefined behavior the previous (int)*(int) form risked
 * (0xffff * 0xffff exceeds INT_MAX). */
static __inline uint64_t
__mulsw4sl(uint64_t m1, uint64_t m2)
{
    uint64_t t0, t2, t4, t6;

    t0 = (uint32_t) __extwl(m1, 0) * (uint32_t) __extwl(m2, 0);
    t2 = (uint32_t) __extwl(m1, 2) * (uint32_t) __extwl(m2, 2);
    t4 = (uint32_t) __extwl(m1, 4) * (uint32_t) __extwl(m2, 4);
    t6 = (uint32_t) __extwl(m1, 6) * (uint32_t) __extwl(m2, 6);

    /* Re-insert the low 16 bits of each product at its lane position,
     * discarding the high half of the 32-bit product. */
    t0 = __inswl(t0, 0);
    t2 = __inswl(t2, 2);
    t4 = __inswl(t4, 4);
    t6 = __inswl(t6, 6);

    return t0 | t2 | t4 | t6;
}

#endif /* GCC >= 3.4 && __alpha_max__ */

#endif /* ALPHA_MMINTRIN_H */