#include <inttypes.h>

#if (defined(__GNUC__) \
     && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
     && defined(__alpha_max__))

#define __bic     __builtin_alpha_bic
#define __eqv     __builtin_alpha_eqv
#define __ornot   __builtin_alpha_ornot
#define __cmpbge  __builtin_alpha_cmpbge
#define __extql   __builtin_alpha_extql
#define __extqh   __builtin_alpha_extqh
#define __extbl   __builtin_alpha_extbl
#define __extwl   __builtin_alpha_extwl
#define __extwh   __builtin_alpha_extwh
#define __inswl   __builtin_alpha_inswl
#define __zap     __builtin_alpha_zap
#define __zapnot  __builtin_alpha_zapnot
#define __amask   __builtin_alpha_amask
#define __implver __builtin_alpha_implver
#define __rpcc    __builtin_alpha_rpcc
#define __minub8  __builtin_alpha_minub8
#define __minsb8  __builtin_alpha_minsb8
#define __minuw4  __builtin_alpha_minuw4
#define __minsw4  __builtin_alpha_minsw4
#define __maxub8  __builtin_alpha_maxub8
#define __maxsb8  __builtin_alpha_maxsb8
#define __maxuw4  __builtin_alpha_maxuw4
#define __maxsw4  __builtin_alpha_maxsw4
#define __perr    __builtin_alpha_perr
#define __pklb    __builtin_alpha_pklb
#define __pkwb    __builtin_alpha_pkwb
#define __unpkbl  __builtin_alpha_unpkbl
#define __unpkbw  __builtin_alpha_unpkbw

/* Count leading zeros (ctlz). */
static __inline uint64_t __ctlz(uint64_t val)
{
    uint64_t ret;
    __asm__ volatile ("ctlz %1,%0" : "=r" (ret) : "r" (val));
    return ret;
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 (paddb). */
static __inline uint64_t __addub8(uint64_t m1, uint64_t m2)
{
    /* Clear the per-byte top bits before adding so carries cannot cross
       byte boundaries, then restore them with an XOR. */
    uint64_t signs = (m1 ^ m2) & 0x8080808080808080;
    m1 &= ~signs;
    m2 &= ~signs;
    m1 += m2;
    m1 ^= signs;
    return m1;
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 (paddw). */
static __inline uint64_t __adduw4(uint64_t m1, uint64_t m2)
{
    uint64_t signs = (m1 ^ m2) & 0x8000800080008000;
    m1 &= ~signs; /* ??? gcc doesn't use bic here */
    m2 &= ~signs;
    m1 += m2;
    m1 ^= signs;
    return m1;
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturating arithmetic (paddusb). */
static __inline uint64_t __addusb8(uint64_t m1, uint64_t m2)
{
    return m1 + __minub8(m2, ~m1);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturating arithmetic (paddusw). */
static __inline uint64_t __addusw4(uint64_t m1, uint64_t m2)
{
    return m1 + __minuw4(m2, ~m1);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using unsigned
   saturating arithmetic (psubusb). */
static __inline uint64_t __subusb8(uint64_t m1, uint64_t m2)
{
    return m1 - __minub8(m2, m1);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using unsigned
   saturating arithmetic (psubusw). */
static __inline uint64_t __subusw4(uint64_t m1, uint64_t m2)
{
    return m1 - __minuw4(m2, m1);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results (pmullw). */
static __inline uint64_t __mulsw4sl(uint64_t m1, uint64_t m2)
{
    uint64_t t0, t2, t4, t6;

    /* Unsigned 32-bit products avoid signed-overflow undefined behaviour;
       __inswl keeps only the low 16 bits of each product anyway. */
    t0 = (uint32_t) m1             * (uint32_t) m2;
    t2 = (uint32_t) __extwl(m1, 2) * (uint32_t) __extwl(m2, 2);
    t4 = (uint32_t) __extwl(m1, 4) * (uint32_t) __extwl(m2, 4);
    t6 = (uint32_t) __extwl(m1, 6) * (uint32_t) __extwl(m2, 6);
    t0 = __inswl(t0, 0);
    t2 = __inswl(t2, 2);
    t4 = __inswl(t4, 4);
    t6 = __inswl(t6, 6);
    return t0 | t2 | t4 | t6;
}

#endif
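
/* Illustrative usage sketch (not part of the original header; the helper name
 * add_bytes_saturated is hypothetical).  It shows how __addusb8 can process a
 * packed byte buffer eight bytes per 64-bit word.  The identity behind
 * __addusb8 is a +sat b == a + min(b, ~a): ~a is the per-byte headroom left
 * before overflow, so the sum clamps at 0xff.  The sketch is wrapped in the
 * same feature test as the header so it only compiles where the Alpha MAX
 * builtins are available. */
#if (defined(__GNUC__) \
     && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
     && defined(__alpha_max__))
static __inline void add_bytes_saturated(uint64_t *dst, const uint64_t *src,
                                         unsigned long n64)
{
    /* n64 counts 64-bit words; each __addusb8 call performs eight unsigned
       saturating byte additions in parallel. */
    unsigned long i;
    for (i = 0; i < n64; i++)
        dst[i] = __addusb8(dst[i], src[i]);
}
#endif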