#include <inttypes.h>

#if (defined(__GNUC__) \
     && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
     && defined(__alpha_max__))

#define __bic     __builtin_alpha_bic
#define __eqv     __builtin_alpha_eqv
#define __ornot   __builtin_alpha_ornot
#define __cmpbge  __builtin_alpha_cmpbge
#define __extql   __builtin_alpha_extql
#define __extqh   __builtin_alpha_extqh
#define __extbl   __builtin_alpha_extbl
#define __extwl   __builtin_alpha_extwl
#define __extwh   __builtin_alpha_extwh
#define __inswl   __builtin_alpha_inswl
#define __zap     __builtin_alpha_zap
#define __zapnot  __builtin_alpha_zapnot
#define __amask   __builtin_alpha_amask
#define __implver __builtin_alpha_implver
#define __rpcc    __builtin_alpha_rpcc
#define __minub8  __builtin_alpha_minub8
#define __minsb8  __builtin_alpha_minsb8
#define __minuw4  __builtin_alpha_minuw4
#define __minsw4  __builtin_alpha_minsw4
#define __maxub8  __builtin_alpha_maxub8
#define __maxsb8  __builtin_alpha_maxsb8
#define __maxuw4  __builtin_alpha_maxuw4
#define __maxsw4  __builtin_alpha_maxsw4
#define __perr    __builtin_alpha_perr
#define __pklb    __builtin_alpha_pklb
#define __pkwb    __builtin_alpha_pkwb
#define __unpkbl  __builtin_alpha_unpkbl
#define __unpkbw  __builtin_alpha_unpkbw

/* Count leading zeros (ctlz). */
static __inline uint64_t __ctlz(uint64_t val)
{
    uint64_t ret;
    __asm__ volatile ("ctlz %1,%0" : "=r" (ret) : "r" (val));
    return ret;
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 (paddb). */
static __inline uint64_t __addub8(uint64_t m1, uint64_t m2)
{
    /* Clear the per-byte top bits before adding so carries cannot cross
       byte boundaries, then restore them with an XOR. */
    uint64_t signs = (m1 ^ m2) & 0x8080808080808080;
    m1 &= ~signs;
    m2 &= ~signs;
    m1 += m2;
    m1 ^= signs;
    return m1;
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 (paddw). */
static __inline uint64_t __adduw4(uint64_t m1, uint64_t m2)
{
    uint64_t signs = (m1 ^ m2) & 0x8000800080008000;
    m1 &= ~signs; /* ??? gcc doesn't use bic here */
    m2 &= ~signs;
    m1 += m2;
    m1 ^= signs;
    return m1;
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturating arithmetic (paddusb). */
static __inline uint64_t __addusb8(uint64_t m1, uint64_t m2)
{
    return m1 + __minub8(m2, ~m1);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturating arithmetic (paddusw). */
static __inline uint64_t __addusw4(uint64_t m1, uint64_t m2)
{
    return m1 + __minuw4(m2, ~m1);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using unsigned
   saturating arithmetic (psubusb). */
static __inline uint64_t __subusb8(uint64_t m1, uint64_t m2)
{
    return m1 - __minub8(m2, m1);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using unsigned
   saturating arithmetic (psubusw). */
static __inline uint64_t __subusw4(uint64_t m1, uint64_t m2)
{
    return m1 - __minuw4(m2, m1);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results (pmullw). */
static __inline uint64_t __mulsw4sl(uint64_t m1, uint64_t m2)
{
    uint64_t t0, t2, t4, t6;

    /* Unsigned 32-bit products avoid signed-overflow undefined behaviour;
       __inswl keeps only the low 16 bits of each product anyway. */
    t0 = (uint32_t) m1             * (uint32_t) m2;
    t2 = (uint32_t) __extwl(m1, 2) * (uint32_t) __extwl(m2, 2);
    t4 = (uint32_t) __extwl(m1, 4) * (uint32_t) __extwl(m2, 4);
    t6 = (uint32_t) __extwl(m1, 6) * (uint32_t) __extwl(m2, 6);
    t0 = __inswl(t0, 0);
    t2 = __inswl(t2, 2);
    t4 = __inswl(t4, 4);
    t6 = __inswl(t6, 6);
    return t0 | t2 | t4 | t6;
}

#endif
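
/* Illustrative usage sketch (not part of the original header; the helper name
 * add_bytes_saturated is hypothetical).  It shows how __addusb8 can process a
 * packed byte buffer eight bytes per 64-bit word.  The identity behind
 * __addusb8 is a +sat b == a + min(b, ~a): ~a is the per-byte headroom left
 * before overflow, so the sum clamps at 0xff.  The sketch is wrapped in the
 * same feature test as the header so it only compiles where the Alpha MAX
 * builtins are available. */
#if (defined(__GNUC__) \
     && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
     && defined(__alpha_max__))
static __inline void add_bytes_saturated(uint64_t *dst, const uint64_t *src,
                                         unsigned long n64)
{
    /* n64 counts 64-bit words; each __addusb8 call performs eight unsigned
       saturating byte additions in parallel. */
    unsigned long i;
    for (i = 0; i < n64; i++)
        dst[i] = __addusb8(dst[i], src[i]);
}
#endif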