1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#include <stdint.h>
/*
 * Everything from here to the matching #endif is compiled only when the
 * compiler is GCC 3.4 or newer (any 4.x and later, or 3.x with a minor
 * version of at least 4) and __alpha_max__ evaluates to non-zero.
 */
#if (defined(__GNUC__) \
&& (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
&& __alpha_max__)
/*
 * Short aliases for GCC's __builtin_alpha_* built-in functions.  The inline
 * helpers further down call some of them (__minub8, __minuw4, __extwl,
 * __inswl) through these names.
 *
 * NOTE(review): bic, eqv and ornot do not appear in the GCC manual's list of
 * Alpha built-ins -- confirm that these three aliases resolve before using
 * them.
 */
#define __bic __builtin_alpha_bic
#define __eqv __builtin_alpha_eqv
#define __ornot __builtin_alpha_ornot
#define __cmpbge __builtin_alpha_cmpbge
#define __extql __builtin_alpha_extql
#define __extqh __builtin_alpha_extqh
#define __extbl __builtin_alpha_extbl
#define __extwl __builtin_alpha_extwl
#define __extwh __builtin_alpha_extwh
#define __inswl __builtin_alpha_inswl
#define __zap __builtin_alpha_zap
#define __zapnot __builtin_alpha_zapnot
#define __amask __builtin_alpha_amask
#define __implver __builtin_alpha_implver
#define __rpcc __builtin_alpha_rpcc
/* Aliases with a lane suffix (ub8/sb8 = bytes, uw4/sw4 = 16-bit words). */
#define __minub8 __builtin_alpha_minub8
#define __minsb8 __builtin_alpha_minsb8
#define __minuw4 __builtin_alpha_minuw4
#define __minsw4 __builtin_alpha_minsw4
#define __maxub8 __builtin_alpha_maxub8
#define __maxsb8 __builtin_alpha_maxsb8
#define __maxuw4 __builtin_alpha_maxuw4
#define __maxsw4 __builtin_alpha_maxsw4
#define __perr __builtin_alpha_perr
#define __pklb __builtin_alpha_pklb
#define __pkwb __builtin_alpha_pkwb
#define __unpkbl __builtin_alpha_unpkbl
#define __unpkbw __builtin_alpha_unpkbw
/*
 * Count the leading zero bits of VAL.
 *
 * Returns a value in the range 0..64; the result is 64 when VAL is zero.
 *
 * The "ctlz" instruction belongs to the CIX extension, which is independent
 * of the MAX extension that guards the rest of this file, so it is only
 * emitted when the compiler advertises CIX (__alpha_cix__).  Otherwise an
 * equivalent software binary search is used.
 */
static __inline uint64_t
__ctlz(uint64_t val) {
#if defined(__alpha_cix__)
    uint64_t ret;

    /*
     * __asm__ rather than asm so the header also builds under strict -std=
     * modes.  Deliberately not volatile: the instruction has no side effects,
     * so the compiler may merge or drop redundant counts.
     */
    __asm__ ("ctlz %1,%0"
             : "=r" (ret)
             : "r" (val));
    return ret;
#else
    uint64_t count = 0;
    unsigned int shift;

    if (val == 0)
        return 64;

    /* Halve the window each round: 32, 16, 8, 4, 2, 1. */
    for (shift = 32; shift > 0; shift >>= 1) {
        if ((val >> (64 - shift)) == 0) {
            /* Top SHIFT bits are clear: count them and move on. */
            count += shift;
            val <<= shift;
        }
    }
    return count;
#endif
}
/*
 * Add the 8-bit values in M1 to the 8-bit values in M2 (paddb).
 *
 * Each of the eight byte lanes is summed independently modulo 256; a carry
 * out of one lane never reaches its neighbour.
 *
 * m1, m2: eight packed unsigned bytes each.
 * Returns the eight packed byte sums.
 */
static __inline uint64_t
__addub8(uint64_t m1, uint64_t m2) {
    const uint64_t msb = UINT64_C(0x8080808080808080);
    /* Top bit of each lane's sum, before the carry from the low 7 bits. */
    uint64_t signs = (m1 ^ m2) & msb;

    /*
     * Clear the top bit of EVERY lane before adding, not just where the two
     * operands differ: with both top bits set (e.g. 0x80 + 0x80) the sum
     * would otherwise carry into the next lane.  The low 7 bits can carry at
     * most into bit 7 of their own lane, where the XOR folds the top bits
     * back in.
     */
    return ((m1 & ~msb) + (m2 & ~msb)) ^ signs;
}
/*
 * Add the 16-bit values in M1 to the 16-bit values in M2 (paddw).
 *
 * Each of the four 16-bit lanes is summed independently modulo 65536; a
 * carry out of one lane never reaches its neighbour.
 *
 * m1, m2: four packed unsigned 16-bit words each.
 * Returns the four packed word sums.
 */
static __inline uint64_t
__adduw4(uint64_t m1, uint64_t m2) {
    const uint64_t msb = UINT64_C(0x8000800080008000);
    /* Top bit of each lane's sum, before the carry from the low 15 bits. */
    uint64_t signs = (m1 ^ m2) & msb;

    /*
     * Clear the top bit of EVERY lane before adding, not just where the two
     * operands differ: with both top bits set (e.g. 0x8000 + 0x8000) the sum
     * would otherwise carry into the next lane.  The low 15 bits can carry
     * at most into bit 15 of their own lane, where the XOR folds the top
     * bits back in.
     */
    return ((m1 & ~msb) + (m2 & ~msb)) ^ signs;
}
/*
 * Unsigned saturating add of the eight packed bytes in M2 to the eight
 * packed bytes in M1 (paddusb).
 */
static __inline uint64_t
__addusb8(uint64_t m1, uint64_t m2) {
    /* ~m1 holds, per byte, how far that byte of M1 is from 0xff. */
    uint64_t headroom = ~m1;
    /* Limit each byte of M2 to that headroom (byte-wise unsigned minimum),
     * so the plain 64-bit add below cannot carry between bytes. */
    uint64_t addend = __minub8(m2, headroom);

    return m1 + addend;
}
/*
 * Unsigned saturating add of the four packed 16-bit words in M2 to the four
 * packed 16-bit words in M1 (paddusw).
 */
static __inline uint64_t
__addusw4(uint64_t m1, uint64_t m2) {
    /* ~m1 holds, per word, how far that word of M1 is from 0xffff. */
    uint64_t headroom = ~m1;
    /* Limit each word of M2 to that headroom (word-wise unsigned minimum),
     * so the plain 64-bit add below cannot carry between words. */
    uint64_t addend = __minuw4(m2, headroom);

    return m1 + addend;
}
/*
 * Unsigned saturating subtract of the eight packed bytes in M2 from the
 * eight packed bytes in M1 (psubusb).
 */
static __inline uint64_t
__subusb8(uint64_t m1, uint64_t m2) {
    /* Never take away more than a byte of M1 holds (byte-wise unsigned
     * minimum), so the plain 64-bit subtract cannot borrow between bytes. */
    uint64_t subtrahend = __minub8(m2, m1);

    return m1 - subtrahend;
}
/*
 * Unsigned saturating subtract of the four packed 16-bit words in M2 from
 * the four packed 16-bit words in M1 (psubusw).
 */
static __inline uint64_t
__subusw4(uint64_t m1, uint64_t m2) {
    /* Never take away more than a word of M1 holds (word-wise unsigned
     * minimum), so the plain 64-bit subtract cannot borrow between words. */
    uint64_t subtrahend = __minuw4(m2, m1);

    return m1 - subtrahend;
}
/*
 * Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
 * the low 16 bits of the results (pmullw).
 *
 * m1, m2: four packed 16-bit words each.
 * Returns the four packed low halves of the lane-by-lane products.
 */
static __inline uint64_t
__mulsw4sl(uint64_t m1, uint64_t m2) {
    uint64_t t0, t2, t4, t6;

    /*
     * Multiply as unsigned int: the low 16 bits of a product depend only on
     * the low 16 bits of its factors, and unsigned arithmetic wraps, whereas
     * a signed int product (e.g. 0xffff * 0xffff) would overflow, which is
     * undefined behaviour.  For the same reason lane 0 needs no extraction:
     * whatever sits above bit 15 cannot affect the bits that are kept.
     */
    t0 = (unsigned int) m1 * (unsigned int) m2;
    t2 = (unsigned int) __extwl(m1, 2) * (unsigned int) __extwl(m2, 2);
    t4 = (unsigned int) __extwl(m1, 4) * (unsigned int) __extwl(m2, 4);
    t6 = (unsigned int) __extwl(m1, 6) * (unsigned int) __extwl(m2, 6);

    /* Keep the low word of each product and move it to its byte offset. */
    t0 = __inswl(t0, 0);
    t2 = __inswl(t2, 2);
    t4 = __inswl(t4, 4);
    t6 = __inswl(t6, 6);

    return t0 | t2 | t4 | t6;
}
#endif
|