M4RI 20140914
xor_template.h
#include <m4ri/m4ri_config.h>
#include <m4ri/misc.h>

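/* Descriptive note (not part of the original header): __M4RI_TEMPLATE_NAME(_mzd_combine)
 * XORs (adds over GF(2)) the N source rows t[0], ..., t[N-1] into the destination row m,
 * where every row is `wide` machine words long.  N is a compile-time constant that must be
 * defined by the file including this template. */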
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);

#if __M4RI_HAVE_SSE2

  assert( (__M4RI_ALIGNMENT(m,16) == 8) | (__M4RI_ALIGNMENT(m,16) == 0) );

  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
  case 8: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[7],16));
  case 7: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[6],16));
  case 6: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[5],16));
  case 5: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[4],16));
  case 4: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[3],16));
  case 3: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[2],16));
  case 2: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[1],16));
  case 1: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[0],16));
  };

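  /* If the destination is only 8-byte aligned, XOR one leading word first so that
     the SSE2 loops below operate on 16-byte aligned data. */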
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m,16) == 8)) {
    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
    wide--;
  }

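  /* Reinterpret the (now 16-byte aligned) destination and the N source rows as
     pointers to 128-bit SSE2 vectors. */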
  __m128i *m__ = (__m128i*)m;
  __m128i *t__[N];

  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
  case 8: t__[N-8] = (__m128i*)t[N-8];
  case 7: t__[N-7] = (__m128i*)t[N-7];
  case 6: t__[N-6] = (__m128i*)t[N-6];
  case 5: t__[N-5] = (__m128i*)t[N-5];
  case 4: t__[N-4] = (__m128i*)t[N-4];
  case 3: t__[N-3] = (__m128i*)t[N-3];
  case 2: t__[N-2] = (__m128i*)t[N-2];
  case 1: t__[N-1] = (__m128i*)t[N-1];
  };

  __m128i xmm0, xmm1, xmm2, xmm3;

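  /* Main loop: XOR four 128-bit vectors (eight machine words) from each source row
     into the destination per iteration; wide>>1 is the number of full 128-bit words
     in the row. */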
  wi_t i = 0;
  for(; i + 4 <= (wide >> 1); i += 4) {
    xmm0 = m__[0]; xmm1 = m__[1]; xmm2 = m__[2]; xmm3 = m__[3];
    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8: xmm0 = _mm_xor_si128(xmm0, t__[7][0]); xmm1 = _mm_xor_si128(xmm1, t__[7][1]); xmm2 = _mm_xor_si128(xmm2, t__[7][2]); xmm3 = _mm_xor_si128(xmm3, t__[7][3]); t__[7]+=4;
    case 7: xmm0 = _mm_xor_si128(xmm0, t__[6][0]); xmm1 = _mm_xor_si128(xmm1, t__[6][1]); xmm2 = _mm_xor_si128(xmm2, t__[6][2]); xmm3 = _mm_xor_si128(xmm3, t__[6][3]); t__[6]+=4;
    case 6: xmm0 = _mm_xor_si128(xmm0, t__[5][0]); xmm1 = _mm_xor_si128(xmm1, t__[5][1]); xmm2 = _mm_xor_si128(xmm2, t__[5][2]); xmm3 = _mm_xor_si128(xmm3, t__[5][3]); t__[5]+=4;
    case 5: xmm0 = _mm_xor_si128(xmm0, t__[4][0]); xmm1 = _mm_xor_si128(xmm1, t__[4][1]); xmm2 = _mm_xor_si128(xmm2, t__[4][2]); xmm3 = _mm_xor_si128(xmm3, t__[4][3]); t__[4]+=4;
    case 4: xmm0 = _mm_xor_si128(xmm0, t__[3][0]); xmm1 = _mm_xor_si128(xmm1, t__[3][1]); xmm2 = _mm_xor_si128(xmm2, t__[3][2]); xmm3 = _mm_xor_si128(xmm3, t__[3][3]); t__[3]+=4;
    case 3: xmm0 = _mm_xor_si128(xmm0, t__[2][0]); xmm1 = _mm_xor_si128(xmm1, t__[2][1]); xmm2 = _mm_xor_si128(xmm2, t__[2][2]); xmm3 = _mm_xor_si128(xmm3, t__[2][3]); t__[2]+=4;
    case 2: xmm0 = _mm_xor_si128(xmm0, t__[1][0]); xmm1 = _mm_xor_si128(xmm1, t__[1][1]); xmm2 = _mm_xor_si128(xmm2, t__[1][2]); xmm3 = _mm_xor_si128(xmm3, t__[1][3]); t__[1]+=4;
    case 1: xmm0 = _mm_xor_si128(xmm0, t__[0][0]); xmm1 = _mm_xor_si128(xmm1, t__[0][1]); xmm2 = _mm_xor_si128(xmm2, t__[0][2]); xmm3 = _mm_xor_si128(xmm3, t__[0][3]); t__[0]+=4;
    }
    m__[0] = xmm0; m__[1] = xmm1; m__[2] = xmm2; m__[3] = xmm3;
    m__ += 4;
  }

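  /* Remaining 128-bit words, one per iteration, combined with a balanced XOR tree
     to shorten the dependency chain. */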
  for(; i < (wide >> 1); i++) {
    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm2 = _mm_xor_si128(xmm2, xmm3);
      xmm0 = _mm_xor_si128(xmm0, xmm2); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 7:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[6]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 6:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 5:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 4:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 3:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      break;
    case 2:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 1:
      xmm0 = _mm_xor_si128(*m__, *t__[0]++);
      break;
    };
    *m__++ = xmm0;
  }

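  /* If wide is odd, one 64-bit word is left over: switch the pointers back to word
     granularity and XOR it word-wise. */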
  if(wide & 0x1) {
    m = (word*)m__;
    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8: t[N-8] = (word*)t__[N-8];
    case 7: t[N-7] = (word*)t__[N-7];
    case 6: t[N-6] = (word*)t__[N-6];
    case 5: t[N-5] = (word*)t__[N-5];
    case 4: t[N-4] = (word*)t__[N-4];
    case 3: t[N-3] = (word*)t__[N-3];
    case 2: t[N-2] = (word*)t__[N-2];
    case 1: t[N-1] = (word*)t__[N-1];
    }

    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }
  return;
#else

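  /* Portable fallback without SSE2: plain word-wise XOR of all N source rows into
     the destination. */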
  for(wi_t i = 0; i < wide; i++) {
    switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }

  return;
#endif // __M4RI_HAVE_SSE2
}

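This header is a template: it is meant to be included once per value of N, with N and __M4RI_TEMPLATE_NAME set up by the including translation unit (the actual definition of __M4RI_TEMPLATE_NAME lives in m4ri/misc.h and is not shown in this listing). Below is a minimal, hypothetical sketch of such an instantiation and a caller; the generated name _mzd_combine2 and the wrapper combine_two_rows are assumptions for illustration, not taken from this listing.

/* Hypothetical instantiation sketch.  Assumes __M4RI_TEMPLATE_NAME(f) pastes the
 * current value of N onto f, so that N == 2 would yield a function named
 * _mzd_combine2; the real macro is defined in m4ri/misc.h. */
#define N 2
#include <m4ri/xor_template.h>
#undef N

/* Caller sketch: XOR two source rows t0 and t1, each `wide` words long, into the
 * destination row m, i.e. m[i] ^= t0[i] ^ t1[i] for every word i. */
void combine_two_rows(word *m, word const *t0, word const *t1, wi_t wide) {
  word const *t[2] = {t0, t1};
  _mzd_combine2(m, t, wide);
}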
Helper definitions referenced above (from m4ri/misc.h):
  __M4RI_ALIGNMENT(addr, n): return alignment of addr w.r.t. n; for example the address 17 would be 1 aligned w... (misc.h:421)
  __M4RI_UNLIKELY(cond): macro to help with branch prediction (misc.h:449)
  word (uint64_t): the typical packed data structure to represent packed bits (misc.h:87)
  wi_t (int): type of word indexes (misc.h:80)