Diffstat (limited to 'libraries/evas/src/lib/include/evas_mmx.h')
-rw-r--r-- | libraries/evas/src/lib/include/evas_mmx.h | 735
1 files changed, 735 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/include/evas_mmx.h b/libraries/evas/src/lib/include/evas_mmx.h
new file mode 100644
index 0000000..e1095e1
--- /dev/null
+++ b/libraries/evas/src/lib/include/evas_mmx.h
@@ -0,0 +1,735 @@
1 | /* mmx.h | ||
2 | |||
3 | MultiMedia eXtensions GCC interface library for IA32. | ||
4 | |||
5 | To use this library, simply include this header file | ||
6 | and compile with GCC. You MUST have inlining enabled | ||
7 | in order for mmx_ok() to work; this can be done by | ||
8 | simply using -O on the GCC command line. | ||
9 | |||
10 | Compiling with -DMMX_TRACE will cause detailed trace | ||
11 | output to be sent to stderr for each mmx operation. | ||
12 | This adds lots of code, and obviously slows execution to | ||
13 | a crawl, but can be very useful for debugging. | ||
14 | |||
15 | THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY | ||
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT | ||
17 | LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY | ||
18 | AND FITNESS FOR ANY PARTICULAR PURPOSE. | ||
19 | |||
20 | 1997-98 by H. Dietz and R. Fisher | ||
21 | |||
22 | History: | ||
23 | 97-98* R.Fisher Early versions | ||
24 | 980501 R.Fisher Original Release | ||
25 | 980611* H.Dietz Rewrite, correctly implementing inlines, and | ||
26 | R.Fisher including direct register accesses. | ||
27 | 980616 R.Fisher Release of 980611 as 980616. | ||
28 | 980714 R.Fisher Minor corrections to Makefile, etc. | ||
29 | 980715 R.Fisher mmx_ok() now prevents optimizer from using | ||
30 | clobbered values. | ||
31 | mmx_ok() now checks if cpuid instruction is | ||
32 | available before trying to use it. | ||
33 | 980726* R.Fisher mm_support() searches for AMD 3DNow, Cyrix | ||
34 | Extended MMX, and standard MMX. It returns a | ||
35 | value which is positive if any of these are | ||
36 | supported, and can be masked with constants to | ||
37 | see which. mmx_ok() is now a call to this | ||
38 | 980726* R.Fisher Added i2r support for shift functions | ||
39 | 980919 R.Fisher Fixed AMD extended feature recognition bug. | ||
40 | 980921 R.Fisher Added definition/check for _MMX_H. | ||
41 | Added "float s[2]" to mmx_t for use with | ||
42 | 3DNow and EMMX. So same mmx_t can be used. | ||
43 | 981013 R.Fisher Fixed cpuid function 1 bug (looked at wrong reg) | ||
44 | Fixed psllq_i2r error in mmxtest.c | ||
45 | |||
46 | * Unreleased (internal or interim) versions | ||
47 | |||
48 | Notes: | ||
49 | It appears that the latest gas has the pand problem fixed, therefore | ||
50 | I'll undefine BROKEN_PAND by default. | ||
51 | String compares may be quicker than the multiple test/jumps in vendor | ||
52 | test sequence in mmx_ok(), but I'm not concerned with that right now. | ||
53 | |||
54 | Acknowledgments: | ||
55 | Jussi Laako for pointing out the errors ultimately found to be | ||
56 | connected to the failure to notify the optimizer of clobbered values. | ||
57 | Roger Hardiman for reminding us that CPUID isn't everywhere, and that | ||
58 | someone may actually try to use this on a machine without CPUID. | ||
59 | Also for suggesting code for checking this. | ||
60 | Robert Dale for pointing out the AMD recognition bug. | ||
61 | Jimmy Mayfield and Carl Witty for pointing out the Intel recognition | ||
62 | bug. | ||
63 | Carl Witty for pointing out the psllq_i2r test bug. | ||
64 | */ | ||
65 | |||
66 | #ifndef _MMX_H | ||
67 | #define _MMX_H | ||
68 | |||
69 | /* Warning: at this writing, the version of GAS packaged | ||
70 | with most Linux distributions does not handle the | ||
71 | parallel AND operation mnemonic correctly. If the | ||
72 | symbol BROKEN_PAND is defined, a slower alternative | ||
73 | coding will be used. If execution of mmxtest results | ||
74 | in an illegal instruction fault, define this symbol. | ||
75 | */ | ||
76 | #undef BROKEN_PAND | ||
77 | |||
78 | |||
79 | /* The type of a value that fits in an MMX register | ||
80 | (note that long long constant values MUST be suffixed | ||
81 | by LL and unsigned long long values by ULL, lest | ||
82 | they be truncated by the compiler) | ||
83 | */ | ||
84 | typedef union { | ||
85 | long long q; /* Quadword (64-bit) value */ | ||
86 | unsigned long long uq; /* Unsigned Quadword */ | ||
87 | int d[2]; /* 2 Doubleword (32-bit) values */ | ||
88 | unsigned int ud[2]; /* 2 Unsigned Doubleword */ | ||
89 | short w[4]; /* 4 Word (16-bit) values */ | ||
90 | unsigned short uw[4]; /* 4 Unsigned Word */ | ||
91 | char b[8]; /* 8 Byte (8-bit) values */ | ||
92 | unsigned char ub[8]; /* 8 Unsigned Byte */ | ||
93 | float s[2]; /* 2 Single-precision (32-bit) values */ | ||
94 | } __attribute__ ((aligned (8))) mmx_t; | ||
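A brief usage sketch (the variable names are illustrative, not part of the header): a 64-bit constant stored into an mmx_t must carry the LL/ULL suffix, as the comment above warns, and the individual lanes can be filled or read back through the union members.

    static mmx_t half_mask = { .uq = 0x00ff00ff00ff00ffULL };  /* ULL keeps all 64 bits      */
    mmx_t v;
    v.w[0] = 1; v.w[1] = 2; v.w[2] = 3; v.w[3] = 4;            /* four 16-bit lanes of one v */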
95 | |||
96 | /* Helper functions for the instruction macros that follow... | ||
97 | (note that memory-to-register, m2r, instructions are nearly | ||
98 | as efficient as register-to-register, r2r, instructions; | ||
99 | however, memory-to-memory instructions are really simulated | ||
100 | as a convenience, and are only 1/3 as efficient) | ||
101 | */ | ||
102 | |||
103 | /* These macros are a lot simpler without the tracing... | ||
104 | */ | ||
105 | |||
106 | #define mmx_i2r(op, imm, reg) \ | ||
107 | __asm__ __volatile__ (#op " $" #imm ", %%" #reg \ | ||
108 | : /* nothing */ \ | ||
109 | : /* nothing */); | ||
110 | |||
111 | #define mmx_m2r(op, mem, reg) \ | ||
112 | __asm__ __volatile__ (#op " %0, %%" #reg \ | ||
113 | : /* nothing */ \ | ||
114 | : "m" (mem)) | ||
115 | |||
116 | #define mmx_r2m(op, reg, mem) \ | ||
117 | __asm__ __volatile__ (#op " %%" #reg ", %0" \ | ||
118 | : "=m" (mem) \ | ||
119 | : /* nothing */ ) | ||
120 | |||
121 | #define mmx_a2r(op, mem, reg) \ | ||
122 | __asm__ __volatile__ (#op " %0, %%" #reg \ | ||
123 | : /* nothing */ \ | ||
124 | : "m" (mem)) | ||
125 | |||
126 | #define mmx_r2a(op, reg, mem) \ | ||
127 | __asm__ __volatile__ (#op " %%" #reg ", %0" \ | ||
128 | : "=m" (mem) \ | ||
129 | : /* nothing */ ) | ||
130 | |||
131 | #define mmx_r2r(op, regs, regd) \ | ||
132 | __asm__ __volatile__ (#op " %" #regs ", %" #regd) | ||
133 | |||
134 | #define mmx_m2m(op, mems, memd) \ | ||
135 | __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ | ||
136 | #op " %1, %%mm0\n\t" \ | ||
137 | "movq %%mm0, %0" \ | ||
138 | : "=X" (memd) \ | ||
139 | : "X" (mems)) | ||
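For illustration only (hypothetical variables x and y), the three helper forms expand roughly as follows; the m2m form is the one that costs a load, the operation, and a store, which is where the "1/3 as efficient" note above comes from.

    mmx_t x, y;
    mmx_m2r(paddw, x, mm0);     /* roughly emits: paddw x, %mm0              (memory -> register)   */
    mmx_r2r(paddw, mm1, mm0);   /* roughly emits: paddw %mm1, %mm0           (register -> register) */
    mmx_m2m(paddw, x, y);       /* roughly emits: movq y,%mm0; paddw x,%mm0; movq %mm0,y            */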
140 | |||
141 | /* 1x64 MOVE Quadword | ||
142 | (this is both a load and a store... | ||
143 | in fact, it is the only way to store) | ||
144 | */ | ||
145 | #define movq_m2r(var, reg) mmx_m2r(movq, var, reg) | ||
146 | #define movq_r2m(reg, var) mmx_r2m(movq, reg, var) | ||
147 | #define movq_r2r(regs, regd) mmx_r2r(movq, regs, regd) | ||
148 | #define movq(vars, vard) \ | ||
149 | __asm__ __volatile__ ("movq %1, %%mm0\n\t" \ | ||
150 | "movq %%mm0, %0" \ | ||
151 | : "=X" (vard) \ | ||
152 | : "X" (vars)) | ||
153 | #define movntq_r2m(reg, var) mmx_r2m(movntq, reg, var) | ||
154 | |||
155 | |||
156 | /* 1x32 MOVE Doubleword | ||
157 | (like movq, this is both load and store... | ||
158 | but is most useful for moving things between | ||
159 | mmx registers and ordinary registers) | ||
160 | */ | ||
161 | #define movd_m2r(var, reg) mmx_a2r(movd, var, reg) | ||
162 | #define movd_r2m(reg, var) mmx_r2a(movd, reg, var) | ||
163 | #define movd_r2r(regs, regd) mmx_r2r(movd, regs, regd) | ||
164 | #define movd(vars, vard) \ | ||
165 | __asm__ __volatile__ ("movd %1, %%mm0\n\t" \ | ||
166 | "movd %%mm0, %0" \ | ||
167 | : "=X" (vard) \ | ||
168 | : "X" (vars)) | ||
169 | |||
170 | |||
171 | /* 2x32, 4x16, and 8x8 Parallel ADDs | ||
172 | */ | ||
173 | #define paddd_m2r(var, reg) mmx_m2r(paddd, var, reg) | ||
174 | #define paddd_r2r(regs, regd) mmx_r2r(paddd, regs, regd) | ||
175 | #define paddd(vars, vard) mmx_m2m(paddd, vars, vard) | ||
176 | |||
177 | #define paddw_m2r(var, reg) mmx_m2r(paddw, var, reg) | ||
178 | #define paddw_r2r(regs, regd) mmx_r2r(paddw, regs, regd) | ||
179 | #define paddw(vars, vard) mmx_m2m(paddw, vars, vard) | ||
180 | |||
181 | #define paddb_m2r(var, reg) mmx_m2r(paddb, var, reg) | ||
182 | #define paddb_r2r(regs, regd) mmx_r2r(paddb, regs, regd) | ||
183 | #define paddb(vars, vard) mmx_m2m(paddb, vars, vard) | ||
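A minimal sketch of the usual load/operate/store pattern with these macros (illustrative variables; assumes the header is included and the code is built with GCC for ia32 with MMX available):

    mmx_t a, b;
    /* ... fill a.ub[0..7] and b.ub[0..7] ... */
    movq_m2r(a, mm0);        /* mm0 = a                                     */
    paddb_m2r(b, mm0);       /* mm0 = a + b per byte, wrapping on overflow  */
    movq_r2m(mm0, a);        /* a = result                                  */
    emms();                  /* leave MMX state before any float code       */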
184 | |||
185 | |||
186 | /* 4x16 and 8x8 Parallel ADDs using Saturation arithmetic | ||
187 | */ | ||
188 | #define paddsw_m2r(var, reg) mmx_m2r(paddsw, var, reg) | ||
189 | #define paddsw_r2r(regs, regd) mmx_r2r(paddsw, regs, regd) | ||
190 | #define paddsw(vars, vard) mmx_m2m(paddsw, vars, vard) | ||
191 | |||
192 | #define paddsb_m2r(var, reg) mmx_m2r(paddsb, var, reg) | ||
193 | #define paddsb_r2r(regs, regd) mmx_r2r(paddsb, regs, regd) | ||
194 | #define paddsb(vars, vard) mmx_m2m(paddsb, vars, vard) | ||
195 | |||
196 | |||
197 | /* 4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic | ||
198 | */ | ||
199 | #define paddusw_m2r(var, reg) mmx_m2r(paddusw, var, reg) | ||
200 | #define paddusw_r2r(regs, regd) mmx_r2r(paddusw, regs, regd) | ||
201 | #define paddusw(vars, vard) mmx_m2m(paddusw, vars, vard) | ||
202 | |||
203 | #define paddusb_m2r(var, reg) mmx_m2r(paddusb, var, reg) | ||
204 | #define paddusb_r2r(regs, regd) mmx_r2r(paddusb, regs, regd) | ||
205 | #define paddusb(vars, vard) mmx_m2m(paddusb, vars, vard) | ||
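Sketch of the unsigned-saturating variant, useful for pixel math where 200 + 100 should clamp to 255 rather than wrap to 44 (illustrative names):

    mmx_t pix, gain;
    movq_m2r(pix, mm0);       /* eight 8-bit channels             */
    paddusb_m2r(gain, mm0);   /* per-byte add, clamped at 255     */
    movq_r2m(mm0, pix);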
206 | |||
207 | |||
208 | /* 2x32, 4x16, and 8x8 Parallel SUBs | ||
209 | */ | ||
210 | #define psubd_m2r(var, reg) mmx_m2r(psubd, var, reg) | ||
211 | #define psubd_r2r(regs, regd) mmx_r2r(psubd, regs, regd) | ||
212 | #define psubd(vars, vard) mmx_m2m(psubd, vars, vard) | ||
213 | |||
214 | #define psubw_m2r(var, reg) mmx_m2r(psubw, var, reg) | ||
215 | #define psubw_r2r(regs, regd) mmx_r2r(psubw, regs, regd) | ||
216 | #define psubw(vars, vard) mmx_m2m(psubw, vars, vard) | ||
217 | |||
218 | #define psubb_m2r(var, reg) mmx_m2r(psubb, var, reg) | ||
219 | #define psubb_r2r(regs, regd) mmx_r2r(psubb, regs, regd) | ||
220 | #define psubb(vars, vard) mmx_m2m(psubb, vars, vard) | ||
221 | |||
222 | |||
223 | /* 4x16 and 8x8 Parallel SUBs using Saturation arithmetic | ||
224 | */ | ||
225 | #define psubsw_m2r(var, reg) mmx_m2r(psubsw, var, reg) | ||
226 | #define psubsw_r2r(regs, regd) mmx_r2r(psubsw, regs, regd) | ||
227 | #define psubsw(vars, vard) mmx_m2m(psubsw, vars, vard) | ||
228 | |||
229 | #define psubsb_m2r(var, reg) mmx_m2r(psubsb, var, reg) | ||
230 | #define psubsb_r2r(regs, regd) mmx_r2r(psubsb, regs, regd) | ||
231 | #define psubsb(vars, vard) mmx_m2m(psubsb, vars, vard) | ||
232 | |||
233 | |||
234 | /* 4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic | ||
235 | */ | ||
236 | #define psubusw_m2r(var, reg) mmx_m2r(psubusw, var, reg) | ||
237 | #define psubusw_r2r(regs, regd) mmx_r2r(psubusw, regs, regd) | ||
238 | #define psubusw(vars, vard) mmx_m2m(psubusw, vars, vard) | ||
239 | |||
240 | #define psubusb_m2r(var, reg) mmx_m2r(psubusb, var, reg) | ||
241 | #define psubusb_r2r(regs, regd) mmx_r2r(psubusb, regs, regd) | ||
242 | #define psubusb(vars, vard) mmx_m2m(psubusb, vars, vard) | ||
243 | |||
244 | |||
245 | /* 4x16 Parallel MULs giving Low 4x16 portions of results | ||
246 | */ | ||
247 | #define pmullw_m2r(var, reg) mmx_m2r(pmullw, var, reg) | ||
248 | #define pmullw_r2r(regs, regd) mmx_r2r(pmullw, regs, regd) | ||
249 | #define pmullw(vars, vard) mmx_m2m(pmullw, vars, vard) | ||
250 | |||
251 | |||
252 | /* 4x16 Parallel MULs giving High 4x16 portions of results | ||
253 | */ | ||
254 | #define pmulhw_m2r(var, reg) mmx_m2r(pmulhw, var, reg) | ||
255 | #define pmulhw_r2r(regs, regd) mmx_r2r(pmulhw, regs, regd) | ||
256 | #define pmulhw(vars, vard) mmx_m2m(pmulhw, vars, vard) | ||
257 | |||
258 | |||
259 | /* 4x16->2x32 Parallel Mul-ADD | ||
260 | (muls like pmullw, then adds adjacent 16-bit fields | ||
261 | in the multiply result to make the final 2x32 result) | ||
262 | */ | ||
263 | #define pmaddwd_m2r(var, reg) mmx_m2r(pmaddwd, var, reg) | ||
264 | #define pmaddwd_r2r(regs, regd) mmx_r2r(pmaddwd, regs, regd) | ||
265 | #define pmaddwd(vars, vard) mmx_m2m(pmaddwd, vars, vard) | ||
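A worked example of what pmaddwd produces, assuming the hypothetical inputs a.w = {1,2,3,4} and b.w = {5,6,7,8}:

    mmx_t a = { .w = { 1, 2, 3, 4 } };
    mmx_t b = { .w = { 5, 6, 7, 8 } };
    movq_m2r(a, mm0);
    pmaddwd_m2r(b, mm0);      /* mm0.d[0] = 1*5 + 2*6 = 17, mm0.d[1] = 3*7 + 4*8 = 53 */
    movq_r2m(mm0, a);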
266 | |||
267 | |||
268 | /* 1x64 bitwise AND | ||
269 | */ | ||
270 | #ifdef BROKEN_PAND | ||
271 | #define pand_m2r(var, reg) \ | ||
272 | { \ | ||
273 | mmx_m2r(pandn, (mmx_t) -1LL, reg); \ | ||
274 | mmx_m2r(pandn, var, reg); \ | ||
275 | } | ||
276 | #define pand_r2r(regs, regd) \ | ||
277 | { \ | ||
278 | mmx_m2r(pandn, (mmx_t) -1LL, regd); \ | ||
279 | mmx_r2r(pandn, regs, regd) \ | ||
280 | } | ||
281 | #define pand(vars, vard) \ | ||
282 | { \ | ||
283 | movq_m2r(vard, mm0); \ | ||
284 | mmx_m2r(pandn, (mmx_t) -1LL, mm0); \ | ||
285 | mmx_m2r(pandn, vars, mm0); \ | ||
286 | movq_r2m(mm0, vard); \ | ||
287 | } | ||
288 | #else | ||
289 | #define pand_m2r(var, reg) mmx_m2r(pand, var, reg) | ||
290 | #define pand_r2r(regs, regd) mmx_r2r(pand, regs, regd) | ||
291 | #define pand(vars, vard) mmx_m2m(pand, vars, vard) | ||
292 | #endif | ||
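Why the BROKEN_PAND fallback works: pandn computes dest = (~dest) & src, so applying it twice reproduces a plain AND. A scalar C sketch of the identity:

    unsigned long long pand_via_pandn(unsigned long long dest, unsigned long long src)
    {
       dest = ~dest & ~0ULL;   /* first pandn, all-ones source: dest becomes ~dest      */
       dest = ~dest & src;     /* second pandn with src:        dest becomes dest & src */
       return dest;
    }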
293 | |||
294 | |||
295 | /* 1x64 bitwise AND with NOT of the destination (dest = ~dest & src) | ||
296 | */ | ||
297 | #define pandn_m2r(var, reg) mmx_m2r(pandn, var, reg) | ||
298 | #define pandn_r2r(regs, regd) mmx_r2r(pandn, regs, regd) | ||
299 | #define pandn(vars, vard) mmx_m2m(pandn, vars, vard) | ||
300 | |||
301 | |||
302 | /* 1x64 bitwise OR | ||
303 | */ | ||
304 | #define por_m2r(var, reg) mmx_m2r(por, var, reg) | ||
305 | #define por_r2r(regs, regd) mmx_r2r(por, regs, regd) | ||
306 | #define por(vars, vard) mmx_m2m(por, vars, vard) | ||
307 | |||
308 | |||
309 | /* 1x64 bitwise eXclusive OR | ||
310 | */ | ||
311 | #define pxor_m2r(var, reg) mmx_m2r(pxor, var, reg) | ||
312 | #define pxor_r2r(regs, regd) mmx_r2r(pxor, regs, regd) | ||
313 | #define pxor(vars, vard) mmx_m2m(pxor, vars, vard) | ||
314 | |||
315 | |||
316 | /* 2x32, 4x16, and 8x8 Parallel CoMPare for EQuality | ||
317 | (resulting fields are either 0 or -1) | ||
318 | */ | ||
319 | #define pcmpeqd_m2r(var, reg) mmx_m2r(pcmpeqd, var, reg) | ||
320 | #define pcmpeqd_r2r(regs, regd) mmx_r2r(pcmpeqd, regs, regd) | ||
321 | #define pcmpeqd(vars, vard) mmx_m2m(pcmpeqd, vars, vard) | ||
322 | |||
323 | #define pcmpeqw_m2r(var, reg) mmx_m2r(pcmpeqw, var, reg) | ||
324 | #define pcmpeqw_r2r(regs, regd) mmx_r2r(pcmpeqw, regs, regd) | ||
325 | #define pcmpeqw(vars, vard) mmx_m2m(pcmpeqw, vars, vard) | ||
326 | |||
327 | #define pcmpeqb_m2r(var, reg) mmx_m2r(pcmpeqb, var, reg) | ||
328 | #define pcmpeqb_r2r(regs, regd) mmx_r2r(pcmpeqb, regs, regd) | ||
329 | #define pcmpeqb(vars, vard) mmx_m2m(pcmpeqb, vars, vard) | ||
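Sketch of building a byte mask with pcmpeqb (illustrative variables): after the compare, each byte of mm0 is 0xff where the inputs matched and 0x00 where they did not, which can then feed pand/pandn to select between two sources.

    mmx_t a, b;
    movq_m2r(a, mm0);
    pcmpeqb_m2r(b, mm0);      /* mm0.ub[i] = (a.ub[i] == b.ub[i]) ? 0xff : 0x00 */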
330 | |||
331 | |||
332 | /* 2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than | ||
333 | (resulting fields are either 0 or -1) | ||
334 | */ | ||
335 | #define pcmpgtd_m2r(var, reg) mmx_m2r(pcmpgtd, var, reg) | ||
336 | #define pcmpgtd_r2r(regs, regd) mmx_r2r(pcmpgtd, regs, regd) | ||
337 | #define pcmpgtd(vars, vard) mmx_m2m(pcmpgtd, vars, vard) | ||
338 | |||
339 | #define pcmpgtw_m2r(var, reg) mmx_m2r(pcmpgtw, var, reg) | ||
340 | #define pcmpgtw_r2r(regs, regd) mmx_r2r(pcmpgtw, regs, regd) | ||
341 | #define pcmpgtw(vars, vard) mmx_m2m(pcmpgtw, vars, vard) | ||
342 | |||
343 | #define pcmpgtb_m2r(var, reg) mmx_m2r(pcmpgtb, var, reg) | ||
344 | #define pcmpgtb_r2r(regs, regd) mmx_r2r(pcmpgtb, regs, regd) | ||
345 | #define pcmpgtb(vars, vard) mmx_m2m(pcmpgtb, vars, vard) | ||
346 | |||
347 | |||
348 | /* 1x64, 2x32, and 4x16 Parallel Shift Left Logical | ||
349 | */ | ||
350 | #define psllq_i2r(imm, reg) mmx_i2r(psllq, imm, reg) | ||
351 | #define psllq_m2r(var, reg) mmx_m2r(psllq, var, reg) | ||
352 | #define psllq_r2r(regs, regd) mmx_r2r(psllq, regs, regd) | ||
353 | #define psllq(vars, vard) mmx_m2m(psllq, vars, vard) | ||
354 | |||
355 | #define pslld_i2r(imm, reg) mmx_i2r(pslld, imm, reg) | ||
356 | #define pslld_m2r(var, reg) mmx_m2r(pslld, var, reg) | ||
357 | #define pslld_r2r(regs, regd) mmx_r2r(pslld, regs, regd) | ||
358 | #define pslld(vars, vard) mmx_m2m(pslld, vars, vard) | ||
359 | |||
360 | #define psllw_i2r(imm, reg) mmx_i2r(psllw, imm, reg) | ||
361 | #define psllw_m2r(var, reg) mmx_m2r(psllw, var, reg) | ||
362 | #define psllw_r2r(regs, regd) mmx_r2r(psllw, regs, regd) | ||
363 | #define psllw(vars, vard) mmx_m2m(psllw, vars, vard) | ||
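Sketch of the immediate-count form (the imm argument is pasted directly into the instruction, so it must be a literal constant):

    mmx_t v;
    movq_m2r(v, mm2);
    psllq_i2r(8, mm2);        /* shift the whole 64-bit value left by 8 bits */
    movq_r2m(mm2, v);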
364 | |||
365 | |||
366 | /* 1x64, 2x32, and 4x16 Parallel Shift Right Logical | ||
367 | */ | ||
368 | #define psrlq_i2r(imm, reg) mmx_i2r(psrlq, imm, reg) | ||
369 | #define psrlq_m2r(var, reg) mmx_m2r(psrlq, var, reg) | ||
370 | #define psrlq_r2r(regs, regd) mmx_r2r(psrlq, regs, regd) | ||
371 | #define psrlq(vars, vard) mmx_m2m(psrlq, vars, vard) | ||
372 | |||
373 | #define psrld_i2r(imm, reg) mmx_i2r(psrld, imm, reg) | ||
374 | #define psrld_m2r(var, reg) mmx_m2r(psrld, var, reg) | ||
375 | #define psrld_r2r(regs, regd) mmx_r2r(psrld, regs, regd) | ||
376 | #define psrld(vars, vard) mmx_m2m(psrld, vars, vard) | ||
377 | |||
378 | #define psrlw_i2r(imm, reg) mmx_i2r(psrlw, imm, reg) | ||
379 | #define psrlw_m2r(var, reg) mmx_m2r(psrlw, var, reg) | ||
380 | #define psrlw_r2r(regs, regd) mmx_r2r(psrlw, regs, regd) | ||
381 | #define psrlw(vars, vard) mmx_m2m(psrlw, vars, vard) | ||
382 | |||
383 | |||
384 | /* 2x32 and 4x16 Parallel Shift Right Arithmetic | ||
385 | */ | ||
386 | #define psrad_i2r(imm, reg) mmx_i2r(psrad, imm, reg) | ||
387 | #define psrad_m2r(var, reg) mmx_m2r(psrad, var, reg) | ||
388 | #define psrad_r2r(regs, regd) mmx_r2r(psrad, regs, regd) | ||
389 | #define psrad(vars, vard) mmx_m2m(psrad, vars, vard) | ||
390 | |||
391 | #define psraw_i2r(imm, reg) mmx_i2r(psraw, imm, reg) | ||
392 | #define psraw_m2r(var, reg) mmx_m2r(psraw, var, reg) | ||
393 | #define psraw_r2r(regs, regd) mmx_r2r(psraw, regs, regd) | ||
394 | #define psraw(vars, vard) mmx_m2m(psraw, vars, vard) | ||
395 | |||
396 | |||
397 | /* 2x32->4x16 and 4x16->8x8 PACK and Signed Saturate | ||
398 | (packs source and dest fields into dest in that order) | ||
399 | */ | ||
400 | #define packssdw_m2r(var, reg) mmx_m2r(packssdw, var, reg) | ||
401 | #define packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd) | ||
402 | #define packssdw(vars, vard) mmx_m2m(packssdw, vars, vard) | ||
403 | |||
404 | #define packsswb_m2r(var, reg) mmx_m2r(packsswb, var, reg) | ||
405 | #define packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd) | ||
406 | #define packsswb(vars, vard) mmx_m2m(packsswb, vars, vard) | ||
407 | |||
408 | |||
409 | /* 4x16->8x8 PACK and Unsigned Saturate | ||
410 | (packs source and dest fields into dest in that order) | ||
411 | */ | ||
412 | #define packuswb_m2r(var, reg) mmx_m2r(packuswb, var, reg) | ||
413 | #define packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd) | ||
414 | #define packuswb(vars, vard) mmx_m2m(packuswb, vars, vard) | ||
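Sketch of narrowing 16-bit results back to bytes: with the r2r form below, the four words already in mm0 become the low four bytes of the result and the four words of mm1 become the high four, each clamped to 0..255 (register choice is illustrative):

    packuswb_r2r(mm1, mm0);   /* mm0 = 8 bytes: mm0's words in the low half, mm1's in the high half, unsigned-saturated */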
415 | |||
416 | |||
417 | /* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low | ||
418 | (interleaves low half of dest with low half of source | ||
419 | as padding in each result field) | ||
420 | */ | ||
421 | #define punpckldq_m2r(var, reg) mmx_m2r(punpckldq, var, reg) | ||
422 | #define punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd) | ||
423 | #define punpckldq(vars, vard) mmx_m2m(punpckldq, vars, vard) | ||
424 | |||
425 | #define punpcklwd_m2r(var, reg) mmx_m2r(punpcklwd, var, reg) | ||
426 | #define punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd) | ||
427 | #define punpcklwd(vars, vard) mmx_m2m(punpcklwd, vars, vard) | ||
428 | |||
429 | #define punpcklbw_m2r(var, reg) mmx_m2r(punpcklbw, var, reg) | ||
430 | #define punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd) | ||
431 | #define punpcklbw(vars, vard) mmx_m2m(punpcklbw, vars, vard) | ||
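A common idiom these unpack macros enable (a sketch, not something defined by this header): interleaving with a zeroed register widens packed bytes to words, e.g. before pmullw/pmulhw arithmetic.

    mmx_t pixels;
    pxor_r2r(mm7, mm7);        /* mm7 = 0                                            */
    movq_m2r(pixels, mm0);     /* eight bytes                                        */
    punpcklbw_r2r(mm7, mm0);   /* low four bytes -> four zero-extended 16-bit words  */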
432 | |||
433 | |||
434 | /* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High | ||
435 | (interleaves high half of dest with high half of source | ||
436 | as padding in each result field) | ||
437 | */ | ||
438 | #define punpckhdq_m2r(var, reg) mmx_m2r(punpckhdq, var, reg) | ||
439 | #define punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd) | ||
440 | #define punpckhdq(vars, vard) mmx_m2m(punpckhdq, vars, vard) | ||
441 | |||
442 | #define punpckhwd_m2r(var, reg) mmx_m2r(punpckhwd, var, reg) | ||
443 | #define punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd) | ||
444 | #define punpckhwd(vars, vard) mmx_m2m(punpckhwd, vars, vard) | ||
445 | |||
446 | #define punpckhbw_m2r(var, reg) mmx_m2r(punpckhbw, var, reg) | ||
447 | #define punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd) | ||
448 | #define punpckhbw(vars, vard) mmx_m2m(punpckhbw, vars, vard) | ||
449 | |||
450 | #define MOVE_8DWORDS_MMX(src,dst) \ | ||
451 | __asm__ ( \ | ||
452 | "movq (%1), %%mm0 \n" \ | ||
453 | "movq 0x8(%1), %%mm1 \n" \ | ||
454 | "movq 0x10(%1), %%mm2 \n" \ | ||
455 | "movq 0x18(%1), %%mm3 \n" \ | ||
456 | "movq %%mm0, (%0) \n" \ | ||
457 | "movq %%mm1, 0x8(%0) \n" \ | ||
458 | "movq %%mm2, 0x10(%0) \n" \ | ||
459 | "movq %%mm3, 0x18(%0) \n" \ | ||
460 | : \ | ||
461 | : "q" (dst), "r" (src) \ | ||
462 | : "memory", "st"); | ||
463 | |||
464 | #define MOVE_10DWORDS_MMX(src,dst) \ | ||
465 | __asm__ ( \ | ||
466 | "movq (%1), %%mm0 \n" \ | ||
467 | "movq 0x8(%1), %%mm1 \n" \ | ||
468 | "movq 0x10(%1), %%mm2 \n" \ | ||
469 | "movq 0x18(%1), %%mm3 \n" \ | ||
470 | "movq 0x20(%1), %%mm4 \n" \ | ||
471 | "movq %%mm0, (%0) \n" \ | ||
472 | "movq %%mm1, 0x8(%0) \n" \ | ||
473 | "movq %%mm2, 0x10(%0) \n" \ | ||
474 | "movq %%mm3, 0x18(%0) \n" \ | ||
475 | "movq %%mm4, 0x20(%0) \n" \ | ||
476 | : \ | ||
477 | : "q" (dst), "r" (src) \ | ||
478 | : "memory", "st"); | ||
479 | |||
480 | #define MOVE_16DWORDS_MMX(src,dst) \ | ||
481 | __asm__ ( \ | ||
482 | "movq (%1), %%mm0 \n" \ | ||
483 | "movq 0x8(%1), %%mm1 \n" \ | ||
484 | "movq 0x10(%1), %%mm2 \n" \ | ||
485 | "movq 0x18(%1), %%mm3 \n" \ | ||
486 | "movq 0x20(%1), %%mm4 \n" \ | ||
487 | "movq 0x28(%1), %%mm5 \n" \ | ||
488 | "movq 0x30(%1), %%mm6 \n" \ | ||
489 | "movq 0x38(%1), %%mm7 \n" \ | ||
490 | "movq %%mm0, (%0) \n" \ | ||
491 | "movq %%mm1, 0x8(%0) \n" \ | ||
492 | "movq %%mm2, 0x10(%0) \n" \ | ||
493 | "movq %%mm3, 0x18(%0) \n" \ | ||
494 | "movq %%mm4, 0x20(%0) \n" \ | ||
495 | "movq %%mm5, 0x28(%0) \n" \ | ||
496 | "movq %%mm6, 0x30(%0) \n" \ | ||
497 | "movq %%mm7, 0x38(%0) \n" \ | ||
498 | : \ | ||
499 | : "q" (dst), "r" (src) \ | ||
500 | : "memory", "st"); | ||
501 | |||
502 | #define MOVE_16DWORDS_MMX2(src,dst) \ | ||
503 | __asm__ ( \ | ||
504 | "movq (%1), %%mm0 \n" \ | ||
505 | "movq 0x8(%1), %%mm1 \n" \ | ||
506 | "movq 0x10(%1), %%mm2 \n" \ | ||
507 | "movq 0x18(%1), %%mm3 \n" \ | ||
508 | "movq 0x20(%1), %%mm4 \n" \ | ||
509 | "movq 0x28(%1), %%mm5 \n" \ | ||
510 | "movq 0x30(%1), %%mm6 \n" \ | ||
511 | "movq 0x38(%1), %%mm7 \n" \ | ||
512 | "movntq %%mm0, (%0) \n" \ | ||
513 | "movntq %%mm1, 0x8(%0) \n" \ | ||
514 | "movntq %%mm2, 0x10(%0) \n" \ | ||
515 | "movntq %%mm3, 0x18(%0) \n" \ | ||
516 | "movntq %%mm4, 0x20(%0) \n" \ | ||
517 | "movntq %%mm5, 0x28(%0) \n" \ | ||
518 | "movntq %%mm6, 0x30(%0) \n" \ | ||
519 | "movntq %%mm7, 0x38(%0) \n" \ | ||
520 | : \ | ||
521 | : "q" (dst), "r" (src) \ | ||
522 | : "memory", "st"); | ||
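A minimal copy-loop sketch using the non-temporal variant, assuming the destination will not be read again soon (movntq bypasses the cache) and that the buffers are multiples of 64 bytes; sfence() and emms() are the macros defined further down:

    static void copy64(unsigned char *dst, const unsigned char *src, int blocks)
    {
       int i;

       for (i = 0; i < blocks; i++, src += 64, dst += 64)
          MOVE_16DWORDS_MMX2(src, dst);   /* 16 dwords = 64 bytes per iteration    */
       sfence();                          /* drain the write-combining stores      */
       emms();                            /* MMX was used: reset the FPU tag word  */
    }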
523 | |||
524 | #define MOVE_32DWORDS_SSE2(src,dst) \ | ||
525 | __asm__ ( \ | ||
526 | "movdqu (%1), %%xmm0 \n" \ | ||
527 | "movdqu 0x10(%1), %%xmm1 \n" \ | ||
528 | "movdqu 0x20(%1), %%xmm2 \n" \ | ||
529 | "movdqu 0x30(%1), %%xmm3 \n" \ | ||
530 | "movdqu 0x40(%1), %%xmm4 \n" \ | ||
531 | "movdqu 0x50(%1), %%xmm5 \n" \ | ||
532 | "movdqu 0x60(%1), %%xmm6 \n" \ | ||
533 | "movdqu 0x70(%1), %%xmm7 \n" \ | ||
534 | "movntdq %%xmm0, (%0) \n" \ | ||
535 | "movntdq %%xmm1, 0x10(%0) \n" \ | ||
536 | "movntdq %%xmm2, 0x20(%0) \n" \ | ||
537 | "movntdq %%xmm3, 0x30(%0) \n" \ | ||
538 | "movntdq %%xmm4, 0x40(%0) \n" \ | ||
539 | "movntdq %%xmm5, 0x50(%0) \n" \ | ||
540 | "movntdq %%xmm6, 0x60(%0) \n" \ | ||
541 | "movntdq %%xmm7, 0x70(%0) \n" \ | ||
542 | : \ | ||
543 | : "q" (dst), "r" (src) \ | ||
544 | : "memory", "st"); | ||
545 | |||
546 | #define MOVE_32DWORDS_ALIGNED_SSE2(src,dst) \ | ||
547 | __asm__ ( \ | ||
548 | "movdqa (%1), %%xmm0 \n" \ | ||
549 | "movdqa 0x10(%1), %%xmm1 \n" \ | ||
550 | "movdqa 0x20(%1), %%xmm2 \n" \ | ||
551 | "movdqa 0x30(%1), %%xmm3 \n" \ | ||
552 | "movdqa 0x40(%1), %%xmm4 \n" \ | ||
553 | "movdqa 0x50(%1), %%xmm5 \n" \ | ||
554 | "movdqa 0x60(%1), %%xmm6 \n" \ | ||
555 | "movdqa 0x70(%1), %%xmm7 \n" \ | ||
556 | "movntdq %%xmm0, (%0) \n" \ | ||
557 | "movntdq %%xmm1, 0x10(%0) \n" \ | ||
558 | "movntdq %%xmm2, 0x20(%0) \n" \ | ||
559 | "movntdq %%xmm3, 0x30(%0) \n" \ | ||
560 | "movntdq %%xmm4, 0x40(%0) \n" \ | ||
561 | "movntdq %%xmm5, 0x50(%0) \n" \ | ||
562 | "movntdq %%xmm6, 0x60(%0) \n" \ | ||
563 | "movntdq %%xmm7, 0x70(%0) \n" \ | ||
564 | : \ | ||
565 | : "q" (dst), "r" (src) \ | ||
566 | : "memory", "st"); | ||
567 | |||
568 | /* Empty MMX State | ||
569 | (used to clean up when going from mmx to float use | ||
570 | of the registers that are shared by both; note that | ||
571 | there is no float-to-mmx operation needed, because | ||
572 | only the float tag word info is corruptible) | ||
573 | */ | ||
574 | |||
575 | #define emms() __asm__ __volatile__ ("emms":::"memory") | ||
576 | #define sfence() __asm__ __volatile__ ("sfence":::"memory") | ||
577 | |||
578 | /* additions to detect mmx - */ | ||
579 | /* Raster <raster@rasterman.com> */ | ||
580 | |||
581 | #define CPUID_MMX (1 << 23) /* flags: mmx */ | ||
582 | #define CPUID_SSE (1 << 25) /* flags: xmm */ | ||
583 | #define CPUID_SSE2 (1 << 26) /* flags: sse2 */ | ||
584 | |||
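A hypothetical sketch of how these masks would be applied to the EDX value of CPUID leaf 1; the probe itself is not provided by this header (the have_cpuid/get_cpuid code below is commented out), so only the bit tests are shown:

    unsigned int edx_features = 0;   /* EDX from CPUID function 1, obtained elsewhere */

    if (edx_features & CPUID_MMX)  { /* take the MMX path  */ }
    if (edx_features & CPUID_SSE)  { /* take the SSE path  */ }
    if (edx_features & CPUID_SSE2) { /* take the SSE2 path */ }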
585 | /* | ||
586 | #ifdef __amd64 | ||
587 | #define have_cpuid(cpuid_ret) \ | ||
588 | __asm__ __volatile__ ( \ | ||
589 | ".align 32 \n" \ | ||
590 | " pushq %%rbx \n" \ | ||
591 | " pushfq \n" \ | ||
592 | " popq %%rax \n" \ | ||
593 | " movq %%rax, %%rbx \n" \ | ||
594 | " xorq $0x200000, %%rax \n" \ | ||
595 | " pushq %%rax \n" \ | ||
596 | " popfq \n" \ | ||
597 | " pushfq \n" \ | ||
598 | " popq %%rax \n" \ | ||
599 | " cmpq %%rax, %%rbx \n" \ | ||
600 | " je 1f \n" \ | ||
601 | " movl $1, %0 \n" \ | ||
602 | " jmp 2f \n" \ | ||
603 | "1: \n" \ | ||
604 | " movl $0, %0 \n" \ | ||
605 | "2: \n" \ | ||
606 | " popq %%rbx \n" \ | ||
607 | : "=m" (cpuid_ret) \ | ||
608 | ); | ||
609 | |||
610 | #define get_cpuid(cpuid_ret) \ | ||
611 | __asm__ __volatile__ ( \ | ||
612 | ".align 32 \n" \ | ||
613 | " pushq %%rax \n" \ | ||
614 | " movl $1, %%eax \n" \ | ||
615 | " cpuid \n" \ | ||
616 | " test $0x00800000, %%edx\n" \ | ||
617 | "1: \n" \ | ||
618 | " movl %%edx, %0 \n" \ | ||
619 | " jmp 2f \n" \ | ||
620 | "2: \n" \ | ||
621 | " movl $0, %0 \n" \ | ||
622 | " popq %%rax \n" \ | ||
623 | : "=m" (cpuid_ret) \ | ||
624 | ); | ||
625 | #else | ||
626 | #define have_cpuid(cpuid_ret) \ | ||
627 | __asm__ __volatile__ ( \ | ||
628 | ".align 32 \n" \ | ||
629 | " pushl %%ebx \n" \ | ||
630 | " pushfl \n" \ | ||
631 | " popl %%eax \n" \ | ||
632 | " movl %%eax, %%ebx \n" \ | ||
633 | " xorl $0x200000, %%eax \n" \ | ||
634 | " pushl %%eax \n" \ | ||
635 | " popfl \n" \ | ||
636 | " pushfl \n" \ | ||
637 | " popl %%eax \n" \ | ||
638 | " cmpl %%eax, %%ebx \n" \ | ||
639 | " je 1f \n" \ | ||
640 | " movl $1, %0 \n" \ | ||
641 | " jmp 2f \n" \ | ||
642 | "1: \n" \ | ||
643 | " movl $0, %0 \n" \ | ||
644 | "2: \n" \ | ||
645 | " popl %%ebx \n" \ | ||
646 | : "=m" (cpuid_ret) \ | ||
647 | ); | ||
648 | |||
649 | #define get_cpuid(cpuid_ret) \ | ||
650 | __asm__ __volatile__ ( \ | ||
651 | ".align 32 \n" \ | ||
652 | " pushl %%eax \n" \ | ||
653 | " movl $1, %%eax \n" \ | ||
654 | " cpuid \n" \ | ||
655 | " test $0x00800000, %%edx\n" \ | ||
656 | "1: \n" \ | ||
657 | " movl %%edx, %0 \n" \ | ||
658 | " jmp 2f \n" \ | ||
659 | "2: \n" \ | ||
660 | " movl $0, %0 \n" \ | ||
661 | " popl %%eax \n" \ | ||
662 | : "=m" (cpuid_ret) \ | ||
663 | ); | ||
664 | #endif | ||
665 | */ | ||
666 | |||
667 | #define prefetch(var) \ | ||
668 | __asm__ __volatile__ ( \ | ||
669 | "prefetchnta (%0) \n" \ | ||
670 | : \ | ||
671 | : "r" (var) \ | ||
672 | ); | ||
673 | #define prefetch0(var) \ | ||
674 | __asm__ __volatile__ ( \ | ||
675 | "prefetcht0 (%0) \n" \ | ||
676 | : \ | ||
677 | : "r" (var) \ | ||
678 | ); | ||
679 | #define prefetch1(var) \ | ||
680 | __asm__ __volatile__ ( \ | ||
681 | "prefetcht1 (%0) \n" \ | ||
682 | : \ | ||
683 | : "r" (var) \ | ||
684 | ); | ||
685 | #define prefetch2(var) \ | ||
686 | __asm__ __volatile__ ( \ | ||
687 | "prefetcht2 (%0) \n" \ | ||
688 | : \ | ||
689 | : "r" (var) \ | ||
690 | ); | ||
691 | #define pshufw(r1, r2, imm) \ | ||
692 | __asm__ __volatile__ ( \ | ||
693 | "pshufw $" #imm ", %" #r1 ", %" #r2 " \n" \ | ||
694 | ); | ||
695 | |||
696 | #define pshufhw(r1, r2, imm) \ | ||
697 | __asm__ __volatile__ ( \ | ||
698 | "pshufhw $" #imm ", %" #r1 ", %" #r2 " \n" \ | ||
699 | ); | ||
700 | |||
701 | #define pshuflw(r1, r2, imm) \ | ||
702 | __asm__ __volatile__ ( \ | ||
703 | "pshuflw $" #imm ", %" #r1 ", %" #r2 " \n" \ | ||
704 | ); | ||
705 | #define pshufd(r1, r2, imm) \ | ||
706 | __asm__ __volatile__ ( \ | ||
707 | "pshufd $" #imm ", %" #r1 ", %" #r2 " \n" \ | ||
708 | ); | ||
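Sketch of the shuffle form: the immediate selects source words, so 0x00 broadcasts the lowest word of the first register into all four lanes of the second (note the macro pastes the registers in AT&T order, source first, destination second):

    pshufw(mm0, mm1, 0x00);   /* mm1.w[0..3] = mm0.w[0] replicated four times */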
709 | |||
710 | /* 1x128 MOVE Double Quadword | ||
711 | (this is both a load and a store... | ||
712 | in fact, it is the only way to store) | ||
713 | */ | ||
714 | #define movdqu_m2r(var, reg) mmx_m2r(movdqu, var, reg) | ||
715 | #define movdqu_r2m(reg, var) mmx_r2m(movdqu, reg, var) | ||
716 | #define movdqu_r2r(regs, regd) mmx_r2r(movdqu, regs, regd) | ||
717 | #define movdqu(vars, vard) \ | ||
718 | __asm__ __volatile__ ("movdqu %1, %%xmm0\n\t" \ | ||
719 | "movdqu %%xmm0, %0" \ | ||
720 | : "=X" (vard) \ | ||
721 | : "X" (vars)) | ||
722 | #define movdqa_m2r(var, reg) mmx_m2r(movdqa, var, reg) | ||
723 | #define movdqa_r2m(reg, var) mmx_r2m(movdqa, reg, var) | ||
724 | #define movdqa_r2r(regs, regd) mmx_r2r(movdqa, regs, regd) | ||
725 | #define movdqa(vars, vard) \ | ||
726 | __asm__ __volatile__ ("movdqa %1, %%xmm0\n\t" \ | ||
727 | "movdqa %%xmm0, %0" \ | ||
728 | : "=X" (vard) \ | ||
729 | : "X" (vars)) | ||
730 | #define movntdq_r2m(reg, var) mmx_r2m(movntdq, reg, var) | ||
731 | |||
732 | |||
733 | /* end additions */ | ||
734 | |||
735 | #endif | ||