/*
    TiMidity++ -- MIDI to WAVE converter and player
    Copyright (C) 1999-2002 Masanao Izumo <mo@goice.co.jp>
    Copyright (C) 1995 Tuukka Toivonen <tt@cgs.fi>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
*/
#ifndef OPTCODE_H_INCLUDED
#define OPTCODE_H_INCLUDED 1

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmacro-redefined"
#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__X86__) || defined(__I86__)
#define IX86CPU 1
#endif

#if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__) || defined(__x86_64__)
#define AMD64CPU 1
#define IX64CPU 1
#endif

#if defined(_IA64_) || defined(__IA64__) || defined(__I64__)
#define IA64CPU 1
#endif

#if OPT_MODE == 1 && !defined(IX86CPU)
#undef OPT_MODE
#define OPT_MODE 0
#endif

#if OPT_MODE == * && !defined(AMD64CPU)
#undef OPT_MODE
#define OPT_MODE 0
#endif

#if OPT_MODE == * && !defined(IA64CPU)
#undef OPT_MODE
#define OPT_MODE 0
#endif

#if OPT_MODE == * && !defined(ARMCPU)
#undef OPT_MODE
#define OPT_MODE 0
#endif

#if OPT_MODE == * && !defined(ARM64CPU)
#undef OPT_MODE
#define OPT_MODE 0
#endif
/*****************************************************************************/

/*
A test at using intrinsics; apparently they can be used with gcc as well.
Because support for CPU extensions and asm/intrin support both differ
between build environments, arch_ext_asm/intrin can be specified individually.
When both asm and intrin are available, asm is used in preference.
When both x86_ext and x86_AMD_ext are available, x86_AMD_ext is used in preference.
The intrinsics are, with a few exceptions, common to x86/x64, so
USE_X86_EXT_INTRIN is enabled for x64 as well.
x64-only instructions are distinguished by USE_X64_EXT_INTRIN, IX64CPU, etc. (gather etc.)

The branch order is:
1 OPT_MODE or USE_X86_AMD_EXT_ASM or USE_X64_AMD_EXT_ASM
2 OPT_MODE or USE_X86_EXT_ASM or USE_X64_EXT_ASM
3 USE_X64_AMD_EXT_INTRIN
5 USE_X86_AMD_EXT_INTRIN

Not sure about the AMD side... it is probably wrong and needs fixing
(not needed at the moment, and there are no plans to use it either).
The feature-support checks are all rather dubious (optcode.c is_x86ext_available() is unused).
How to handle the relationship with OPT_MODE... (for now OPT_MODE takes precedence).
If this were consolidated: make 1 = x86 asm / no intrin and shift the rest down,
change the conditions under which intrin becomes unsupported, rename _EXT to _OPT, etc.
There is no build environment with AVX2 or later, so behavior is unverified (VC2013? or later).
*/
#define USE_PENTIUM_4 // for pentium 4 (northwood steppingA) float/double denormal fix

#if !defined(IX86CPU)
#undef USE_PENTIUM_4
#endif
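/* Background note (explanation added here, not in the original source): on
   early Pentium 4 steppings, arithmetic on denormal floats/doubles is
   extremely slow. One common fix is setting the FTZ (flush-to-zero) bit in
   MXCSR via the standard SSE intrinsic:

       #include <xmmintrin.h>
       _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);   // FTZ on

   How USE_PENTIUM_4 is actually consumed lies outside this header, so
   whether it uses this MXCSR approach or another workaround is an
   assumption here. */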
//#define USE_SSE // for testing
//#define USE_SSE2 // for testing
//#define USE_SSE3 // for testing
//#define USE_SSSE3 // for testing
//#define USE_SSE41 // for testing
//#define USE_SSE42 // for testing
//#define USE_AVX // for testing
//#define USE_AVX2 // for testing
//#define USE_AVX512 // for testing
/* x86 extension define */

/*
Specify the extension to use (each level includes the lower-level extensions):

USE_SSE // include MMX2
USE_SSE42 (SSE4.2 // include POPCNT
USE_SSE4 (SSE4.1 +SSE4.2
USE_AVX // include PCLMULQDQ
USE_AVX2 // include FMA BMI1 BMI2 F16C RDRAND
USE_AVX512 // F, CD, VL, DQ, BW
*/
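/* Example (illustrative; the file name and compiler flags below are
   assumptions, the USE_* names are the ones defined in this header): to get
   SSE4.2 code paths, define USE_SSE42 before this point, e.g. on the
   compiler command line:

       cc -c -DUSE_SSE42 -msse4.2 foo.c

   The cascades below then resolve USE_X86_EXT_INTRIN (and, on compilers
   with inline-asm support, USE_X86_EXT_ASM) to level 7; SSE4.1/SSSE3/...
   are implied rather than separately defined. */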
/* x86 AMD extension define */

/*
Specify the extension to use (each level includes the lower-level extensions).
Specify the matching x86 extension as well.

USE_3DNOW_ENH (3DNow+
USE_3DNOW_PRO (3DNow?
*/
// x86 extension number

// x86 AMD extension number
X86_AMD_EXT_NONE = 0,
#if defined(__GNUC__) || defined(__MINGW32__) /* target specific option mismatch... */

#if !defined(__AVX512F__) || !defined(__AVX512CD__) || !defined(__AVX512VL__) || !defined(__AVX512DQ__) || !defined(__AVX512BW__)
#undef USE_AVX512
#endif

#endif /* __GNUC__ */
#if defined(USE_AVX512) // _MSC_VER >= 1910 VC2017?
#define USE_X86_EXT_INTRIN 10 // F, CD, VL, DQ, BW
#elif defined(USE_AVX2) // _MSC_VER >= 1700 VC2013?
#define USE_X86_EXT_INTRIN 9
#elif defined(USE_AVX) // _MSC_VER >= 1600 VC2010?
#define USE_X86_EXT_INTRIN 8
#elif defined(USE_SSE42) || defined(USE_SSE4)
#define USE_X86_EXT_INTRIN 7
#elif defined(USE_SSE41) // _MSC_VER >= 1500 VC2008?
#define USE_X86_EXT_INTRIN 6
#elif defined(USE_SSSE3)
#define USE_X86_EXT_INTRIN 5
#elif defined(USE_SSE3) // _MSC_VER >= 1400?? VC2005?
#define USE_X86_EXT_INTRIN 4
#elif defined(USE_SSE2)
#define USE_X86_EXT_INTRIN 3
#elif defined(USE_SSE) || defined(USE_MMX2)
#define USE_X86_EXT_INTRIN 2 // include MMX2
#elif defined(USE_MMX) // _MSC_VER >= 1310 VC2003?
#define USE_X86_EXT_INTRIN 1
#else
#define USE_X86_EXT_INTRIN 0
#endif
#if (USE_X86_EXT_INTRIN >= 4)
#if defined(USE_AVX512) // _MSC_VER >= 1910 VC2017?
#define USE_X64_EXT_INTRIN 10 // F, CD, VL, DQ, BW
#elif defined(USE_AVX2) // _MSC_VER >= 1700 VC2013?
#define USE_X64_EXT_INTRIN 9
#elif defined(USE_AVX) // _MSC_VER >= 1600 VC2010?
#define USE_X64_EXT_INTRIN 8
#elif defined(USE_SSE42) || defined(USE_SSE4)
#define USE_X64_EXT_INTRIN 7
#elif defined(USE_SSE41) // _MSC_VER >= 1500 VC2008?
#define USE_X64_EXT_INTRIN 6
#elif defined(USE_SSSE3)
#define USE_X64_EXT_INTRIN 5
#elif defined(USE_SSE3) // _MSC_VER >= 1400?? VC2005?
#define USE_X64_EXT_INTRIN 4
#elif defined(USE_SSE2)
#define USE_X64_EXT_INTRIN 3
#elif defined(USE_SSE) || defined(USE_MMX2)
#define USE_X64_EXT_INTRIN 2 // include MMX2
#elif defined(USE_MMX) // _MSC_VER >= 1310 VC2003?
#define USE_X64_EXT_INTRIN 1
#else
#define USE_X64_EXT_INTRIN 0
#endif
#if defined(USE_SSE5) // _MSC_VER >= 1700 VC2012?
#define USE_X86_AMD_EXT_INTRIN 6
#elif defined(USE_SSE4A) // _MSC_VER >= 1600 VC2010?
#define USE_X86_AMD_EXT_INTRIN 5
#elif defined(USE_3DNOW_PRO)
#define USE_X86_AMD_EXT_INTRIN 4
#elif defined(USE_3DNOW_ENH)
#define USE_X86_AMD_EXT_INTRIN 3
#elif defined(USE_3DNOW)
#define USE_X86_AMD_EXT_INTRIN 2
#elif defined(USE_MMX_EXT)
#define USE_X86_AMD_EXT_INTRIN 1
#else
#define USE_X86_AMD_EXT_INTRIN 0
#endif
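/* Worked example (illustrative): with only USE_SSE42 defined on an x64
   build, the cascades above resolve to

       USE_X86_EXT_INTRIN == 7
       USE_X64_EXT_INTRIN == 7
       USE_X86_AMD_EXT_INTRIN == 0

   i.e. one level number per vector-extension generation, with 0 meaning
   "no extension requested". The force-disable block further down then
   zeroes the X64/X86 values on targets where they cannot apply. */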
#if (defined(_MSC_VER) && _MSC_VER >= 1920 && !defined(__clang__)) || defined(__INTEL_COMPILER)
#if defined(USE_AVX512)
#define USE_X86_EXT_ASM 10 // F, CD, VL, DQ, BW
#elif defined(USE_AVX2)
#define USE_X86_EXT_ASM 9
#elif defined(USE_AVX)
#define USE_X86_EXT_ASM 8
#elif defined(USE_SSE42) || defined(USE_SSE4)
#define USE_X86_EXT_ASM 7
#elif defined(USE_SSE41)
#define USE_X86_EXT_ASM 6
#elif defined(USE_SSSE3)
#define USE_X86_EXT_ASM 5
#elif defined(USE_SSE3)
#define USE_X86_EXT_ASM 4
#elif defined(USE_SSE2)
#define USE_X86_EXT_ASM 3
#elif defined(USE_SSE) || defined(USE_MMX2)
#define USE_X86_EXT_ASM 2 // include MMX2
#elif defined(USE_MMX)
#define USE_X86_EXT_ASM 1
#else
#define USE_X86_EXT_ASM 0
#endif
#if defined(USE_AVX512)
#define USE_X64_EXT_ASM 10 // F, CD, VL, DQ, BW
#elif defined(USE_AVX2)
#define USE_X64_EXT_ASM 9
#elif defined(USE_AVX)
#define USE_X64_EXT_ASM 8
#elif defined(USE_SSE42) || defined(USE_SSE4)
#define USE_X64_EXT_ASM 7
#elif defined(USE_SSE41)
#define USE_X64_EXT_ASM 6
#elif defined(USE_SSSE3)
#define USE_X64_EXT_ASM 5
#elif defined(USE_SSE3)
#define USE_X64_EXT_ASM 4
#elif defined(USE_SSE2)
#define USE_X64_EXT_ASM 3
#elif defined(USE_SSE) || defined(USE_MMX2)
#define USE_X64_EXT_ASM 2 // include MMX2
#elif defined(USE_MMX)
#define USE_X64_EXT_ASM 1
#else
#define USE_X64_EXT_ASM 0
#endif
#if defined(USE_SSE4A)
#define USE_X86_AMD_EXT_ASM 5
#elif defined(USE_3DNOW_PRO)
#define USE_X86_AMD_EXT_ASM 4
#elif defined(USE_3DNOW_ENH)
#define USE_X86_AMD_EXT_ASM 3
#elif defined(USE_3DNOW)
#define USE_X86_AMD_EXT_ASM 2
#elif defined(USE_MMX_EXT)
#define USE_X86_AMD_EXT_ASM 1
#else
#define USE_X86_AMD_EXT_ASM 0
#endif
/* Conditions under which asm/intrin cannot be used -- add more here if any. */
#if !defined(IX64CPU)
#undef USE_X64_EXT_INTRIN
#define USE_X64_EXT_INTRIN 0
#undef USE_X64_AMD_EXT_INTRIN
#define USE_X64_AMD_EXT_INTRIN 0
#endif

#if !defined(IX86CPU) && !defined(IX64CPU)
#undef USE_X86_EXT_INTRIN
#define USE_X86_EXT_INTRIN 0
#undef USE_X86_AMD_EXT_INTRIN
#define USE_X86_AMD_EXT_INTRIN 0
#endif

/* Always disable inline asm */
#undef USE_X86_EXT_ASM
#define USE_X86_EXT_ASM 0
#undef USE_X86_AMD_EXT_ASM
#define USE_X86_AMD_EXT_ASM 0
#undef USE_X64_EXT_ASM
#define USE_X64_EXT_ASM 0
#undef USE_X64_AMD_EXT_ASM
#define USE_X64_AMD_EXT_ASM 0

#undef SUPPORT_ASM_INTEL
/*****************************************************************************/
/* PowerPC's AltiVec enhancement */

/* (need -faltivec option) */
#define USE_ALTIVEC 0

/*****************************************************************************/
/*****************************************************************************/

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif /* <sys/param.h> */
#ifdef HAVE_SYS_SYSCTL_H
#include <sys/sysctl.h>
#endif /* <sys/sysctl.h> */

#ifdef HAVE_STRING_H
#include <string.h>
#elif defined(HAVE_STRINGS_H)
#include <strings.h>
#endif /* <string.h> */

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
# include <stdbool.h>
#endif
/*****************************************************************************/
#if OPT_MODE == 1 && USE_X86_EXT_ASM > 0

#define _double2fixmagic 68719476736.0 * 1.5
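/* Note on the constant (explanation added here, not in the original):
   68719476736.0 is 2^36, so the magic is 1.5 * 2^36. This is the classic
   double-to-fixed trick: adding 1.5 * 2^(52-n) to an in-range double forces
   its n fractional bits into the low bits of the mantissa, where they can
   be read back as an integer without an fistp/cvttsd2si. Here 52 - 36 = 16,
   i.e. 16.16 fixed point. The consuming code is not shown in this header,
   so the exact shift is inferred from the constant. */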
#if defined(__BORLANDC__) && (__BORLANDC__ >= 1380)
extern int32 imuldiv8(int32 a, int32 b);
extern int32 imuldiv16(int32 a, int32 b);
extern int32 imuldiv24(int32 a, int32 b);
extern int32 imuldiv28(int32 a, int32 b);
#elif defined(SUPPORT_ASM_AT_AND_T) && defined(__ppc__)
static inline int32 imuldiv8(int32 a, int32 b)
{
    register int32 ret, rah, ral, rlh, rll;
    __asm__("mulhw %0,%7,%8\n\t"
            "mullw %1,%7,%8\n\t"
            "rlwinm %2,%0,24,0,7\n\t"
            "rlwinm %3,%1,24,8,31\n\t"
            "or %4,%2,%3"
            :"=r"(rah),"=r"(ral),"=r"(rlh),"=r"(rll),"=r"(ret)
            :"0"(rah),"1"(ral),"r"(a),"r"(b));
    return ret;
}

static inline int32 imuldiv16(int32 a, int32 b)
{
    register int32 ret, rah, ral, rlh, rll;
    __asm__("mulhw %0,%7,%8\n\t"
            "mullw %1,%7,%8\n\t"
            "rlwinm %2,%0,16,0,15\n\t"
            "rlwinm %3,%1,16,16,31\n\t"
            "or %4,%2,%3"
            :"=r"(rah),"=r"(ral),"=r"(rlh),"=r"(rll),"=r"(ret)
            :"0"(rah),"1"(ral),"r"(a),"r"(b));
    return ret;
}

static inline int32 imuldiv24(int32 a, int32 b)
{
    register int32 ret, rah, ral, rlh, rll;
    __asm__("mulhw %0,%7,%8\n\t"
            "mullw %1,%7,%8\n\t"
            "rlwinm %2,%0,8,0,23\n\t"
            "rlwinm %3,%1,8,24,31\n\t"
            "or %4,%2,%3"
            :"=r"(rah),"=r"(ral),"=r"(rlh),"=r"(rll),"=r"(ret)
            :"0"(rah),"1"(ral),"r"(a),"r"(b));
    return ret;
}

static inline int32 imuldiv28(int32 a, int32 b)
{
    register int32 ret, rah, ral, rlh, rll;
    __asm__("mulhw %0,%7,%8\n\t"
            "mullw %1,%7,%8\n\t"
            "rlwinm %2,%0,4,0,27\n\t"
            "rlwinm %3,%1,4,28,31\n\t"
            "or %4,%2,%3"
            :"=r"(rah),"=r"(ral),"=r"(rlh),"=r"(rll),"=r"(ret)
            :"0"(rah),"1"(ral),"r"(a),"r"(b));
    return ret;
}
#elif defined(SUPPORT_ASM_AT_AND_T)
static inline int32 imuldiv8(int32 a, int32 b)
{
    int32 result;
    __asm__("movl %1, %%eax\n\t"
            "movl %2, %%edx\n\t"
            "imull %%edx\n\t"
            "shr $8, %%eax\n\t"
            "shl $24, %%edx\n\t"
            "or %%edx, %%eax\n\t"
            "movl %%eax, %0\n\t"
            : "=g"(result)
            : "g"(a), "g"(b)
            : "eax", "edx");
    return result;
}

static inline int32 imuldiv16(int32 a, int32 b)
{
    int32 result;
    __asm__("movl %1, %%eax\n\t"
            "movl %2, %%edx\n\t"
            "imull %%edx\n\t"
            "shr $16, %%eax\n\t"
            "shl $16, %%edx\n\t"
            "or %%edx, %%eax\n\t"
            "movl %%eax, %0\n\t"
            : "=g"(result)
            : "g"(a), "g"(b)
            : "eax", "edx");
    return result;
}

static inline int32 imuldiv24(int32 a, int32 b)
{
    int32 result;
    __asm__("movl %1, %%eax\n\t"
            "movl %2, %%edx\n\t"
            "imull %%edx\n\t"
            "shr $24, %%eax\n\t"
            "shl $8, %%edx\n\t"
            "or %%edx, %%eax\n\t"
            "movl %%eax, %0\n\t"
            : "=g"(result)
            : "g"(a), "g"(b)
            : "eax", "edx");
    return result;
}

static inline int32 imuldiv28(int32 a, int32 b)
{
    int32 result;
    __asm__("movl %1, %%eax\n\t"
            "movl %2, %%edx\n\t"
            "imull %%edx\n\t"
            "shr $28, %%eax\n\t"
            "shl $4, %%edx\n\t"
            "or %%edx, %%eax\n\t"
            "movl %%eax, %0\n\t"
            : "=g"(result)
            : "g"(a), "g"(b)
            : "eax", "edx");
    return result;
}
#elif defined(SUPPORT_ASM_INTEL)
inline int32 imuldiv8(int32 a, int32 b) {
    __asm {
        mov eax, a
        mov edx, b
        imul edx
        shr eax, 8
        shl edx, 24
        or eax, edx
    } /* result is returned in eax */
}

inline int32 imuldiv16(int32 a, int32 b) {
    __asm {
        mov eax, a
        mov edx, b
        imul edx
        shr eax, 16
        shl edx, 16
        or eax, edx
    }
}

inline int32 imuldiv24(int32 a, int32 b) {
    __asm {
        mov eax, a
        mov edx, b
        imul edx
        shr eax, 24
        shl edx, 8
        or eax, edx
    }
}

inline int32 imuldiv28(int32 a, int32 b) {
    __asm {
        mov eax, a
        mov edx, b
        imul edx
        shr eax, 28
        shl edx, 4
        or eax, edx
    }
}

inline int64 imuldiv24_64bit(int64 a, int64 b) {
    return ((int64)(a) * (int64)(b)) >> 24;
}

inline int64 int64_imuldiv24(int64 a, int64 b)
{
    return ((int64)(a) * (int64)(b)) >> 24;
}
#else
/* Generic version of imuldiv. */
#define imuldiv8(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 8)

#define imuldiv16(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 16)

#define imuldiv24(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 24)

#define imuldiv28(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 28)

#endif /* architectures */

#define ifloor_internal(a, b) \
    ((a) & ~((1L << (b)) - 1))

#define ifloor8(a) \
    ifloor_internal(a, 8)

#define ifloor16(a) \
    ifloor_internal(a, 16)

#define ifloor24(a) \
    ifloor_internal(a, 24)

#define ifloor28(a) \
    ifloor_internal(a, 28)

static inline int32 signlong(int32 a)
{
    return ((a | 0x7fffffff) >> 30);
}

#else
/* Generic version of imuldiv. */
#define imuldiv8(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 8)

#define imuldiv16(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 16)

#define imuldiv24(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 24)

#define imuldiv28(a, b) \
    (int32)(((int64)(a) * (int64)(b)) >> 28)

#endif /* OPT_MODE != 0 */
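/* Worked example (explanatory note, not in the original): imuldiv16() is a
   16.16 fixed-point multiply. With a = 3 << 16 (i.e. 3.0) and b = 0x8000
   (i.e. 0.5):

       (int64)a * b == 3 * 2^31, and >> 16 gives 0x18000 (1.5).

   ifloor16(0x18000) masks off the fraction, yielding 0x10000 (1.0).
   signlong(x) returns +1 for x >= 0 and -1 for x < 0 (an arithmetic shift
   replicates the sign bit). */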
/*****************************************************************************/
#if (USE_X86_EXT_ASM || USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_ASM || USE_X86_AMD_EXT_INTRIN)

#if (USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_INTRIN)
#if defined(__GNUC__)
#include <x86intrin.h>
#elif (_MSC_VER >= 1600) // VC2010(VC10)
#include <intrin.h>
#else // VC2003(VC7) VC2005(VC8) VC2008(VC9)
#include <emmintrin.h>
#if defined(USE_X86_AMD_EXT_INTRIN) && (USE_X86_AMD_EXT_INTRIN >= 2)
#include <mm3dnow.h>
#endif
#endif
#endif
#if defined(__GNUC__)
#if ((USE_X86_EXT_ASM >= 10) || (USE_X86_EXT_INTRIN >= 10)) // AVX512 64byte
#define ALIGN_SIZE 64
#define ALIGN __attribute__((aligned(ALIGN_SIZE)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN8 __attribute__((aligned(8)))
#elif ((USE_X86_EXT_ASM >= 8) || (USE_X86_EXT_INTRIN >= 8)) // AVX 32byte
#define ALIGN_SIZE 32
#define ALIGN __attribute__((aligned(ALIGN_SIZE)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN8 __attribute__((aligned(8)))
#elif ((USE_X86_EXT_ASM >= 2) || (USE_X86_EXT_INTRIN >= 2)) // SSE 16byte // AMD??
#define ALIGN_SIZE 16
#define ALIGN __attribute__((aligned(ALIGN_SIZE)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN8 __attribute__((aligned(8)))
#elif ((USE_X86_EXT_ASM >= 1) || (USE_X86_EXT_INTRIN >= 1)) // MMX 8byte // AMD??
#define ALIGN_SIZE 8
#define ALIGN __attribute__((aligned(ALIGN_SIZE)))
#define ALIGN32 __attribute__((aligned(32)))
#define ALIGN16 __attribute__((aligned(16)))
#define ALIGN8 __attribute__((aligned(8)))
#endif

#elif defined(_MSC_VER) || defined(MSC_VER)

#if ((USE_X86_EXT_ASM >= 10) || (USE_X86_EXT_INTRIN >= 10)) // AVX512 64byte
#define ALIGN_SIZE 64
#define ALIGN _declspec(align(ALIGN_SIZE))
#define ALIGN32 _declspec(align(32))
#define ALIGN16 _declspec(align(16))
#define ALIGN8 _declspec(align(8))
#elif ((USE_X86_EXT_ASM >= 8) || (USE_X86_EXT_INTRIN >= 8)) // AVX 32byte
#define ALIGN_SIZE 32
#define ALIGN _declspec(align(ALIGN_SIZE))
#define ALIGN32 _declspec(align(32))
#define ALIGN16 _declspec(align(16))
#define ALIGN8 _declspec(align(8))
#elif ((USE_X86_EXT_ASM >= 2) || (USE_X86_EXT_INTRIN >= 2)) // SSE 16byte // AMD??
#define ALIGN_SIZE 16
#define ALIGN _declspec(align(ALIGN_SIZE))
#define ALIGN32 _declspec(align(32))
#define ALIGN16 _declspec(align(16))
#define ALIGN8 _declspec(align(8))
#elif ((USE_X86_EXT_ASM >= 1) || (USE_X86_EXT_INTRIN >= 1)) // MMX 8byte // AMD??
#define ALIGN_SIZE 8
#define ALIGN _declspec(align(ALIGN_SIZE))
#define ALIGN32 _declspec(align(32))
#define ALIGN16 _declspec(align(16))
#define ALIGN8 _declspec(align(8))
#endif

#endif /* __GNUC__, MSC_VER */
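/* Example: declaring storage aligned for the widest enabled vector width
   (the buffer names are illustrative only):

       static ALIGN double mix_buf[256];  // aligned to ALIGN_SIZE bytes
       static ALIGN16 int32 tmp[4];       // explicitly 16-byte aligned
*/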
/*
The FMA macros below fall back to MADD when the CPU has no FMA support
(the precision difference from the extra rounding step is not taken into
account; that was a mistake...)
FMA(vec_a, vec_b, vec_c) : vec_a * vec_b + vec_c
FMA2(vec_a, vec_b, vec_c, vec_d) : vec_a * vec_b + vec_c * vec_d
LS_FMA(ptr, vec_a, vec_b) : store(ptr, load(ptr) + vec_a * vec_b) // *ptr += vec_a * vec_b
LS_ADD(ptr, vec_a) : store(ptr, load(ptr) + vec_a) // *ptr += vec_a
LS_MUL(ptr, vec_a) : store(ptr, load(ptr) * vec_a) // *ptr *= vec_a
LSU : unaligned variants (use loadu/storeu)
*/
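/* Usage sketch (illustrative; assumes USE_X86_EXT_INTRIN >= 3, i.e. SSE2,
   and mix_gain is a hypothetical helper, not part of this header):
   accumulate a gain-scaled source into a destination, two doubles at a
   time, via the unaligned load/store variant.

       static void mix_gain(double *dst, const double *src, int32 n, double g)
       {
           __m128d vg = _mm_set1_pd(g);
           int32 i;
           for (i = 0; i + 2 <= n; i += 2)
               MM_LSU_FMA_PD(&dst[i], _mm_loadu_pd(&src[i]), vg);
           for (; i < n; i++)
               dst[i] += src[i] * g;   // scalar tail
       }
*/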
#if (USE_X86_EXT_INTRIN >= 10)
#define MM512_FMA_PD(vec_a, vec_b, vec_c) _mm512_fmadd_pd(vec_a, vec_b, vec_c)
#define MM512_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm512_fmadd_pd(vec_a, vec_b, _mm512_mul_pd(vec_c, vec_d))
#define MM512_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm512_fmadd_pd(v20, v21, _mm512_fmadd_pd(v10, v11, _mm512_mul_pd(v00, v01)))
#define MM512_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm512_add_pd(\
    _mm512_fmadd_pd(v30, v31, _mm512_mul_pd(v20, v21)), _mm512_fmadd_pd(v10, v11, _mm512_mul_pd(v00, v01)) )
#define MM512_LS_FMA_PD(ptr, vec_a, vec_b) _mm512_store_pd(ptr, _mm512_fmadd_pd(vec_a, vec_b, _mm512_load_pd(ptr)))
#define MM512_LSU_FMA_PD(ptr, vec_a, vec_b) _mm512_storeu_pd(ptr, _mm512_fmadd_pd(vec_a, vec_b, _mm512_loadu_pd(ptr)))
#define MM512_MSUB_PD(vec_a, vec_b, vec_c) _mm512_fmsub_pd(vec_a, vec_b, vec_c)
#define MM512_FMA_PS(vec_a, vec_b, vec_c) _mm512_fmadd_ps(vec_a, vec_b, vec_c)
#define MM512_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm512_fmadd_ps(vec_a, vec_b, _mm512_mul_ps(vec_c, vec_d))
#define MM512_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm512_fmadd_ps(v20, v21, _mm512_fmadd_ps(v10, v11, _mm512_mul_ps(v00, v01)))
#define MM512_LS_FMA_PS(ptr, vec_a, vec_b) _mm512_store_ps(ptr, _mm512_fmadd_ps(vec_a, vec_b, _mm512_load_ps(ptr)))
#define MM512_LSU_FMA_PS(ptr, vec_a, vec_b) _mm512_storeu_ps(ptr, _mm512_fmadd_ps(vec_a, vec_b, _mm512_loadu_ps(ptr)))
#define MM512_MSUB_PS(vec_a, vec_b, vec_c) _mm512_fmsub_ps(vec_a, vec_b, vec_c)
#define MM512_LS_ADD_PD(ptr, vec_a) _mm512_store_pd(ptr, _mm512_add_pd(_mm512_load_pd(ptr), vec_a))
#define MM512_LSU_ADD_PD(ptr, vec_a) _mm512_storeu_pd(ptr, _mm512_add_pd(_mm512_loadu_pd(ptr), vec_a))
#define MM512_LS_MUL_PD(ptr, vec_a) _mm512_store_pd(ptr, _mm512_mul_pd(_mm512_load_pd(ptr), vec_a))
#define MM512_LSU_MUL_PD(ptr, vec_a) _mm512_storeu_pd(ptr, _mm512_mul_pd(_mm512_loadu_pd(ptr), vec_a))
#define MM512_LS_ADD_PS(ptr, vec_a) _mm512_store_ps(ptr, _mm512_add_ps(_mm512_load_ps(ptr), vec_a))
#define MM512_LSU_ADD_PS(ptr, vec_a) _mm512_storeu_ps(ptr, _mm512_add_ps(_mm512_loadu_ps(ptr), vec_a))
#define MM512_LS_MUL_PS(ptr, vec_a) _mm512_store_ps(ptr, _mm512_mul_ps(_mm512_load_ps(ptr), vec_a))
#define MM512_LSU_MUL_PS(ptr, vec_a) _mm512_storeu_ps(ptr, _mm512_mul_ps(_mm512_loadu_ps(ptr), vec_a))
#endif // (USE_X86_EXT_INTRIN >= 10)
#if (USE_X86_EXT_INTRIN >= 9)
#define MM256_SET2X_SI256(vec_a, vec_b) \
    _mm256_inserti128_si256(_mm256_castsi128_si256(vec_a), vec_b, 0x1)
#endif
#if (USE_X86_EXT_INTRIN >= 8)
#if (USE_X86_EXT_INTRIN >= 9)
#define MM256_FMA_PD(vec_a, vec_b, vec_c) _mm256_fmadd_pd(vec_a, vec_b, vec_c)
#define MM256_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm256_fmadd_pd(vec_a, vec_b, _mm256_mul_pd(vec_c, vec_d))
#define MM256_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm256_fmadd_pd(v20, v21, _mm256_fmadd_pd(v10, v11, _mm256_mul_pd(v00, v01)))
#define MM256_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm256_add_pd(\
    _mm256_fmadd_pd(v30, v31, _mm256_mul_pd(v20, v21)), _mm256_fmadd_pd(v10, v11, _mm256_mul_pd(v00, v01)) )
#define MM256_LS_FMA_PD(ptr, vec_a, vec_b) _mm256_store_pd(ptr, _mm256_fmadd_pd(vec_a, vec_b, _mm256_load_pd(ptr)))
#define MM256_LSU_FMA_PD(ptr, vec_a, vec_b) _mm256_storeu_pd(ptr, _mm256_fmadd_pd(vec_a, vec_b, _mm256_loadu_pd(ptr)))
#define MM256_MSUB_PD(vec_a, vec_b, vec_c) _mm256_fmsub_pd(vec_a, vec_b, vec_c)
#define MM256_FMA_PS(vec_a, vec_b, vec_c) _mm256_fmadd_ps(vec_a, vec_b, vec_c)
#define MM256_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm256_fmadd_ps(vec_a, vec_b, _mm256_mul_ps(vec_c, vec_d))
#define MM256_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm256_fmadd_ps(v20, v21, _mm256_fmadd_ps(v10, v11, _mm256_mul_ps(v00, v01)))
#define MM256_LS_FMA_PS(ptr, vec_a, vec_b) _mm256_store_ps(ptr, _mm256_fmadd_ps(vec_a, vec_b, _mm256_load_ps(ptr)))
#define MM256_LSU_FMA_PS(ptr, vec_a, vec_b) _mm256_storeu_ps(ptr, _mm256_fmadd_ps(vec_a, vec_b, _mm256_loadu_ps(ptr)))
#define MM256_MSUB_PS(vec_a, vec_b, vec_c) _mm256_fmsub_ps(vec_a, vec_b, vec_c)
#else // ! (USE_X86_EXT_INTRIN >= 9)
#define MM256_FMA_PD(vec_a, vec_b, vec_c) _mm256_add_pd(_mm256_mul_pd(vec_a, vec_b), vec_c)
#define MM256_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm256_add_pd(_mm256_mul_pd(vec_a, vec_b), _mm256_mul_pd(vec_c, vec_d))
#define MM256_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm256_add_pd(\
    _mm256_add_pd(_mm256_mul_pd(v00, v01),_mm256_mul_pd(v10, v11)), _mm256_mul_pd(v20, v21))
#define MM256_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm256_add_pd(\
    _mm256_add_pd(_mm256_mul_pd(v00, v01),_mm256_mul_pd(v10, v11)), _mm256_add_pd(_mm256_mul_pd(v20, v21),_mm256_mul_pd(v30, v31)))
#define MM256_LS_FMA_PD(ptr, vec_a, vec_b) _mm256_store_pd(ptr, _mm256_add_pd(_mm256_load_pd(ptr), _mm256_mul_pd(vec_a, vec_b)))
#define MM256_LSU_FMA_PD(ptr, vec_a, vec_b) _mm256_storeu_pd(ptr, _mm256_add_pd(_mm256_loadu_pd(ptr), _mm256_mul_pd(vec_a, vec_b)))
#define MM256_MSUB_PD(vec_a, vec_b, vec_c) _mm256_sub_pd(_mm256_mul_pd(vec_a, vec_b), vec_c)
#define MM256_FMA_PS(vec_a, vec_b, vec_c) _mm256_add_ps(_mm256_mul_ps(vec_a, vec_b), vec_c)
#define MM256_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm256_add_ps(_mm256_mul_ps(vec_a, vec_b), _mm256_mul_ps(vec_c, vec_d))
#define MM256_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm256_add_ps(\
    _mm256_add_ps(_mm256_mul_ps(v00, v01),_mm256_mul_ps(v10, v11)), _mm256_mul_ps(v20, v21))
#define MM256_LS_FMA_PS(ptr, vec_a, vec_b) _mm256_store_ps(ptr, _mm256_add_ps(_mm256_load_ps(ptr), _mm256_mul_ps(vec_a, vec_b)))
#define MM256_LSU_FMA_PS(ptr, vec_a, vec_b) _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_loadu_ps(ptr), _mm256_mul_ps(vec_a, vec_b)))
#define MM256_MSUB_PS(vec_a, vec_b, vec_c) _mm256_sub_ps(_mm256_mul_ps(vec_a, vec_b), vec_c)
#endif // (USE_X86_EXT_INTRIN >= 9)
#define MM256_LS_ADD_PD(ptr, vec_a) _mm256_store_pd(ptr, _mm256_add_pd(_mm256_load_pd(ptr), vec_a))
#define MM256_LSU_ADD_PD(ptr, vec_a) _mm256_storeu_pd(ptr, _mm256_add_pd(_mm256_loadu_pd(ptr), vec_a))
#define MM256_LS_MUL_PD(ptr, vec_a) _mm256_store_pd(ptr, _mm256_mul_pd(_mm256_load_pd(ptr), vec_a))
#define MM256_LSU_MUL_PD(ptr, vec_a) _mm256_storeu_pd(ptr, _mm256_mul_pd(_mm256_loadu_pd(ptr), vec_a))
#define MM256_LS_ADD_PS(ptr, vec_a) _mm256_store_ps(ptr, _mm256_add_ps(_mm256_load_ps(ptr), vec_a))
#define MM256_LSU_ADD_PS(ptr, vec_a) _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_loadu_ps(ptr), vec_a))
#define MM256_LS_MUL_PS(ptr, vec_a) _mm256_store_ps(ptr, _mm256_mul_ps(_mm256_load_ps(ptr), vec_a))
#define MM256_LSU_MUL_PS(ptr, vec_a) _mm256_storeu_ps(ptr, _mm256_mul_ps(_mm256_loadu_ps(ptr), vec_a))
#define MM256_SET2X_PS(vec_a, vec_b) \
    _mm256_insertf128_ps(_mm256_castps128_ps256(vec_a), vec_b, 0x1)
#define MM256_SET2X_PD(vec_a, vec_b) \
    _mm256_insertf128_pd(_mm256_castpd128_pd256(vec_a), vec_b, 0x1)
#endif // (USE_X86_EXT_INTRIN >= 8)
#if (USE_X86_EXT_INTRIN >= 3)
#if (USE_X86_EXT_INTRIN >= 9)
#define MM_FMA_PD(vec_a, vec_b, vec_c) _mm_fmadd_pd(vec_a, vec_b, vec_c)
#define MM_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm_fmadd_pd(vec_a, vec_b, _mm_mul_pd(vec_c, vec_d))
#define MM_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm_fmadd_pd(v20, v21, _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
#define MM_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_pd(\
    _mm_fmadd_pd(v30, v31, _mm_mul_pd(v20, v21)), _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
#define MM_FMA5_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_pd(_mm_fmadd_pd(v40, v41, \
    _mm_fmadd_pd(v30, v31, _mm_mul_pd(v20, v21))), _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01)) )
#define MM_FMA6_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_pd(\
    _mm_fmadd_pd(v50, v51, _mm_fmadd_pd(v40, v41, _mm_mul_pd(v30, v31))), \
    _mm_fmadd_pd(v20, v21, _mm_fmadd_pd(v10, v11, _mm_mul_pd(v00, v01))) )
#define MM_MSUB_PD(vec_a, vec_b, vec_c) _mm_fmsub_pd(vec_a, vec_b, vec_c)
#define MM_LS_FMA_PD(ptr, vec_a, vec_b) _mm_store_pd(ptr, _mm_fmadd_pd(vec_a, vec_b, _mm_load_pd(ptr)))
#define MM_LSU_FMA_PD(ptr, vec_a, vec_b) _mm_storeu_pd(ptr, _mm_fmadd_pd(vec_a, vec_b, _mm_loadu_pd(ptr)))
#else // !(USE_X86_EXT_INTRIN >= 9)
#define MM_FMA_PD(vec_a, vec_b, vec_c) _mm_add_pd(_mm_mul_pd(vec_a, vec_b), vec_c)
#define MM_FMA2_PD(vec_a, vec_b, vec_c, vec_d) _mm_add_pd(_mm_mul_pd(vec_a, vec_b), _mm_mul_pd(vec_c, vec_d))
#define MM_FMA3_PD(v00, v01, v10, v11, v20, v21) _mm_add_pd(\
    _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_mul_pd(v20, v21) )
#define MM_FMA4_PD(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_pd(\
    _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))
#define MM_FMA5_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_pd(_mm_add_pd(\
    _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))\
    , _mm_mul_pd(v40, v41))
#define MM_FMA6_PD(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_pd(_mm_add_pd(\
    _mm_add_pd(_mm_mul_pd(v00, v01),_mm_mul_pd(v10, v11)), _mm_add_pd(_mm_mul_pd(v20, v21),_mm_mul_pd(v30, v31)))\
    , _mm_add_pd(_mm_mul_pd(v40, v41),_mm_mul_pd(v50, v51)))
#define MM_LS_FMA_PD(ptr, vec_a, vec_b) _mm_store_pd(ptr, _mm_add_pd(_mm_load_pd(ptr), _mm_mul_pd(vec_a, vec_b)))
#define MM_LSU_FMA_PD(ptr, vec_a, vec_b) _mm_storeu_pd(ptr, _mm_add_pd(_mm_loadu_pd(ptr), _mm_mul_pd(vec_a, vec_b)))
#define MM_MSUB_PD(vec_a, vec_b, vec_c) _mm_sub_pd(_mm_mul_pd(vec_a, vec_b), vec_c)
#endif // (USE_X86_EXT_INTRIN >= 9)

#define MM_LS_ADD_PD(ptr, vec_a) _mm_store_pd(ptr, _mm_add_pd(_mm_load_pd(ptr), vec_a))
#define MM_LSU_ADD_PD(ptr, vec_a) _mm_storeu_pd(ptr, _mm_add_pd(_mm_loadu_pd(ptr), vec_a))
#define MM_LS_MUL_PD(ptr, vec_a) _mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), vec_a))
#define MM_LSU_MUL_PD(ptr, vec_a) _mm_storeu_pd(ptr, _mm_mul_pd(_mm_loadu_pd(ptr), vec_a))

#if 0//(USE_X86_EXT_INTRIN >= 4) // sse3
#define MM_LOAD1_PD(ptr) _mm_loaddup_pd(ptr) // slow!
#else // !(USE_X86_EXT_INTRIN >= 4)
#define MM_LOAD1_PD(ptr) _mm_load1_pd(ptr)
#endif // (USE_X86_EXT_INTRIN >= 4)

#if (USE_X86_EXT_INTRIN >= 6) // sse4.1
#define MM_EXTRACT_EPI32(vec,num) _mm_extract_epi32(vec,num) // num:0~3
#else // ! (USE_X86_EXT_INTRIN >= 6)
#define MM_EXTRACT_EPI32(vec,num) _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, num)) // num:0~3
#endif // (USE_X86_EXT_INTRIN >= 6)

#if (USE_X86_EXT_INTRIN >= 6) // sse4.1
#define MM_BLENDV_PS(va, vb, vmask) _mm_blendv_ps(va, vb, vmask)
#define MM_BLENDV_PD(va, vb, vmask) _mm_blendv_pd(va, vb, vmask)
#else
// Every element of vmask must be either 0 or -1!
#define MM_BLENDV_PS(va, vb, vmask) _mm_or_ps(_mm_andnot_ps(vmask, va), _mm_and_ps(vmask, vb))
#define MM_BLENDV_PD(va, vb, vmask) _mm_or_pd(_mm_andnot_pd(vmask, va), _mm_and_pd(vmask, vb))
#endif

#endif // (USE_X86_EXT_INTRIN >= 3)
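/* Example (illustrative): clamp negative doubles to zero with MM_BLENDV_PD.
   The mask from _mm_cmplt_pd() is all-ones or all-zeros per element, which
   satisfies the requirement of the pre-SSE4.1 fallback above.

       __m128d v    = _mm_set_pd(-1.0, 2.0);
       __m128d zero = _mm_setzero_pd();
       __m128d mask = _mm_cmplt_pd(v, zero);        // v < 0 ? ~0 : 0
       __m128d r    = MM_BLENDV_PD(v, zero, mask);  // -1.0 lane -> 0.0, 2.0 kept
*/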
#if (USE_X86_EXT_INTRIN >= 2)
#if (USE_X86_EXT_INTRIN >= 9)
#define MM_FMA_PS(vec_a, vec_b, vec_c) _mm_fmadd_ps(vec_a, vec_b, vec_c)
#define MM_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm_fmadd_ps(vec_a, vec_b, _mm_mul_ps(vec_c, vec_d))
#define MM_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm_fmadd_ps(v20, v21, _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)))
#define MM_FMA4_PS(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_ps(\
    _mm_fmadd_ps(v30, v31, _mm_mul_ps(v20, v21)), _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)) )
#define MM_FMA5_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_ps(_mm_fmadd_ps(v40, v41, \
    _mm_fmadd_ps(v30, v31, _mm_mul_ps(v20, v21))), _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01)) )
#define MM_FMA6_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_ps(\
    _mm_fmadd_ps(v50, v51, _mm_fmadd_ps(v40, v41, _mm_mul_ps(v30, v31))), \
    _mm_fmadd_ps(v20, v21, _mm_fmadd_ps(v10, v11, _mm_mul_ps(v00, v01))) )
#define MM_LS_FMA_PS(ptr, vec_a, vec_b) _mm_store_ps(ptr, _mm_fmadd_ps(vec_a, vec_b, _mm_load_ps(ptr)))
#define MM_LSU_FMA_PS(ptr, vec_a, vec_b) _mm_storeu_ps(ptr, _mm_fmadd_ps(vec_a, vec_b, _mm_loadu_ps(ptr)))
#define MM_MSUB_PS(vec_a, vec_b, vec_c) _mm_fmsub_ps(vec_a, vec_b, vec_c)
#else // !(USE_X86_EXT_INTRIN >= 9)
#define MM_FMA_PS(vec_a, vec_b, vec_c) _mm_add_ps(_mm_mul_ps(vec_a, vec_b), vec_c)
#define MM_FMA2_PS(vec_a, vec_b, vec_c, vec_d) _mm_add_ps(_mm_mul_ps(vec_a, vec_b), _mm_mul_ps(vec_c, vec_d))
#define MM_FMA3_PS(v00, v01, v10, v11, v20, v21) _mm_add_ps(\
    _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_mul_ps(v20, v21))
#define MM_FMA4_PS(v00, v01, v10, v11, v20, v21, v30, v31) _mm_add_ps(\
    _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))
#define MM_FMA5_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41) _mm_add_ps(_mm_add_ps(\
    _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))\
    , _mm_mul_ps(v40, v41))
#define MM_FMA6_PS(v00, v01, v10, v11, v20, v21, v30, v31, v40, v41, v50, v51) _mm_add_ps(_mm_add_ps(\
    _mm_add_ps(_mm_mul_ps(v00, v01),_mm_mul_ps(v10, v11)), _mm_add_ps(_mm_mul_ps(v20, v21),_mm_mul_ps(v30, v31)))\
    , _mm_add_ps(_mm_mul_ps(v40, v41),_mm_mul_ps(v50, v51)))
#define MM_LS_FMA_PS(ptr, vec_a, vec_b) _mm_store_ps(ptr, _mm_add_ps(_mm_load_ps(ptr), _mm_mul_ps(vec_a, vec_b)))
#define MM_LSU_FMA_PS(ptr, vec_a, vec_b) _mm_storeu_ps(ptr, _mm_add_ps(_mm_loadu_ps(ptr), _mm_mul_ps(vec_a, vec_b)))
#define MM_MSUB_PS(vec_a, vec_b, vec_c) _mm_sub_ps(_mm_mul_ps(vec_a, vec_b), vec_c)
#endif // (USE_X86_EXT_INTRIN >= 9)

#define MM_LS_ADD_PS(ptr, vec_a) _mm_store_ps(ptr, _mm_add_ps(_mm_load_ps(ptr), vec_a))
#define MM_LSU_ADD_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_add_ps(_mm_loadu_ps(ptr), vec_a))
#define MM_LS_MUL_PS(ptr, vec_a) _mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), vec_a))
#define MM_LSU_MUL_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_mul_ps(_mm_loadu_ps(ptr), vec_a))
#endif // (USE_X86_EXT_INTRIN >= 2)
#if (USE_X86_EXT_INTRIN >= 1)
#if !defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
#define MM_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_shuffle_ps(reg,reg,idx))
#define MM_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_shuffle_pd(reg,reg,idx))
#define MM_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(reg,idx))
#if (USE_X86_EXT_INTRIN >= 9)
#define MM256_EXTRACT_F32(reg,idx) _mm256_cvtss_f32(_mm256_permutevar8x32_ps(reg,idx))
#define MM256_EXTRACT_F64(reg,idx) _mm256_cvtsd_f64(_mm256_permute4x64_pd(reg,idx))
#else
#define MM256_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_permute_ps(_mm256_extractf128_ps(reg, idx >= 4), idx % 4))
#define MM256_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_permute_pd(_mm256_extractf128_pd(reg, idx >= 2), idx % 2))
#endif
#define MM256_EXTRACT_I32(reg,idx) _mm256_extract_epi32(reg,idx)
#define MM512_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_permute_ps(_mm512_extractf32x4_ps(reg, idx >> 2), idx & 3))
#define MM512_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_permute_pd(_mm512_extractf64x2_pd(reg, idx >> 1), idx & 1))
#define MM512_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(_mm512_extracti32x4_epi32(reg, idx >> 2), idx & 3))
#else // _MSC_VER
#define MM_EXTRACT_F32(reg,idx) reg.m128_f32[idx]
#define MM_EXTRACT_F64(reg,idx) reg.m128d_f64[idx]
#define MM_EXTRACT_I32(reg,idx) reg.m128i_i32[idx]
#define MM256_EXTRACT_F32(reg,idx) reg.m256_f32[idx]
#define MM256_EXTRACT_F64(reg,idx) reg.m256d_f64[idx]
#define MM256_EXTRACT_I32(reg,idx) reg.m256i_i32[idx]
#define MM512_EXTRACT_F32(reg,idx) reg.m512_f32[idx]
#define MM512_EXTRACT_F64(reg,idx) reg.m512d_f64[idx]
#define MM512_EXTRACT_I32(reg,idx) reg.m512i_i32[idx]
#endif // _MSC_VER
#endif // (USE_X86_EXT_INTRIN >= 1)
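/* Example (illustrative): horizontal sum of a __m128 via MM_EXTRACT_F32.
   idx must be a compile-time constant lane index (0..3); both the shuffle
   form and the MSVC member-access form above accept it.

       __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
       float sum = MM_EXTRACT_F32(v, 0) + MM_EXTRACT_F32(v, 1)
                 + MM_EXTRACT_F32(v, 2) + MM_EXTRACT_F32(v, 3);  // 10.0f
*/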
#define IS_ALIGN(ptr) (!((int32)ptr & (ALIGN_SIZE - 1)))

extern int is_x86ext_available(void);

#endif /* USE_X86_EXT_* */

#define ALIGNED_MALLOC(size) malloc(size)
#define ALIGNED_FREE(ptr) free(ptr)

#ifndef aligned_malloc
#define aligned_malloc(size_byte, align_size) malloc(size_byte)
#endif
#ifndef aligned_free
#define aligned_free(ptr) free(ptr)
#endif
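/* Usage sketch (illustrative; buf is a hypothetical name): allocate a SIMD
   work buffer and check its alignment. With the plain-malloc fallbacks
   above, IS_ALIGN() may be false, so callers should fall back to the
   *_LSU_* (loadu/storeu) macro variants in that case.

       double *buf = (double *)ALIGNED_MALLOC(sizeof(double) * 1024);
       int use_aligned = IS_ALIGN(buf);   // 0 -> use the LSU variants
       ...
       ALIGNED_FREE(buf);
*/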
/*****************************************************************************/
#if USE_ALTIVEC

#ifndef __bool_true_false_are_defined
typedef enum { false = 0, true = 1 } bool;
#endif /* C99 Hack */

typedef vector signed int vint32;
typedef vector signed char vint8;

void v_memset(void *dest, int c, size_t len);
void v_memzero(void *dest, size_t len);
void v_set_dry_signal(void *dest, const int32 *buf, int32 n);

/* inline functions */
extern inline bool is_altivec_available(void)
{
    int sel[2] = { CTL_HW, HW_VECTORUNIT };
    int has_altivec = false;
    size_t len = sizeof(has_altivec);
    int error = sysctl(sel, 2, &has_altivec, &len, NULL, 0);
    if (error)
        return false;
    return (bool)!!has_altivec;
}

extern inline void libc_memset(void *destp, int c, size_t len)
{
    memset(destp, c, len);
}

static inline void *switch_memset(void *destp, int c, size_t len)
{
    void *keepdestp = destp;
    if (!is_altivec_available()) {
        libc_memset(destp, c, len);
    } else if (c != 0) {
        v_memset(destp, c, len);
    } else {
        v_memzero(destp, len);
    }
    return keepdestp;
}

#define memset switch_memset
#endif /* altivec */
#pragma clang diagnostic pop

#endif /* OPTCODE_H_INCLUDED */