Rewrite intrinsics for GCC compatibility
author     Starg <starg@users.osdn.me>
           Sun, 11 Mar 2018 08:09:21 +0000 (17:09 +0900)
committer  Starg <starg@users.osdn.me>
           Sun, 11 Mar 2018 08:09:21 +0000 (17:09 +0900)
Patch from <https://jbbs.shitaraba.net/bbs/read.cgi/computer/42137/1499269478/116>
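
The change is mechanical: MSVC exposes the lanes of SSE vector types as union members (.m128_f32[i], .m128d_f64[i], .m128i_i32[i]), but GCC's __m128/__m128d/__m128i types have no such members. The patch funnels every lane read through the MM_EXTRACT_* macros added to timidity/optcode.h below, which reduce to a shuffle plus a scalar-move intrinsic that both compilers accept. A minimal sketch of the technique:

    /* Shuffle the wanted lane into position 0, then read it with the
     * scalar-move intrinsics available on any SSE2 compiler.
     * idx must be a compile-time constant (it becomes a shuffle immediate). */
    #include <emmintrin.h>

    #define MM_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_shuffle_ps(reg,reg,idx))
    #define MM_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_shuffle_pd(reg,reg,idx))
    #define MM_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(reg,idx))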

timidity/effect.c
timidity/int_synth.c
timidity/optcode.c
timidity/optcode.h
timidity/output.c
timidity/playmidi.c
timidity/resample.c
timidity/sysdep.h
timidity/thread_mix.c
timidity/voice_effect.c

timidity/effect.c
index a4f3974..41080b7 100644
@@ -7029,8 +7029,8 @@ static void do_reverb_ex_mod_chST(DATA_T *buf, int32 count, InfoReverbEX *info)
                _mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)       
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
@@ -7046,10 +7046,10 @@ static void do_reverb_ex_mod_chST(DATA_T *buf, int32 count, InfoReverbEX *info)
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -7152,8 +7152,8 @@ static void do_reverb_ex_mod_chMS(DATA_T *buf, int32 count, InfoReverbEX *info)
                _mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)       
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)     
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
@@ -7169,10 +7169,10 @@ static void do_reverb_ex_mod_chMS(DATA_T *buf, int32 count, InfoReverbEX *info)
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -7279,8 +7279,8 @@ static void do_reverb_ex_mod_chST_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                _mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)       
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)     
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
@@ -7296,10 +7296,10 @@ static void do_reverb_ex_mod_chST_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -7369,8 +7369,8 @@ static void do_reverb_ex_mod_chST_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                _mm_storeu_pd(&info->acount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->aphase[i][REV_EX_ER_L1])); // count+phase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->aphase[i][REV_EX_RV_L1])); // count+phase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(count)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(count)
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(count)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(count)
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_ER_L1]), vd[0]); // depth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_RV_L1]), vd[1]); // depth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vai, _mm_loadu_pd(&info->adelay[i][REV_EX_ER_L1])), vd[0]); // index-delay-depth
@@ -7386,10 +7386,10 @@ static void do_reverb_ex_mod_chST_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -7467,8 +7467,8 @@ static void do_reverb_ex_mod_chMS_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                _mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)       
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)     
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
@@ -7484,10 +7484,10 @@ static void do_reverb_ex_mod_chMS_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -7556,8 +7556,8 @@ static void do_reverb_ex_mod_chMS_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                _mm_storeu_pd(&info->acount[i][REV_EX_RV_L1], vc[1]);
                vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->aphase[i][REV_EX_ER_L1])); // count+phase
                vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->aphase[i][REV_EX_RV_L1])); // count+phase
-               vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(count)
-               vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(count)
+               vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(count)
+               vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(count)
                vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_ER_L1]), vd[0]); // depth* sine
                vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_RV_L1]), vd[1]); // depth* sine
                vfp[0] = _mm_sub_pd(_mm_sub_pd(vai, _mm_loadu_pd(&info->adelay[i][REV_EX_ER_L1])), vd[0]); // index-delay-depth
@@ -7573,10 +7573,10 @@ static void do_reverb_ex_mod_chMS_ap8(DATA_T *buf, int32 count, InfoReverbEX *in
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
-               vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
-               vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
-               vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+               vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
                vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
                vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
@@ -11828,10 +11828,10 @@ static inline void do_pitch_shifter_core(DATA_T *buf, InfoPitchShifter_core *inf
                vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
                vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
 #endif
-               vin[0] = _mm_loadu_pd(&info->ptr[vindex[0].m128i_i32[0]]); // v1v2
-               vin[1] = _mm_loadu_pd(&info->ptr[vindex[0].m128i_i32[1]]); // v1v2
-               vin[2] = _mm_loadu_pd(&info->ptr[vindex[1].m128i_i32[0]]); // v1v2
-               vin[3] = _mm_loadu_pd(&info->ptr[vindex[1].m128i_i32[1]]); // v1v2
+               vin[0] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+               vin[1] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+               vin[2] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+               vin[3] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[1],1)]); // v1v2
                vv1[0] = _mm_shuffle_pd(vin[0], vin[1], 0x0); // v1v1
                vv1[1] = _mm_shuffle_pd(vin[2], vin[3], 0x0); // v1v1
                vv2[0] = _mm_shuffle_pd(vin[0], vin[1], 0x3); // v2v2
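
Every effect.c hunk above is the same fractional-delay read: an integer lane is extracted from vindex to address the delay line, _mm_loadu_pd fetches the adjacent sample pair, the shuffles regroup the pairs into an all-v1 and an all-v2 vector, and the final FMA computes v1 + (v2 - v1) * fp. A scalar sketch of what one lane does (function and variable names are illustrative, not from the source):

    /* One lane of the vectorized fractional-delay read, in scalar form. */
    static double frac_read(const double *dbuf, double pos)
    {
        int idx = (int)pos;          /* integer part    -> a vindex lane */
        double fp = pos - idx;       /* fractional part -> a vfp lane */
        double v1 = dbuf[idx];       /* pair loaded by _mm_loadu_pd */
        double v2 = dbuf[idx + 1];
        return v1 + (v2 - v1) * fp;  /* linear interpolation */
    }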
timidity/int_synth.c
index f28d2da..1a3e309 100644
@@ -262,36 +262,19 @@ static inline void is_resample_core(Info_Resample *rs, DATA_T *is_buf, IS_RS_DAT
        vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
        vofsf = _mm_and_si128(vofs, vfmask);
        vfp = _mm_mul_ps(_mm_cvtepi32_ps(vofsf), vec_divf); // int32 to float // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-       ofsp1 = (int32 *)vofsi;
-#if defined(IS_RS_DATA_T_DOUBLE)
-       tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[0]])); // load ofsi and ofsi+1
-       tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[1]])); // same for the next sample
-       tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[2]])); // same for the next sample
-       tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[3]])); // same for the next sample
-       tmp1 = _mm_shuffle_ps(tmp1, tmp2, 0x44);
-       tmp3 = _mm_shuffle_ps(tmp3, tmp4, 0x44);
-#else // defined(IS_RS_DATA_T_FLOAT)
-       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[ofsp1[0]]); // L64bit: load ofsi and ofsi+1
-       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[ofsp1[1]]); // H64bit: same for the next sample
-       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[ofsp1[2]]); // L64bit: same for the next sample
-       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[ofsp1[3]]); // H64bit: same for the next sample
-#endif
-#else
 #if defined(IS_RS_DATA_T_DOUBLE)
-       tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]])); // load ofsi and ofsi+1
-       tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]])); // same for the next sample
-       tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]])); // same for the next sample
-       tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]])); // same for the next sample
+       tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)])); // load ofsi and ofsi+1
+       tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)])); // same for the next sample
+       tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)])); // same for the next sample
+       tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)])); // same for the next sample
        tmp1 = _mm_shuffle_ps(tmp1, tmp2, 0x44);
        tmp3 = _mm_shuffle_ps(tmp3, tmp4, 0x44);
 #else // defined(IS_RS_DATA_T_FLOAT)
-       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[0]]); // L64bit: load ofsi and ofsi+1
-       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[1]]); // H64bit: same for the next sample
-       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[2]]); // L64bit: same for the next sample
-       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[3]]); // H64bit: same for the next sample
+       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // L64bit: load ofsi and ofsi+1
+       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // H64bit: same for the next sample
+       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // L64bit: same for the next sample
+       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // H64bit: same for the next sample
 #endif
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))     
        vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); // v1[0,1,2,3]  // ofsi goes to v1
        vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); // v2[0,1,2,3]  // ofsi+1 goes to v2
        vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);    
@@ -328,18 +311,10 @@ static inline void is_resample_core(Info_Resample *rs, DATA_T *is_buf, IS_RS_DAT
        vofsf = _mm_and_si128(vofs, vfmask);
        vfp1 = _mm_mul_pd(_mm_cvtepi32_pd(vofsf), vec_divf); // int32 to double // calc fp
        vfp2 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(vofsf, 0x4E)), vec_divf); // int32 to double // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-       ofsp1 = (int32 *)vofsi;
-       tmp1 = _mm_loadu_pd(&rs_buf[ofsp1[0]]); // load ofsi and ofsi+1
-       tmp2 = _mm_loadu_pd(&rs_buf[ofsp1[1]]); // same for the next sample
-       tmp3 = _mm_loadu_pd(&rs_buf[ofsp1[2]]); // same for the next sample
-       tmp4 = _mm_loadu_pd(&rs_buf[ofsp1[3]]); // same for the next sample
-#else
-       tmp1 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]]); // load ofsi and ofsi+1
-       tmp2 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]]); // same for the next sample
-       tmp3 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]]); // same for the next sample
-       tmp4 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]]); // same for the next sample
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+       tmp1 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+       tmp2 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+       tmp3 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+       tmp4 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
        vv11 = _mm_shuffle_pd(tmp1, tmp2, 0x00); // v1[0,1] // ofsi goes to v1
        vv21 = _mm_shuffle_pd(tmp1, tmp2, 0x03); // v2[0,1] // ofsi+1 goes to v2
        vv12 = _mm_shuffle_pd(tmp3, tmp4, 0x00); // v1[2,3] // ofsi goes to v1
@@ -688,18 +663,10 @@ static inline void is_resample_core(Info_Resample *rs, DATA_T *is_buf, IS_RS_DAT
        vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
        vosfsf = _mm_and_si128(vofs, vfmask);
        vfp = _mm_mul_ps(_mm_cvtepi32_ps(vosfsf), vec_divf); // int32 to float // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-       ofsp1 = (int32 *)vofsi; 
-       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[ofsp1[0]]); // L64bit: load ofsi and ofsi+1
-       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[ofsp1[1]]); // H64bit: same for the next sample
-       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[ofsp1[2]]); // L64bit: same for the next sample
-       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[ofsp1[3]]); // H64bit: same for the next sample
-#else
-       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[0]]); // L64bit: load ofsi and ofsi+1
-       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[1]]); // H64bit: same for the next sample
-       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[2]]); // L64bit: same for the next sample
-       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[3]]); // H64bit: same for the next sample
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+       tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // L64bit: load ofsi and ofsi+1
+       tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // H64bit: same for the next sample
+       tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // L64bit: same for the next sample
+       tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // H64bit: same for the next sample
        vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); // v1[0,1,2,3]  // ofsi goes to v1
        vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); // v2[0,1,2,3]  // ofsi+1 goes to v2
        vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);                    
@@ -739,10 +706,10 @@ static inline void is_resample_core(Info_Resample *rs, DATA_T *is_buf, IS_RS_DAT
        tmp3 = _mm_loadu_pd(&rs_buf[ofsp1[2]]); // same for the next sample
        tmp4 = _mm_loadu_pd(&rs_buf[ofsp1[3]]); // same for the next sample
 #else
-       tmp1 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]]); // load ofsi and ofsi+1
-       tmp2 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]]); // same for the next sample
-       tmp3 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]]); // same for the next sample
-       tmp4 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]]); // same for the next sample
+       tmp1 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+       tmp2 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+       tmp3 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+       tmp4 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
 #endif // !(defined(_MSC_VER) || defined(MSC_VER))     
        vv11 = _mm_shuffle_pd(tmp1, tmp2, 0x00); // v1[0,1] // ofsi goes to v1
        vv21 = _mm_shuffle_pd(tmp1, tmp2, 0x03); // v2[0,1] // ofsi+1 goes to v2
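
In the float paths above, _mm_loadl_pi/_mm_loadh_pi gather two (v1,v2) sample pairs into each register, and the 0x88/0xdd shuffles then separate them into a v1[0..3] vector and a v2[0..3] vector. The repacking step in isolation (a sketch; names are illustrative):

    #include <xmmintrin.h>

    /* tmp1 = [v1_0 v2_0 v1_1 v2_1], tmp3 = [v1_2 v2_2 v1_3 v2_3] */
    static void split_pairs(__m128 tmp1, __m128 tmp3, __m128 *vv1, __m128 *vv2)
    {
        *vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); /* even lanes: v1_0..v1_3 */
        *vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); /* odd lanes:  v2_0..v2_3 */
    }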
timidity/optcode.c
index 7541519..5676570 100755
@@ -221,7 +221,34 @@ int32 imuldiv28(int32 a, int32 b) {
 
 /*****************************************************************************/
 #if (USE_X86_EXT_ASM || USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_ASM || USE_X86_AMD_EXT_INTRIN)
-
+#ifdef __GNUC__
+inline void CPUID(int32 *regs, uint32 eax)
+{
+       uint32 ebx,ecx,edx;
+       __asm__ __volatile__ (
+#ifdef __x86_64__
+               "push           %%rbx           \n\t"
+#else
+               "push           %%ebx           \n\t"
+#endif
+               "cpuid                                  \n\t"
+               "mov            %%ebx, %1       \n\t"
+#ifdef __x86_64__
+               "pop            %%rbx           \n\t"
+#else
+               "pop            %%ebx           \n\t"
+#endif
+               : "+a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
+       );
+       regs[0] = eax;
+       regs[1] = ebx;
+       regs[2] = ecx;
+       regs[3] = edx;
+}
+#else
+/* MSVC: __cpuid is provided by <intrin.h> */
+#define CPUID __cpuid
+#endif
 enum{
        X86_VENDER_INTEL=0,
        X86_VENDER_AMD,
@@ -238,6 +265,15 @@ static const char* x86_vendors[] =
 // get extended flags
 static inline int64    xgetbv(int index)
 {
+#if defined(__GNUC__)
+       unsigned int eax, edx;
+       __asm__ __volatile__ (
+               "xgetbv         \n\t"
+               : "=a"(eax), "=d"(edx)
+               : "c"(index)
+       );
+       return (uint64)eax|((uint64)edx<<32);
+#else
 #if (USE_X86_EXT_ASM || USE_X86_AMD_EXT_ASM)
        uint64 flg = 0;
        //_asm {
@@ -250,7 +286,7 @@ static inline int64 xgetbv(int index)
 #elif (USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_INTRIN)
        return _xgetbv(index);
 #endif
-
+#endif
 }
 
 
@@ -270,7 +306,7 @@ int is_x86ext_available(void)
        uint32 flg4; // extended feature flg pg2
 
        memset(vendor, 0, sizeof(vendor));
-       __cpuid(reg, 0);
+       CPUID(reg,0);
        cmd = reg[0];
        ((uint32*)vendor)[0] = reg[1];
        ((uint32*)vendor)[1] = reg[3];
@@ -280,14 +316,14 @@ int is_x86ext_available(void)
                        break;
        }
        if(cmd >= 0x00000001){
-               __cpuid(reg, 0x00000001);
+               CPUID(reg,0x00000001);
                flg1 = reg[3];
                flg2 = reg[2];
        }
-       __cpuid(reg, 0x80000000);
+       CPUID(reg,0x80000000);
        cmd = reg[ 0 ];
        if(cmd >= 0x80000001){
-               __cpuid(reg, 0x80000001);
+               CPUID(reg,0x80000001);
                flg4 = reg[2];
                flg3 = reg[3];
        }
timidity/optcode.h
index bb964bd..61de48e 100644
 #ifndef OPTCODE_H_INCLUDED
 #define OPTCODE_H_INCLUDED 1
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#endif
+
 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__X86__) || defined(__I86__)
 #define IX86CPU 1
 #endif
 
-#if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__)
+#if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__) || defined(__x86_64__)
 #define IX64CPU 1
 #undef IX86CPU
 #undef IA64CPU
@@ -843,6 +848,19 @@ LSU : Unalignment (use loadu/storeu
 #define MM_LSU_MUL_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_mul_ps(_mm_loadu_ps(ptr), vec_a))
 #endif
 
+#if (USE_X86_EXT_INTRIN >= 1)
+#if !(defined(_MSC_VER) || defined(MSC_VER))
+#define MM_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_shuffle_ps(reg,reg,idx))
+#define MM_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_shuffle_pd(reg,reg,idx))
+#define MM_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(reg,idx))
+#define MM256_EXTRACT_I32(reg,idx) _mm256_extract_epi32(reg,idx)
+#else
+#define MM_EXTRACT_F32(reg,idx) reg.m128_f32[idx]
+#define MM_EXTRACT_F64(reg,idx) reg.m128d_f64[idx]
+#define MM_EXTRACT_I32(reg,idx) reg.m128i_i32[idx]
+#define MM256_EXTRACT_I32(reg,idx) reg.m256i_i32[idx]
+#endif
+#endif // (USE_X86_EXT_INTRIN >= 1)
 
 #define IS_ALIGN(ptr) (!((int32)ptr & (ALIGN_SIZE - 1)))
 extern int is_x86ext_available(void);
@@ -918,4 +936,8 @@ static inline void *switch_memset(void *destp, int c, size_t len)
 #define memset switch_memset
 #endif /* altivec */
 
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
 #endif /* OPTCODE_H_INCLUDED */
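
On the non-MSVC side these macros expand to shuffle intrinsics whose index is an instruction immediate, so idx must be a compile-time constant; every call site in this patch passes a literal. A small usage sketch (assumes optcode.h's macros are in scope; values are illustrative):

    #include <emmintrin.h>

    static int demo(void)
    {
        __m128d v = _mm_set_pd(2.0, 1.0);       /* lanes: [0] = 1.0, [1] = 2.0 */
        double hi = MM_EXTRACT_F64(v, 1);       /* 2.0 */
        __m128i ix = _mm_set_epi32(7, 5, 3, 1); /* lanes: [0]=1 [1]=3 [2]=5 [3]=7 */
        return MM_EXTRACT_I32(ix, 2) + (int)hi; /* 5 + 2 */
    }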
timidity/output.c
index fea43ff..e1b5774 100644
@@ -504,20 +504,10 @@ static void CALLINGCONV f64tos8(DATA_T *lp, int32 c)
        __m128 vmul = _mm_set1_ps((float)MAX_8BIT_SIGNED);      
        for(i = 0; i < c; i += 4){
                __m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               cp[i] = (int8)(out[0]);
-               cp[i] = (int8)(out[1]);
-               cp[i] = (int8)(out[2]);
-               cp[i] = (int8)(out[3]); 
-               }
-#else
-               cp[i] = (int8)(vec_f.m128_f32[0]);
-               cp[i] = (int8)(vec_f.m128_f32[1]);
-               cp[i] = (int8)(vec_f.m128_f32[2]);
-               cp[i] = (int8)(vec_f.m128_f32[3]);      
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               cp[i] = (int8)(MM_EXTRACT_F32(vec_f,0));
+               cp[i + 1] = (int8)(MM_EXTRACT_F32(vec_f,1));
+               cp[i + 2] = (int8)(MM_EXTRACT_F32(vec_f,2));
+               cp[i + 3] = (int8)(MM_EXTRACT_F32(vec_f,3));
        }
 }
 #else
@@ -601,20 +591,10 @@ static void CALLINGCONV f64tou8(DATA_T *lp, int32 c)
        __m128i vex = _mm_set1_epi8(0x80);      
        for(i = 0; i < c; i += 4){
                __m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               cp[i] = 0x80 ^ (uint8)(out[0]);
-               cp[i] = 0x80 ^ (uint8)(out[1]);
-               cp[i] = 0x80 ^ (uint8)(out[2]);
-               cp[i] = 0x80 ^ (uint8)(out[3]); 
-               }
-#else
-               cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[0]);
-               cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[1]);
-               cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[2]);
-               cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               cp[i] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,0));
+               cp[i + 1] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,1));
+               cp[i + 2] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,2));
+               cp[i + 3] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,3));
        }
 }
 #else
@@ -637,20 +617,10 @@ static void CALLINGCONV f64toulaw(DATA_T *lp, int32 c)
        __m256d vmul = _mm256_set1_pd((double)MAX_16BIT_SIGNED);        
        for(i = 0; i < c; i += 4){
                __m128i vec0 = _mm256_cvttpd_epi32(_mm256_mul_pd(D256_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               int32 *out = (int32 *)vec0;
-               up[i] = AUDIO_S2U(out[0]);
-               up[i + 1] = AUDIO_S2U(out[1]);
-               up[i + 2] = AUDIO_S2U(out[2]);
-               up[i + 3] = AUDIO_S2U(out[3]);
-               }
-#else
-               up[i] = AUDIO_S2U(vec0.m128i_i32[0]);
-               up[i + 1] = AUDIO_S2U(vec0.m128i_i32[1]);
-               up[i + 2] = AUDIO_S2U(vec0.m128i_i32[2]);
-               up[i + 3] = AUDIO_S2U(vec0.m128i_i32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec0,0));
+               up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec0,1));
+               up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec0,2));
+               up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec0,3));
        }
 }
 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
@@ -665,20 +635,10 @@ static void CALLINGCONV f64toulaw(DATA_T *lp, int32 c)
                __m128 vec_f12 = _mm_cvtpd_ps(_mm_load_pd(&lp[i + 2]));
                __m128 vec_f1 = _mm_shuffle_ps(vec_f11, vec_f12, 0x44);
                __m128i vec_i32 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_MM(vec_f1, gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               int32 *out = (int32 *)vec_i32;
-               up[i] = AUDIO_S2U(out[0]);
-               up[i + 1] = AUDIO_S2U(out[1]);
-               up[i + 2] = AUDIO_S2U(out[2]);
-               up[i + 3] = AUDIO_S2U(out[3]);
-               }
-#else
-               up[i] = AUDIO_S2U(vec_i32.m128i_i32[0]);
-               up[i + 1] = AUDIO_S2U(vec_i32.m128i_i32[1]);
-               up[i + 2] = AUDIO_S2U(vec_i32.m128i_i32[2]);
-               up[i + 3] = AUDIO_S2U(vec_i32.m128i_i32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,0));
+               up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,1));
+               up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,2));
+               up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,3));
        }       
 }
 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_FLOAT)
@@ -690,20 +650,10 @@ static void CALLINGCONV f64toulaw(DATA_T *lp, int32 c)
        __m128 vmul = _mm_set1_ps((float)MAX_16BIT_SIGNED);
        for(i = 0; i < c; i += 4){
                __m128i vec0 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               int32 *out = (int32 *)vec0;
-               up[i] = AUDIO_S2U(out[0]);
-               up[i + 1] = AUDIO_S2U(out[1]);
-               up[i + 2] = AUDIO_S2U(out[2]);
-               up[i + 3] = AUDIO_S2U(out[3]);
-               }
-#else
-               up[i] = AUDIO_S2U(vec0.m128i_i32[0]);
-               up[i + 1] = AUDIO_S2U(vec0.m128i_i32[1]);
-               up[i + 2] = AUDIO_S2U(vec0.m128i_i32[2]);
-               up[i + 3] = AUDIO_S2U(vec0.m128i_i32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec0,0));
+               up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec0,1));
+               up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec0,2));
+               up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec0,3));
        }
 }
 #else
@@ -726,20 +676,10 @@ static void CALLINGCONV f64toalaw(DATA_T *lp, int32 c)
        __m256d vmul = _mm256_set1_pd((double)MAX_16BIT_SIGNED);                
        for(i = 0; i < c; i += 4){
                __m128i vec0 = _mm256_cvttpd_epi32(_mm256_mul_pd(D256_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               int32 *out = (int32 *)vec0;
-               up[i] = AUDIO_S2A(out[0]);
-               up[i + 1] = AUDIO_S2A(out[1]);
-               up[i + 2] = AUDIO_S2A(out[2]);
-               up[i + 3] = AUDIO_S2A(out[3]);
-               }
-#else
-               up[i] = AUDIO_S2A(vec0.m128i_i32[0]);
-               up[i + 1] = AUDIO_S2A(vec0.m128i_i32[1]);
-               up[i + 2] = AUDIO_S2A(vec0.m128i_i32[2]);
-               up[i + 3] = AUDIO_S2A(vec0.m128i_i32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               up[i] = AUDIO_S2A(MM_EXTRACT_I32(vec0,0));
+               up[i + 1] = AUDIO_S2A(MM_EXTRACT_I32(vec0,1));
+               up[i + 2] = AUDIO_S2A(MM_EXTRACT_I32(vec0,2));
+               up[i + 3] = AUDIO_S2A(MM_EXTRACT_I32(vec0,3));
        }
 }
 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
@@ -754,20 +694,10 @@ static void CALLINGCONV f64toalaw(DATA_T *lp, int32 c)
                __m128 vec_f12 = _mm_cvtpd_ps(_mm_load_pd(&lp[i + 2]));
                __m128 vec_f1 = _mm_shuffle_ps(vec_f11, vec_f12, 0x44);
                __m128i vec_i32 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_MM(vec_f1, gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               int32 *out = (int32 *)vec_i32;
-               up[i] = AUDIO_S2A(out[0]);
-               up[i + 1] = AUDIO_S2A(out[1]);
-               up[i + 2] = AUDIO_S2A(out[2]);
-               up[i + 3] = AUDIO_S2A(out[3]);
-               }
-#else
-               up[i] = AUDIO_S2A(vec_i32.m128i_i32[0]);
-               up[i + 1] = AUDIO_S2A(vec_i32.m128i_i32[1]);
-               up[i + 2] = AUDIO_S2A(vec_i32.m128i_i32[2]);
-               up[i + 3] = AUDIO_S2A(vec_i32.m128i_i32[3]);
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               up[i] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,0));
+               up[i + 1] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,1));
+               up[i + 2] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,2));
+               up[i + 3] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,3));
        }
 }
 #else
@@ -840,20 +770,10 @@ static void CALLINGCONV f64tos16(DATA_T *lp, int32 c)
        __m128 vmul = _mm_set1_ps((float)MAX_16BIT_SIGNED);     
        for(i = 0; i < c; i += 4){
                __m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               sp[i] = (int16)(out[0]);
-               sp[i] = (int16)(out[1]);
-               sp[i] = (int16)(out[2]);
-               sp[i] = (int16)(out[3]);        
-               }
-#else
-               sp[i] = (int16)(vec_f.m128_f32[0]);
-               sp[i] = (int16)(vec_f.m128_f32[1]);
-               sp[i] = (int16)(vec_f.m128_f32[2]);
-               sp[i] = (int16)(vec_f.m128_f32[3]);             
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               sp[i] = (int16)(MM_EXTRACT_F32(vec_f,0));
+               sp[i + 1] = (int16)(MM_EXTRACT_F32(vec_f,1));
+               sp[i + 2] = (int16)(MM_EXTRACT_F32(vec_f,2));
+               sp[i + 3] = (int16)(MM_EXTRACT_F32(vec_f,3));
        }
 }
 #else
@@ -1062,20 +982,10 @@ static void CALLINGCONV f64tos24(DATA_T *lp, int32 c)
        __m128 vmul = _mm_set1_ps((float)MAX_24BIT_SIGNED);
        for(i = 0; i < c; i += 4){ // 108 inst in loop
                __m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               STORE_S24(cp, (int32)(out[0]));
-               STORE_S24(cp, (int32)(out[1]));
-               STORE_S24(cp, (int32)(out[2]));
-               STORE_S24(cp, (int32)(out[3]));
-               }
-#else
-               STORE_S24(cp, (int32)(vec_f.m128_f32[0]));
-               STORE_S24(cp, (int32)(vec_f.m128_f32[1]));
-               STORE_S24(cp, (int32)(vec_f.m128_f32[2]));
-               STORE_S24(cp, (int32)(vec_f.m128_f32[3]));      
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,0)));
+               STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,1)));
+               STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,2)));
+               STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,3)));        
        }
 }
 #else
@@ -1186,20 +1096,10 @@ static void CALLINGCONV f64tos32(DATA_T *lp, int32 c)
        __m128 vmul = _mm_set1_ps((float)MAX_32BIT_SIGNED);     
        for(i = 0; i < c; i += 4){
                __m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               sp[i] = (int32)(out[0]);
-               sp[i] = (int32)(out[1]);
-               sp[i] = (int32)(out[2]);
-               sp[i] = (int32)(out[3]);        
-               }
-#else
-               sp[i] = (int32)(vec_f.m128_f32[0]);
-               sp[i] = (int32)(vec_f.m128_f32[1]);
-               sp[i] = (int32)(vec_f.m128_f32[2]);
-               sp[i] = (int32)(vec_f.m128_f32[3]);     
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               sp[i] = (int32)(MM_EXTRACT_F32(vec_f,0));
+               sp[i + 1] = (int32)(MM_EXTRACT_F32(vec_f,1));
+               sp[i + 2] = (int32)(MM_EXTRACT_F32(vec_f,2));
+               sp[i + 3] = (int32)(MM_EXTRACT_F32(vec_f,3));
        }
 }
 #else
@@ -1685,20 +1585,10 @@ static void CALLINGCONV f64tof64(DATA_T *lp, int32 c)
        __m128 gain = _mm_set1_ps((float)INPUT_GAIN);
        for(i = c - 4; i >= 0; i -= 4){
                __m128 vec_f = F128_CLIP_INPUT(&lp[i], gain);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_f;
-               sp[i] = (double)(out[0]);
-               sp[i] = (double)(out[1]);
-               sp[i] = (double)(out[2]);
-               sp[i] = (double)(out[3]);       
-               }
-#else
-               sp[i] = (double)(vec_f.m128_f32[0]);
-               sp[i] = (double)(vec_f.m128_f32[1]);
-               sp[i] = (double)(vec_f.m128_f32[2]);
-               sp[i] = (double)(vec_f.m128_f32[3]);            
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+               sp[i] = (double)(MM_EXTRACT_F32(vec_f,0));
+               sp[i + 1] = (double)(MM_EXTRACT_F32(vec_f,1));
+               sp[i + 2] = (double)(MM_EXTRACT_F32(vec_f,2));
+               sp[i + 3] = (double)(MM_EXTRACT_F32(vec_f,3));
        }
 }
 #elif defined(DATA_T_DOUBLE)
timidity/playmidi.c
index 71e0112..3524d53 100644
@@ -11377,16 +11377,8 @@ static inline void mix_ch_signal_source(DATA_T *src, int ch, int count)
                                        vevol = _mm_shuffle_ps(vevol, vevol, 0x44);
                                }
                                vsp = _mm_mul_ps(_mm_loadu_ps(src), vevol);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-                               {
-                               float *out = (float *)vsp;
-                               *(src++) = out[0];
-                               *(src++) = out[1];
-                               }
-#else
-                               *(src++) = vsp.m128_f32[0];
-                               *(src++) = vsp.m128_f32[1];     
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+                               *(src++) = MM_EXTRACT_F32(vsp,0);
+                               *(src++) = MM_EXTRACT_F32(vsp,1);       
                        }
 
 #else // ! USE_X86_EXT_INTRIN
timidity/resample.c
index be984c6..c5b9b5b 100644
@@ -4276,26 +4276,14 @@ static inline DATA_T *resample_linear_multi(Voice *vp, DATA_T *dest, int32 req_c
 
        for(; i < count; i += 8) {
        __m256i vofsi = _mm256_srli_epi32(vofs, FRACTION_BITS);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-       int32 *ofsp = (int32 *)vofsi;
-       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[ofsp[0]]); // load ofsi and ofsi+1
-       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[ofsp[1]]); // same for the next sample
-       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[ofsp[2]]); // same for the next sample
-       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[ofsp[3]]); // same for the next sample
-       __m128i vin5 = _mm_loadu_si128((__m128i *)&src[ofsp[4]]); // same for the next sample
-       __m128i vin6 = _mm_loadu_si128((__m128i *)&src[ofsp[5]]); // same for the next sample
-       __m128i vin7 = _mm_loadu_si128((__m128i *)&src[ofsp[6]]); // same for the next sample
-       __m128i vin8 = _mm_loadu_si128((__m128i *)&src[ofsp[7]]); // same for the next sample
-#else
-       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[0]]); // load ofsi and ofsi+1
-       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[1]]); // same for the next sample
-       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[2]]); // same for the next sample
-       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[3]]); // same for the next sample
-       __m128i vin5 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[4]]); // same for the next sample
-       __m128i vin6 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[5]]); // same for the next sample
-       __m128i vin7 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[6]]); // same for the next sample
-       __m128i vin8 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[7]]); // same for the next sample
-#endif
+       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,1)]); // same for the next sample
+       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,2)]); // same for the next sample
+       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,3)]); // same for the next sample
+       __m128i vin5 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,4)]); // same for the next sample
+       __m128i vin6 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,5)]); // same for the next sample
+       __m128i vin7 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,6)]); // same for the next sample
+       __m128i vin8 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,7)]); // same for the next sample
        __m128i vin12 = _mm_unpacklo_epi16(vin1, vin2); // [v11v21]e96,[v12v22]e96 to [v11v12v21v22]e64
        __m128i vin34 = _mm_unpacklo_epi16(vin3, vin4); // [v13v23]e96,[v14v24]e96 to [v13v14v23v24]e64
        __m128i vin56 = _mm_unpacklo_epi16(vin5, vin6); // same as above
@@ -4377,7 +4365,7 @@ static inline DATA_T *resample_linear_multi(Voice *vp, DATA_T *dest, int32 req_c
        dest += 4;
 #elif defined(DATA_T_FLOAT) // DATA_T_FLOAT 
        __m128 vec_out = _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1), vec_divo);
-       _mm256_storeu_ps(dest, vec_out);
+       _mm_storeu_ps(dest, vec_out);
        dest += 4;
 #else // DATA_T_IN32
        __m128 vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);
@@ -4497,18 +4485,10 @@ static inline DATA_T *resample_linear_multi(Voice *vp, DATA_T *dest, int32 req_c
        const __m128 vec_divo = _mm_set1_ps(DIV_15BIT);
        for(; i < count; i += 4) {
        __m128i vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-       int32 *ofsp = (int32 *)vofsi;
-       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[ofsp[0]]); // load ofsi and ofsi+1
-       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[ofsp[1]]); // same for the next sample
-       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[ofsp[2]]); // same for the next sample
-       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[ofsp[3]]); // same for the next sample
-#else
-       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[0]]); // load ofsi and ofsi+1
-       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[1]]); // same for the next sample
-       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[2]]); // same for the next sample
-       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[3]]); // same for the next sample
-#endif
+       __m128i vin1 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+       __m128i vin2 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+       __m128i vin3 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+       __m128i vin4 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
        __m128i vin12 = _mm_unpacklo_epi16(vin1, vin2); // [v11v21]e96,[v12v22]e96 to [v11v12v21v22]e64
        __m128i vin34 = _mm_unpacklo_epi16(vin3, vin4); // [v13v23]e96,[v14v24]e96 to [v13v14v23v24]e64
        __m128i vi16 = _mm_unpacklo_epi32(vin12, vin34); // [v11v12,v21v22]e64,[v13v14,v23v24]e64 to [v11v12v13v14,v21v22v23v24]e0
@@ -4545,7 +4525,7 @@ static inline DATA_T *resample_linear_multi(Voice *vp, DATA_T *dest, int32 req_c
        vofs = _mm_add_epi32(vofs, vinc);
        }
        }
-       resrc->offset = prec_offset + (splen_t)(vofs.m128i_i32[0]);
+       resrc->offset = prec_offset + (splen_t)(MM_EXTRACT_I32(vofs,0));
        *out_count = i;
     return dest;
 }
@@ -4585,20 +4565,10 @@ static inline DATA_T *resample_linear_multi(Voice *vp, DATA_T *dest, int32 req_c
                vv2 = _mm_cvt_si2ss(vv2, src[++ofsi]), vv2 = _mm_shuffle_ps(vv2, vv2, 0x1b);                    
 #if defined(DATA_T_DOUBLE)
                vec_out = _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), _mm_mul_ps(vfp, vec_divf), vv1), vec_divo);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-               {
-               float *out = (float *)vec_out;
-               *dest++ = (DATA_T)out[0];
-               *dest++ = (DATA_T)out[1];
-               *dest++ = (DATA_T)out[2];
-               *dest++ = (DATA_T)out[3];
-               }
-#else
-               *dest++ = (DATA_T)vec_out.m128_f32[0];
-               *dest++ = (DATA_T)vec_out.m128_f32[1];
-               *dest++ = (DATA_T)vec_out.m128_f32[2];
-               *dest++ = (DATA_T)vec_out.m128_f32[3];
-#endif
+               *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,0);
+               *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,1);
+               *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,2);
+               *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,3);
 #elif defined(DATA_T_FLOAT) // DATA_T_FLOAT
                _mm_storeu_ps(dest, _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), _mm_mul_ps(vfp, vec_divf), vv1), vec_divo));
                dest += 4;
timidity/sysdep.h
index 6e27004..64b9466 100644
 #include <sys/types.h>
 #endif
 
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
 #include <stdio.h>
 
 /* Architectures */
@@ -82,7 +86,7 @@
 #if defined(IX86CPU) && (defined(_MSC_VER) || defined(__POCC__) || \
        defined(__BORLANDC__) || defined(__WATCOMC__))
 #define CALLINGCONV __fastcall
-#elif defined(IX86CPU) && defined(__GNUC__)
+#elif defined(IX86CPU) && !defined(AMD64CPU) && defined(__GNUC__)
 #define CALLINGCONV __attribute__((fastcall))
 #else
 #define CALLINGCONV /**/
@@ -944,12 +948,13 @@ int usleep(unsigned int useconds); /* shut gcc warning up */
 #ifdef __MINGW32__
 #define aligned_malloc __mingw_aligned_malloc
 #define aligned_free   __mingw_aligned_free
-#elif __STDC_VERSION__ >= 201112L
-#define aligned_malloc(s,a) aligned_alloc(a,s)
-#define aligned_free   free
-//#elif _POSIX_VERSION >= 200112L
-//#define aligned_malloc(s,a) posix_memalign(,a,s)
+/* C11 aligned_alloc is unsafe here because s must be a multiple of a */
+//#elif __STDC_VERSION__ >= 201112L
+//#define aligned_malloc(s,a) aligned_alloc(a,s)
+//#define aligned_free   free
+#elif defined(__GNUC__) && _POSIX_VERSION >= 200112L
+#define aligned_malloc(s,a) ({void *ptr; if(!s || posix_memalign(&ptr,a,s)) ptr = NULL; ptr;})
+#define aligned_free   free
 #elif _MSC_VER
 #define aligned_malloc _aligned_malloc
 #define aligned_free   _aligned_free
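
The posix_memalign branch above leans on a GNU statement expression and evaluates s twice; where that matters, the same contract can be written as a plain function (a sketch, not part of the patch):

    #include <stdlib.h>

    /* align must be a power of two and a multiple of sizeof(void *);
     * release the result with free(). */
    static inline void *aligned_malloc_fn(size_t size, size_t align)
    {
        void *ptr;
        if (size == 0 || posix_memalign(&ptr, align, size) != 0)
            return NULL;
        return ptr;
    }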
timidity/thread_mix.c
index 86c3813..a41f587 100644
@@ -281,16 +281,8 @@ static inline void mix_mystery_signal_thread(DATA_T *sp, DATA_T *lp, int v, int
                                vsp = _mm_loadu_ps(sp++);
                                vsp = _mm_shuffle_ps(vsp, vsp, 0x50); // [0,1,2,3] to [0,0,1,1]
                                vsp = _mm_mul_ps(vsp, vevol);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
-                               {
-                               float *out = (float *)vsp;
-                               *(lp++) = out[0];
-                               *(lp++) = out[1];
-                               }
-#else
-                               *(lp++) = vsp.m128_f32[0];
-                               *(lp++) = vsp.m128_f32[1];
-#endif //  !(defined(_MSC_VER) || defined(MSC_VER))
+                               *(lp++) = MM_EXTRACT_F32(vsp,0);
+                               *(lp++) = MM_EXTRACT_F32(vsp,1);
                        }
 
 #else // ! USE_X86_EXT_INTRIN
timidity/voice_effect.c
index 3a8a337..e65dbed 100644
@@ -2346,6 +2346,7 @@ static inline void do_vfx_tremolo(int v, VoiceEffect *vfx, DATA_T *sp, int32 cou
        for(i = 0; i < count2; i += 8){
                MM256_LSU_MUL_PS(&sp[i], vamp);
        }
+       }
 #elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
        {
        const int32 req_count_mask = ~(0x7);