_mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
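/*
 * Editor's note: a minimal scalar sketch (not the shipped implementation) of
 * what each of the SIMD hunks above computes per delay line, assuming the
 * field meanings suggested by the comments (mcount/mphase drive an LFO,
 * mdepth scales it, mindex - mdelay is the nominal read position) and that
 * lookup2_sine_p() returns the sine of a normalized phase:
 */
static double modulated_read_sketch(const double *buf, double mindex,
				    double mdelay, double mdepth,
				    double mcount, double mphase)
{
	double depth = mdepth * lookup2_sine_p(mcount + mphase); /* LFO value */
	double fpos = mindex - mdelay - depth;   /* fractional read position */
	int32 idx = (int32)fpos;                 /* integer index (vindex) */
	double fp = fpos - (double)idx;          /* fractional part (vfp) */
	double v1 = buf[idx], v2 = buf[idx + 1]; /* pair that _mm_loadu_pd fetches */
	return v1 + (v2 - v1) * fp;              /* linear interpolation */
}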
_mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
_mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
_mm_storeu_pd(&info->acount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->aphase[i][REV_EX_ER_L1])); // count+phase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->aphase[i][REV_EX_RV_L1])); // count+phase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(count)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(cuont)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(count)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(count)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_ER_L1]), vd[0]); // depth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_RV_L1]), vd[1]); // depth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vai, _mm_loadu_pd(&info->adelay[i][REV_EX_ER_L1])), vd[0]); // index-delay-depth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
_mm_storeu_pd(&info->mcount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->mphase[i][REV_EX_ER_L1])); // mcount+mphase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->mphase[i][REV_EX_RV_L1])); // mcount+mphase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(mc)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(mc)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(mc)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(mc)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_ER_L1]), vd[0]); // mdepth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->mdepth[i][REV_EX_RV_L1]), vd[1]); // mdepth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vmi, _mm_loadu_pd(&info->mdelay[i][REV_EX_ER_L1])), vd[0]); // mindex-mdelay-mdepth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->buf[i][REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->buf[i][REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
_mm_storeu_pd(&info->acount[i][REV_EX_RV_L1], vc[1]);
vr[0] = _mm_add_pd(vc[0], _mm_loadu_pd(&info->aphase[i][REV_EX_ER_L1])); // count+phase
vr[1] = _mm_add_pd(vc[1], _mm_loadu_pd(&info->aphase[i][REV_EX_RV_L1])); // count+phase
- vd[0] = _mm_set_pd(lookup2_sine_p(vr[0].m128d_f64[1]), lookup2_sine_p(vr[0].m128d_f64[0])); // lookup2_sine_p(count)
- vd[1] = _mm_set_pd(lookup2_sine_p(vr[1].m128d_f64[1]), lookup2_sine_p(vr[1].m128d_f64[0])); // lookup2_sine_p(cuont)
+ vd[0] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[0],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[0],0))); // lookup2_sine_p(count)
+ vd[1] = _mm_set_pd(lookup2_sine_p(MM_EXTRACT_F64(vr[1],1)), lookup2_sine_p(MM_EXTRACT_F64(vr[1],0))); // lookup2_sine_p(count)
vd[0] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_ER_L1]), vd[0]); // depth* sine
vd[1] = _mm_mul_pd(_mm_loadu_pd(&info->adepth[i][REV_EX_RV_L1]), vd[1]); // depth* sine
vfp[0] = _mm_sub_pd(_mm_sub_pd(vai, _mm_loadu_pd(&info->adelay[i][REV_EX_ER_L1])), vd[0]); // index-delay-depth
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][vindex[0].m128i_i32[0]]); // v1v2
- vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][vindex[0].m128i_i32[1]]); // v1v2
- vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][vindex[1].m128i_i32[0]]); // v1v2
- vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][vindex[1].m128i_i32[1]]); // v1v2
+ vtmp[0] = _mm_loadu_pd(&info->abuf[REV_EX_ER_L1][MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vtmp[1] = _mm_loadu_pd(&info->abuf[REV_EX_ER_R1][MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vtmp[2] = _mm_loadu_pd(&info->abuf[REV_EX_RV_L1][MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vtmp[3] = _mm_loadu_pd(&info->abuf[REV_EX_RV_R1][MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x0);
vv1[1] = _mm_shuffle_pd(vtmp[2], vtmp[3], 0x0);
vv2[0] = _mm_shuffle_pd(vtmp[0], vtmp[1], 0x3);
vfp[0] = _mm_sub_pd(vfp[0], _mm_cvtepi32_pd(vindex[0])); // fp-vindex
vfp[1] = _mm_sub_pd(vfp[1], _mm_cvtepi32_pd(vindex[1])); // fp-vindex
#endif
- vin[0] = _mm_loadu_pd(&info->ptr[vindex[0].m128i_i32[0]]); // v1v2
- vin[1] = _mm_loadu_pd(&info->ptr[vindex[0].m128i_i32[1]]); // v1v2
- vin[2] = _mm_loadu_pd(&info->ptr[vindex[1].m128i_i32[0]]); // v1v2
- vin[3] = _mm_loadu_pd(&info->ptr[vindex[1].m128i_i32[1]]); // v1v2
+ vin[0] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[0],0)]); // v1v2
+ vin[1] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[0],1)]); // v1v2
+ vin[2] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[1],0)]); // v1v2
+ vin[3] = _mm_loadu_pd(&info->ptr[MM_EXTRACT_I32(vindex[1],1)]); // v1v2
vv1[0] = _mm_shuffle_pd(vin[0], vin[1], 0x0); // v1v1
vv1[1] = _mm_shuffle_pd(vin[2], vin[3], 0x0); // v1v1
vv2[0] = _mm_shuffle_pd(vin[0], vin[1], 0x3); // v2v2
vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
vofsf = _mm_and_si128(vofs, vfmask);
vfp = _mm_mul_ps(_mm_cvtepi32_ps(vofsf), vec_divf); // int32 to float // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- ofsp1 = (int32 *)vofsi;
-#if defined(IS_RS_DATA_T_DOUBLE)
- tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[0]])); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[1]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[2]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[ofsp1[3]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp1 = _mm_shuffle_ps(tmp1, tmp2, 0x44);
- tmp3 = _mm_shuffle_ps(tmp3, tmp4, 0x44);
-#else // defined(IS_RS_DATA_T_FLOAT)
- tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[ofsp1[0]]); // L64bit ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[ofsp1[1]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[ofsp1[2]]); // L64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[ofsp1[3]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#endif
-#else
#if defined(IS_RS_DATA_T_DOUBLE)
- tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]])); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]])); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à
+ tmp1 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)])); // load ofsi and ofsi+1
+ tmp2 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)])); // same for the next sample
+ tmp3 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)])); // same for the next sample
+ tmp4 = _mm_cvtpd_ps(_mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)])); // same for the next sample
tmp1 = _mm_shuffle_ps(tmp1, tmp2, 0x44);
tmp3 = _mm_shuffle_ps(tmp3, tmp4, 0x44);
#else // defined(IS_RS_DATA_T_FLOAT)
- tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[0]]); // L64bit ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[1]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[2]]); // L64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[3]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
+ tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // L64bit: load ofsi and ofsi+1
+ tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // H64bit: same for the next sample
+ tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // L64bit: same for the next sample
+ tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // H64bit: same for the next sample
#endif
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); // v1[0,1,2,3] // ofsi goes into v1
vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); // v2[0,1,2,3] // ofsi+1 moves into v2
vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);
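/*
 * Editor's note: a sketch of the 0x88/0xdd deinterleave used above. With
 * tmp1 = [a0 a1 b0 b1] and tmp3 = [c0 c1 d0 d1] (xN = sample at offset x+N),
 * the two shuffles split "current" and "next" samples into separate vectors
 * so MM_FMA_PS can blend them as v1 + (v2 - v1) * fp in one step:
 */
static void split_pairs_sketch(__m128 tmp1, __m128 tmp3, __m128 *vv1, __m128 *vv2)
{
	*vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); /* lanes 0,2 of each: [a0 b0 c0 d0] */
	*vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); /* lanes 1,3 of each: [a1 b1 c1 d1] */
}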
vofsf = _mm_and_si128(vofs, vfmask);
vfp1 = _mm_mul_pd(_mm_cvtepi32_pd(vofsf), vec_divf); // int32 to double // calc fp
vfp2 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(vofsf, 0x4E)), vec_divf); // int32 to double // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- ofsp1 = (int32 *)vofsi;
- tmp1 = _mm_loadu_pd(&rs_buf[ofsp1[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp2 = _mm_loadu_pd(&rs_buf[ofsp1[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadu_pd(&rs_buf[ofsp1[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp4 = _mm_loadu_pd(&rs_buf[ofsp1[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#else
- tmp1 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp2 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp4 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ tmp1 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+ tmp2 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+ tmp3 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+ tmp4 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
vv11 = _mm_shuffle_pd(tmp1, tmp2, 0x00); // v1[0,1] // ofsi goes into v1
vv21 = _mm_shuffle_pd(tmp1, tmp2, 0x03); // v2[0,1] // ofsi+1 moves into v2
vv12 = _mm_shuffle_pd(tmp3, tmp4, 0x00); // v1[2,3] // ofsi goes into v1
vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
vosfsf = _mm_and_si128(vofs, vfmask);
vfp = _mm_mul_ps(_mm_cvtepi32_ps(vosfsf), vec_divf); // int32 to float // calc fp
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- ofsp1 = (int32 *)vofsi;
- tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[ofsp1[0]]); // L64bit ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[ofsp1[1]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[ofsp1[2]]); // L64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[ofsp1[3]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#else
- tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[0]]); // L64bit ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[vofsi.m128i_i32[1]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[2]]); // L64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[vofsi.m128i_i32[3]]); // H64bit \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ tmp1 = _mm_loadl_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // L64bit: load ofsi and ofsi+1
+ tmp1 = _mm_loadh_pi(tmp1, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // H64bit: same for the next sample
+ tmp3 = _mm_loadl_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // L64bit: same for the next sample
+ tmp3 = _mm_loadh_pi(tmp3, (__m64 *)&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // H64bit: same for the next sample
vv1 = _mm_shuffle_ps(tmp1, tmp3, 0x88); // v1[0,1,2,3] // ofsi goes into v1
vv2 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); // v2[0,1,2,3] // ofsi+1 moves into v2
vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);
tmp3 = _mm_loadu_pd(&rs_buf[ofsp1[2]]); // same for the next sample
tmp4 = _mm_loadu_pd(&rs_buf[ofsp1[3]]); // same for the next sample
#else
- tmp1 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- tmp2 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp3 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- tmp4 = _mm_loadu_pd(&rs_buf[vofsi.m128i_i32[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à
+ tmp1 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+ tmp2 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+ tmp3 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+ tmp4 = _mm_loadu_pd(&rs_buf[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
#endif // !(defined(_MSC_VER) || defined(MSC_VER))
vv11 = _mm_shuffle_pd(tmp1, tmp2, 0x00); // v1[0,1] // ofsi goes into v1
vv21 = _mm_shuffle_pd(tmp1, tmp2, 0x03); // v2[0,1] // ofsi+1 moves into v2
/*****************************************************************************/
#if (USE_X86_EXT_ASM || USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_ASM || USE_X86_AMD_EXT_INTRIN)
-
+#ifdef __GNUC__
+inline void CPUID(int32 *regs, uint32 eax)
+{
+ uint32 ebx,ecx,edx;
+ __asm__ __volatile__ (
+#ifdef __x86_64__
+ "push %%rbx \n\t"
+#else
+ "push %%ebx \n\t"
+#endif
+ "cpuid \n\t"
+ "mov %%ebx, %1 \n\t"
+#ifdef __x86_64__
+ "pop %%rbx \n\t"
+#else
+ "pop %%ebx \n\t"
+#endif
+ : "+a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
+ );
+ regs[0] = eax;
+ regs[1] = ebx;
+ regs[2] = ecx;
+ regs[3] = edx;
+}
+#else
+#include <intrin.h>
+#define CPUID __cpuid
+#endif
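/*
 * Editor's note: a usage sketch for the CPUID wrapper above. It mirrors the
 * vendor-string assembly done further down (EBX, EDX, ECX order) and is not
 * part of the original patch:
 */
static void read_vendor_sketch(char vendor[16])
{
	int32 reg[4];
	memset(vendor, 0, 16);
	CPUID(reg, 0);                          /* leaf 0: max leaf + vendor id */
	((uint32 *)vendor)[0] = (uint32)reg[1]; /* EBX: "Genu" / "Auth" */
	((uint32 *)vendor)[1] = (uint32)reg[3]; /* EDX: "ineI" / "enti" */
	((uint32 *)vendor)[2] = (uint32)reg[2]; /* ECX: "ntel" / "cAMD" */
}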
enum{
X86_VENDER_INTEL=0,
X86_VENDER_AMD,
// get extended feature flags
static inline int64 xgetbv(int index)
{
+#if defined(__GNUC__)
+ unsigned int eax, edx;
+ __asm__ __volatile__ (
+ "xgetbv \n\t"
+ : "=a"(eax), "=d"(edx)
+ : "c"(index)
+ );
+ return (uint64)eax|((uint64)edx<<32);
+#else
#if (USE_X86_EXT_ASM || USE_X86_AMD_EXT_ASM)
uint64 flg = 0;
//_asm {
#elif (USE_X86_EXT_INTRIN || USE_X86_AMD_EXT_INTRIN)
return _xgetbv(index);
#endif
-
+#endif
}
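/*
 * Editor's note: a sketch of how xgetbv() is typically consumed. Assuming the
 * caller has already confirmed OSXSAVE (CPUID leaf 1, ECX bit 27), XCR0 bits
 * 1 (XMM state) and 2 (YMM state) must both be set before AVX paths are safe:
 */
static int os_supports_avx_sketch(void)
{
	return (xgetbv(0) & 0x6) == 0x6;
}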
uint32 flg4; // extended feature flg pg2
memset(vendor, 0, sizeof(vendor));
- __cpuid(reg, 0);
+ CPUID(reg,0);
cmd = reg[0];
((uint32*)vendor)[0] = reg[1];
((uint32*)vendor)[1] = reg[3];
break;
}
if(cmd >= 0x00000001){
- __cpuid(reg, 0x00000001);
+ CPUID(reg,0x00000001);
flg1 = reg[3];
flg2 = reg[2];
}
- __cpuid(reg, 0x80000000);
+ CPUID(reg,0x80000000);
cmd = reg[ 0 ];
if(cmd >= 0x80000001){
- __cpuid(reg, 0x80000001);
+ CPUID(reg,0x80000001);
flg4 = reg[2];
flg3 = reg[3];
}
#ifndef OPTCODE_H_INCLUDED
#define OPTCODE_H_INCLUDED 1
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#endif
+
#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__X86__) || defined(__I86__)
#define IX86CPU 1
#endif
-#if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__)
+#if defined(_M_X64) || defined(_AMD64_) || defined(_X64_) || defined(__X64__) || defined(__x86_64__)
#define IX64CPU 1
#undef IX86CPU
#undef IA64CPU
#define MM_LSU_MUL_PS(ptr, vec_a) _mm_storeu_ps(ptr, _mm_mul_ps(_mm_loadu_ps(ptr), vec_a))
#endif
+#if (USE_X86_EXT_INTRIN >= 1)
+#if !(defined(_MSC_VER) || defined(MSC_VER))
+#define MM_EXTRACT_F32(reg,idx) _mm_cvtss_f32(_mm_shuffle_ps(reg,reg,idx))
+#define MM_EXTRACT_F64(reg,idx) _mm_cvtsd_f64(_mm_shuffle_pd(reg,reg,idx))
+#define MM_EXTRACT_I32(reg,idx) _mm_cvtsi128_si32(_mm_shuffle_epi32(reg,idx))
+#define MM256_EXTRACT_I32(reg,idx) _mm256_extract_epi32(reg,idx)
+#else
+#define MM_EXTRACT_F32(reg,idx) reg.m128_f32[idx]
+#define MM_EXTRACT_F64(reg,idx) reg.m128d_f64[idx]
+#define MM_EXTRACT_I32(reg,idx) reg.m128i_i32[idx]
+#define MM256_EXTRACT_I32(reg,idx) reg.m256i_i32[idx]
+#endif
+#endif // (USE_X86_EXT_INTRIN >= 1)
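/*
 * Editor's note: a minimal illustration of the portable extract macros above.
 * On non-MSVC compilers they lower to a shuffle plus a scalar move, avoiding
 * the MSVC-only union members (.m128i_i32 etc.) and the pointer-cast trick
 * the old code paths used:
 */
static int32 sum_lanes_sketch(__m128i v)
{
	return MM_EXTRACT_I32(v, 0) + MM_EXTRACT_I32(v, 1)
	     + MM_EXTRACT_I32(v, 2) + MM_EXTRACT_I32(v, 3);
}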
#define IS_ALIGN(ptr) (!((int32)ptr & (ALIGN_SIZE - 1)))
extern int is_x86ext_available(void);
#define memset switch_memset
#endif /* altivec */
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
#endif /* OPTCODE_H_INCLUDED */
__m128 vmul = _mm_set1_ps((float)MAX_8BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- cp[i] = (int8)(out[0]);
- cp[i] = (int8)(out[1]);
- cp[i] = (int8)(out[2]);
- cp[i] = (int8)(out[3]);
- }
-#else
- cp[i] = (int8)(vec_f.m128_f32[0]);
- cp[i] = (int8)(vec_f.m128_f32[1]);
- cp[i] = (int8)(vec_f.m128_f32[2]);
- cp[i] = (int8)(vec_f.m128_f32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ cp[i] = (int8)(MM_EXTRACT_F32(vec_f,0));
+ cp[i + 1] = (int8)(MM_EXTRACT_F32(vec_f,1));
+ cp[i + 2] = (int8)(MM_EXTRACT_F32(vec_f,2));
+ cp[i + 3] = (int8)(MM_EXTRACT_F32(vec_f,3));
}
}
#else
__m128i vex = _mm_set1_epi8(0x80);
for(i = 0; i < c; i += 4){
__m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- cp[i] = 0x80 ^ (uint8)(out[0]);
- cp[i] = 0x80 ^ (uint8)(out[1]);
- cp[i] = 0x80 ^ (uint8)(out[2]);
- cp[i] = 0x80 ^ (uint8)(out[3]);
- }
-#else
- cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[0]);
- cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[1]);
- cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[2]);
- cp[i] = 0x80 ^ (uint8)(vec_f.m128_f32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ cp[i] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,0));
+ cp[i + 1] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,1));
+ cp[i + 2] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,2));
+ cp[i + 3] = 0x80 ^ (uint8)(MM_EXTRACT_F32(vec_f,3));
}
}
#else
__m256d vmul = _mm256_set1_pd((double)MAX_16BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128i vec0 = _mm256_cvttpd_epi32(_mm256_mul_pd(D256_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- int32 *out = (int32 *)vec0;
- up[i] = AUDIO_S2U(out[0]);
- up[i + 1] = AUDIO_S2U(out[1]);
- up[i + 2] = AUDIO_S2U(out[2]);
- up[i + 3] = AUDIO_S2U(out[3]);
- }
-#else
- up[i] = AUDIO_S2U(vec0.m128i_i32[0]);
- up[i + 1] = AUDIO_S2U(vec0.m128i_i32[1]);
- up[i + 2] = AUDIO_S2U(vec0.m128i_i32[2]);
- up[i + 3] = AUDIO_S2U(vec0.m128i_i32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec0,0));
+ up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec0,1));
+ up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec0,2));
+ up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec0,3));
}
}
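/*
 * Editor's note: the scalar loop these SIMD branches replace, as a sketch.
 * It assumes AUDIO_S2U() maps a signed 16-bit sample to unsigned by flipping
 * the sign bit (its actual definition lives elsewhere in the tree), assumes
 * uint16 from the project's integer typedefs, and folds the clipping done by
 * D256_CLIP_INPUT into a plain clamp:
 */
static void s2u16_scalar_sketch(const double *lp, uint16 *up, int32 c, double gain)
{
	int32 i;
	for (i = 0; i < c; i++) {
		double d = lp[i] * gain;
		if (d > 1.0) d = 1.0; else if (d < -1.0) d = -1.0; /* clip */
		up[i] = AUDIO_S2U((int32)(d * (double)MAX_16BIT_SIGNED));
	}
}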
#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
__m128 vec_f12 = _mm_cvtpd_ps(_mm_load_pd(&lp[i + 2]));
__m128 vec_f1 = _mm_shuffle_ps(vec_f11, vec_f12, 0x44);
__m128i vec_i32 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_MM(vec_f1, gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- int32 *out = (int32 *)vec_i32;
- up[i] = AUDIO_S2U(out[0]);
- up[i + 1] = AUDIO_S2U(out[1]);
- up[i + 2] = AUDIO_S2U(out[2]);
- up[i + 3] = AUDIO_S2U(out[3]);
- }
-#else
- up[i] = AUDIO_S2U(vec_i32.m128i_i32[0]);
- up[i + 1] = AUDIO_S2U(vec_i32.m128i_i32[1]);
- up[i + 2] = AUDIO_S2U(vec_i32.m128i_i32[2]);
- up[i + 3] = AUDIO_S2U(vec_i32.m128i_i32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,0));
+ up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,1));
+ up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,2));
+ up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec_i32,3));
}
}
#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_FLOAT)
__m128 vmul = _mm_set1_ps((float)MAX_16BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128i vec0 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- int32 *out = (int32 *)vec0;
- up[i] = AUDIO_S2U(out[0]);
- up[i + 1] = AUDIO_S2U(out[1]);
- up[i + 2] = AUDIO_S2U(out[2]);
- up[i + 3] = AUDIO_S2U(out[3]);
- }
-#else
- up[i] = AUDIO_S2U(vec0.m128i_i32[0]);
- up[i + 1] = AUDIO_S2U(vec0.m128i_i32[1]);
- up[i + 2] = AUDIO_S2U(vec0.m128i_i32[2]);
- up[i + 3] = AUDIO_S2U(vec0.m128i_i32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ up[i] = AUDIO_S2U(MM_EXTRACT_I32(vec0,0));
+ up[i + 1] = AUDIO_S2U(MM_EXTRACT_I32(vec0,1));
+ up[i + 2] = AUDIO_S2U(MM_EXTRACT_I32(vec0,2));
+ up[i + 3] = AUDIO_S2U(MM_EXTRACT_I32(vec0,3));
}
}
#else
__m256d vmul = _mm256_set1_pd((double)MAX_16BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128i vec0 = _mm256_cvttpd_epi32(_mm256_mul_pd(D256_CLIP_INPUT(&lp[i], gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- int32 *out = (int32 *)vec0;
- up[i] = AUDIO_S2A(out[0]);
- up[i + 1] = AUDIO_S2A(out[1]);
- up[i + 2] = AUDIO_S2A(out[2]);
- up[i + 3] = AUDIO_S2A(out[3]);
- }
-#else
- up[i] = AUDIO_S2A(vec0.m128i_i32[0]);
- up[i + 1] = AUDIO_S2A(vec0.m128i_i32[1]);
- up[i + 2] = AUDIO_S2A(vec0.m128i_i32[2]);
- up[i + 3] = AUDIO_S2A(vec0.m128i_i32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ up[i] = AUDIO_S2A(MM_EXTRACT_I32(vec0,0));
+ up[i + 1] = AUDIO_S2A(MM_EXTRACT_I32(vec0,1));
+ up[i + 2] = AUDIO_S2A(MM_EXTRACT_I32(vec0,2));
+ up[i + 3] = AUDIO_S2A(MM_EXTRACT_I32(vec0,3));
}
}
#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
__m128 vec_f12 = _mm_cvtpd_ps(_mm_load_pd(&lp[i + 2]));
__m128 vec_f1 = _mm_shuffle_ps(vec_f11, vec_f12, 0x44);
__m128i vec_i32 = _mm_cvttps_epi32(_mm_mul_ps(F128_CLIP_MM(vec_f1, gain), vmul));
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- int32 *out = (int32 *)vec_i32;
- up[i] = AUDIO_S2A(out[0]);
- up[i + 1] = AUDIO_S2A(out[1]);
- up[i + 2] = AUDIO_S2A(out[2]);
- up[i + 3] = AUDIO_S2A(out[3]);
- }
-#else
- up[i] = AUDIO_S2A(vec_i32.m128i_i32[0]);
- up[i + 1] = AUDIO_S2A(vec_i32.m128i_i32[1]);
- up[i + 2] = AUDIO_S2A(vec_i32.m128i_i32[2]);
- up[i + 3] = AUDIO_S2A(vec_i32.m128i_i32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ up[i] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,0));
+ up[i + 1] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,1));
+ up[i + 2] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,2));
+ up[i + 3] = AUDIO_S2A(MM_EXTRACT_I32(vec_i32,3));
}
}
#else
__m128 vmul = _mm_set1_ps((float)MAX_16BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- sp[i] = (int16)(out[0]);
- sp[i] = (int16)(out[1]);
- sp[i] = (int16)(out[2]);
- sp[i] = (int16)(out[3]);
- }
-#else
- sp[i] = (int16)(vec_f.m128_f32[0]);
- sp[i] = (int16)(vec_f.m128_f32[1]);
- sp[i] = (int16)(vec_f.m128_f32[2]);
- sp[i] = (int16)(vec_f.m128_f32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ sp[i] = (int16)(MM_EXTRACT_F32(vec_f,0));
+ sp[i + 1] = (int16)(MM_EXTRACT_F32(vec_f,1));
+ sp[i + 2] = (int16)(MM_EXTRACT_F32(vec_f,2));
+ sp[i + 3] = (int16)(MM_EXTRACT_F32(vec_f,3));
}
}
#else
__m128 vmul = _mm_set1_ps((float)MAX_24BIT_SIGNED);
for(i = 0; i < c; i += 4){ // 108 inst in loop
__m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- STORE_S24(cp, (int32)(out[0]));
- STORE_S24(cp, (int32)(out[1]));
- STORE_S24(cp, (int32)(out[2]));
- STORE_S24(cp, (int32)(out[3]));
- }
-#else
- STORE_S24(cp, (int32)(vec_f.m128_f32[0]));
- STORE_S24(cp, (int32)(vec_f.m128_f32[1]));
- STORE_S24(cp, (int32)(vec_f.m128_f32[2]));
- STORE_S24(cp, (int32)(vec_f.m128_f32[3]));
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,0)));
+ STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,1)));
+ STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,2)));
+ STORE_S24(cp, (int32)(MM_EXTRACT_F32(vec_f,3)));
}
}
#else
__m128 vmul = _mm_set1_ps((float)MAX_32BIT_SIGNED);
for(i = 0; i < c; i += 4){
__m128 vec_f = _mm_mul_ps(F128_CLIP_INPUT(&lp[i], gain), vmul);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- sp[i] = (int32)(out[0]);
- sp[i] = (int32)(out[1]);
- sp[i] = (int32)(out[2]);
- sp[i] = (int32)(out[3]);
- }
-#else
- sp[i] = (int32)(vec_f.m128_f32[0]);
- sp[i] = (int32)(vec_f.m128_f32[1]);
- sp[i] = (int32)(vec_f.m128_f32[2]);
- sp[i] = (int32)(vec_f.m128_f32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ sp[i] = (int32)(MM_EXTRACT_F32(vec_f,0));
+ sp[i + 1] = (int32)(MM_EXTRACT_F32(vec_f,1));
+ sp[i + 2] = (int32)(MM_EXTRACT_F32(vec_f,2));
+ sp[i + 3] = (int32)(MM_EXTRACT_F32(vec_f,3));
}
}
#else
__m128 gain = _mm_set1_ps((float)INPUT_GAIN);
for(i = c - 4; i >= 0; i -= 4){
__m128 vec_f = F128_CLIP_INPUT(&lp[i], gain);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_f;
- sp[i] = (double)(out[0]);
- sp[i] = (double)(out[1]);
- sp[i] = (double)(out[2]);
- sp[i] = (double)(out[3]);
- }
-#else
- sp[i] = (double)(vec_f.m128_f32[0]);
- sp[i] = (double)(vec_f.m128_f32[1]);
- sp[i] = (double)(vec_f.m128_f32[2]);
- sp[i] = (double)(vec_f.m128_f32[3]);
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ sp[i] = (double)(MM_EXTRACT_F32(vec_f,0));
+ sp[i + 1] = (double)(MM_EXTRACT_F32(vec_f,1));
+ sp[i + 2] = (double)(MM_EXTRACT_F32(vec_f,2));
+ sp[i + 3] = (double)(MM_EXTRACT_F32(vec_f,3));
}
}
#elif defined(DATA_T_DOUBLE)
vevol = _mm_shuffle_ps(vevol, vevol, 0x44);
}
vsp = _mm_mul_ps(_mm_loadu_ps(src), vevol);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vsp;
- *(src++) = out[0];
- *(src++) = out[1];
- }
-#else
- *(src++) = vsp.m128_f32[0];
- *(src++) = vsp.m128_f32[1];
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ *(src++) = MM_EXTRACT_F32(vsp,0);
+ *(src++) = MM_EXTRACT_F32(vsp,1);
}
#else // ! USE_X86_EXT_INTRIN
for(; i < count; i += 8) {
__m256i vofsi = _mm256_srli_epi32(vofs, FRACTION_BITS);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- int32 *ofsp = (int32 *)vofsi;
- __m128i vin1 = _mm_loadu_si128((__m128i *)&src[ofsp[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- __m128i vin2 = _mm_loadu_si128((__m128i *)&src[ofsp[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin3 = _mm_loadu_si128((__m128i *)&src[ofsp[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin4 = _mm_loadu_si128((__m128i *)&src[ofsp[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin5 = _mm_loadu_si128((__m128i *)&src[ofsp[4]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin6 = _mm_loadu_si128((__m128i *)&src[ofsp[5]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin7 = _mm_loadu_si128((__m128i *)&src[ofsp[6]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin8 = _mm_loadu_si128((__m128i *)&src[ofsp[7]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#else
- __m128i vin1 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- __m128i vin2 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin3 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin4 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin5 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[4]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin6 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[5]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin7 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[6]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin8 = _mm_loadu_si128((__m128i *)&src[vofsi.m256i_i32[7]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#endif
+ __m128i vin1 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+ __m128i vin2 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,1)]); // same for the next sample
+ __m128i vin3 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,2)]); // same for the next sample
+ __m128i vin4 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,3)]); // same for the next sample
+ __m128i vin5 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,4)]); // same for the next sample
+ __m128i vin6 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,5)]); // same for the next sample
+ __m128i vin7 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,6)]); // same for the next sample
+ __m128i vin8 = _mm_loadu_si128((__m128i *)&src[MM256_EXTRACT_I32(vofsi,7)]); // same for the next sample
__m128i vin12 = _mm_unpacklo_epi16(vin1, vin2); // [v11v21]e96,[v12v22]e96 to [v11v12v21v22]e64
__m128i vin34 = _mm_unpacklo_epi16(vin3, vin4); // [v13v23]e96,[v14v24]e96 to [v13v14v23v24]e64
__m128i vin56 = _mm_unpacklo_epi16(vin5, vin6); // same
dest += 4;
#elif defined(DATA_T_FLOAT) // DATA_T_FLOAT
__m128 vec_out = _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1), vec_divo);
- _mm256_storeu_ps(dest, vec_out);
+ _mm_storeu_ps(dest, vec_out);
dest += 4;
#else // DATA_T_INT32
__m128 vec_out = MM_FMA_PS(_mm_sub_ps(vv2, vv1), vfp, vv1);
const __m128 vec_divo = _mm_set1_ps(DIV_15BIT);
for(; i < count; i += 4) {
__m128i vofsi = _mm_srli_epi32(vofs, FRACTION_BITS);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- int32 *ofsp = (int32 *)vofsi;
- __m128i vin1 = _mm_loadu_si128((__m128i *)&src[ofsp[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- __m128i vin2 = _mm_loadu_si128((__m128i *)&src[ofsp[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin3 = _mm_loadu_si128((__m128i *)&src[ofsp[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin4 = _mm_loadu_si128((__m128i *)&src[ofsp[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#else
- __m128i vin1 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[0]]); // ofsi\82Æofsi+1\82ð\83\8d\81[\83h
- __m128i vin2 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[1]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin3 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[2]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
- __m128i vin4 = _mm_loadu_si128((__m128i *)&src[vofsi.m128i_i32[3]]); // \8e\9f\8eü\83T\83\93\83v\83\8b\82à\93¯\82¶
-#endif
+ __m128i vin1 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,0)]); // load ofsi and ofsi+1
+ __m128i vin2 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,1)]); // same for the next sample
+ __m128i vin3 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,2)]); // same for the next sample
+ __m128i vin4 = _mm_loadu_si128((__m128i *)&src[MM_EXTRACT_I32(vofsi,3)]); // same for the next sample
__m128i vin12 = _mm_unpacklo_epi16(vin1, vin2); // [v11v21]e96,[v12v22]e96 to [v11v12v21v22]e64
__m128i vin34 = _mm_unpacklo_epi16(vin3, vin4); // [v13v23]e96,[v14v24]e96 to [v13v14v23v24]e64
__m128i vi16 = _mm_unpacklo_epi32(vin12, vin34); // [v11v12,v21v22]e64,[v13v14,v23v24]e64 to [v11v12v13v14,v21v22v23v24]e0
vofs = _mm_add_epi32(vofs, vinc);
}
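/*
 * Editor's note on the unpack sequence above: each vinN loads int16 data
 * starting at its own offset, so vinN holds [v1n v2n ...] (current and next
 * sample in the low lanes). _mm_unpacklo_epi16 interleaves two such loads
 * into [v11 v12 v21 v22], and _mm_unpacklo_epi32 then gathers
 * [v11 v12 v13 v14 | v21 v22 v23 v24]: all four "current" samples followed
 * by all four "next" samples, ready for the v2 - v1 interpolation.
 */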
}
- resrc->offset = prec_offset + (splen_t)(vofs.m128i_i32[0]);
+ resrc->offset = prec_offset + (splen_t)(MM_EXTRACT_I32(vofs,0));
*out_count = i;
return dest;
}
vv2 = _mm_cvt_si2ss(vv2, src[++ofsi]), vv2 = _mm_shuffle_ps(vv2, vv2, 0x1b);
#if defined(DATA_T_DOUBLE)
vec_out = _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), _mm_mul_ps(vfp, vec_divf), vv1), vec_divo);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vec_out;
- *dest++ = (DATA_T)out[0];
- *dest++ = (DATA_T)out[1];
- *dest++ = (DATA_T)out[2];
- *dest++ = (DATA_T)out[3];
- }
-#else
- *dest++ = (DATA_T)vec_out.m128_f32[0];
- *dest++ = (DATA_T)vec_out.m128_f32[1];
- *dest++ = (DATA_T)vec_out.m128_f32[2];
- *dest++ = (DATA_T)vec_out.m128_f32[3];
-#endif
+ *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,0);
+ *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,1);
+ *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,2);
+ *dest++ = (DATA_T)MM_EXTRACT_F32(vec_out,3);
#elif defined(DATA_T_FLOAT) // DATA_T_FLOAT
_mm_storeu_ps(dest, _mm_mul_ps(MM_FMA_PS(_mm_sub_ps(vv2, vv1), _mm_mul_ps(vfp, vec_divf), vv1), vec_divo));
dest += 4;
#include <sys/types.h>
#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
#include <stdio.h>
/* Architectures */
#if defined(IX86CPU) && (defined(_MSC_VER) || defined(__POCC__) || \
defined(__BORLANDC__) || defined(__WATCOMC__))
#define CALLINGCONV __fastcall
-#elif defined(IX86CPU) && defined(__GNUC__)
+#elif defined(IX86CPU) && !defined(AMD64CPU) && defined(__GNUC__)
#define CALLINGCONV __attribute__((fastcall))
#else
#define CALLINGCONV /**/
#ifdef __MINGW32__
#define aligned_malloc __mingw_aligned_malloc
#define aligned_free __mingw_aligned_free
-#elif __STDC_VERSION__ >= 201112L
-#define aligned_malloc(s,a) aligned_alloc(a,s)
-#define aligned_free free
-//#elif _POSIX_VERSION >= 200112L
-//#define aligned_malloc(s,a) posix_memalign(,a,s)
+/* the C11 aligned_alloc is unsafe here because s must be a multiple of a */
+//#elif __STDC_VERSION__ >= 201112L
+//#define aligned_malloc(s,a) aligned_alloc(a,s)
//#define aligned_free free
+#elif defined(__GNUC__) && _POSIX_VERSION >= 200112L
+#define aligned_malloc(s,a) ({void *ptr_; if(!(s) || posix_memalign(&ptr_,(a),(s))) ptr_ = NULL; ptr_;})
+#define aligned_free free
#elif _MSC_VER
#define aligned_malloc _aligned_malloc
#define aligned_free _aligned_free
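/*
 * Editor's note: a usage sketch for the shim above. Whichever branch is
 * selected, the pair behaves like malloc/free with an alignment guarantee;
 * ALIGN_SIZE and DATA_T are assumed from elsewhere in this header:
 */
static DATA_T *alloc_simd_buffer_sketch(size_t n)
{
	DATA_T *p = (DATA_T *)aligned_malloc(n * sizeof(DATA_T), ALIGN_SIZE);
	return p; /* NULL on failure or n == 0; release with aligned_free(p) */
}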
vsp = _mm_loadu_ps(sp++);
vsp = _mm_shuffle_ps(vsp, vsp, 0x50); // [0,1,2,3] to [0,0,1,1]
vsp = _mm_mul_ps(vsp, vevol);
-#if !(defined(_MSC_VER) || defined(MSC_VER))
- {
- float *out = (float *)vsp;
- *(lp++) = out[0];
- *(lp++) = out[1];
- }
-#else
- *(lp++) = vsp.m128_f32[0];
- *(lp++) = vsp.m128_f32[1];
-#endif // !(defined(_MSC_VER) || defined(MSC_VER))
+ *(lp++) = MM_EXTRACT_F32(vsp,0);
+ *(lp++) = MM_EXTRACT_F32(vsp,1);
}
#else // ! USE_X86_EXT_INTRIN
for(i = 0; i < count2; i += 8){
MM256_LSU_MUL_PS(&sp[i], vamp);
}
+ }
#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
{
const int32 req_count_mask = ~(0x7);