Originally committed to x264 in
1637239a by Henrik Gramner who has
agreed to re-license it as LGPL. Original commit message follows.
x86: Avoid some bypass delays and false dependencies
A bypass delay of 1-3 clock cycles may occur on some CPUs when transitioning
between int and float domains, so try to avoid that if possible.
psrlq %1, 8*(%2)
%endif
%endmacro
+
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+ punpckhqdq %1, %2
+%elif cpuflag(avx)
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro