Alternative to D83897. I believe the big change here is that I removed the slow unaligned 16-byte memory access feature (FeatureSlowUAMem16).
The downside is that it may adversely affect tuning if someone explicitly targets -march=pentium4 and expects pentium4-tuned code. Of course, pentium4 is so old that our default behavior with the previous settings may not have been the best either.
Reviewed By: echristo, RKSimon
Differential Revision: https://reviews.llvm.org/D83913
FeatureCMOV, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
+// def : ProcessorModel<P, GenericPostRAModel,
+// [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+// FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+// FeatureCMOV, FeatureInsertVZEROUPPER]>;
+
+ // Since 'pentium4' is the default 32-bit CPU on Linux and Windows,
+ // give it more modern tunings.
+ // FIXME: This wouldn't be needed if we supported mtune.
def : ProcessorModel<P, GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+ [FeatureX87, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER,
+ FeatureSlow3OpsLEA, FeatureSlowDivide64,
+ FeatureSlowIncDec, FeatureMacroFusion]>;
}
// Intel Quark.
; X32-LABEL: func_cf_vector_x86
; X32: movl 12(%ebp), %eax
; X32: movl 8(%ebp), %ecx
- ; X32: movsd 24(%eax), %xmm4 # xmm4 = mem[0],zero
- ; X32: movsd %xmm4, 24(%esp)
- ; X32: movsd 16(%eax), %xmm5 # xmm5 = mem[0],zero
- ; X32: movsd %xmm5, 16(%esp)
- ; X32: movsd (%eax), %xmm6 # xmm6 = mem[0],zero
- ; X32: movsd 8(%eax), %xmm7 # xmm7 = mem[0],zero
- ; X32: movsd %xmm7, 8(%esp)
- ; X32: movsd %xmm6, (%esp)
+ ; X32: movups (%eax), %xmm0
+ ; X32: movups 16(%eax), %xmm1
+ ; X32: movaps %xmm0, (%esp)
+ ; X32: movaps %xmm1, 16(%esp)
+ ; X32: movsd (%esp), %xmm4
+ ; X32: movsd 8(%esp), %xmm5
+ ; X32: movsd 16(%esp), %xmm6
+ ; X32: movsd 24(%esp), %xmm7
; X32: calll *___guard_check_icall_fptr
- ; X32: movaps %xmm6, %xmm0
- ; X32: movaps %xmm7, %xmm1
- ; X32: movaps %xmm5, %xmm2
- ; X32: movaps %xmm4, %xmm3
+ ; X32: movaps %xmm4, %xmm0
+ ; X32: movaps %xmm5, %xmm1
+ ; X32: movaps %xmm6, %xmm2
+ ; X32: movaps %xmm7, %xmm3
; X32: calll *%ecx
}
attributes #0 = { "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=SLOW
; Intel chips with fast unaligned memory accesses
+; Marked fast because this is the default 32-bit mode CPU in clang.
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=FAST
+
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
; OBJ: SubSectionType: FrameData (0xF5)
; OBJ: FrameData {
; OBJ: RvaStart: 0x0
-; OBJ: CodeSize: 0x34
+; OBJ: CodeSize: 0x36
; OBJ: PrologSize: 0x9
; OBJ: FrameFunc [
; OBJ-NEXT: $T0 .raSearch =
; OBJ: }
; OBJ: FrameData {
; OBJ: RvaStart: 0x7
-; OBJ: CodeSize: 0x2D
+; OBJ: CodeSize: 0x2F
; OBJ: PrologSize: 0x2
; OBJ: FrameFunc [
; OBJ-NEXT: $T0 .raSearch =
; OBJ: }
; OBJ: FrameData {
; OBJ: RvaStart: 0x8
-; OBJ: CodeSize: 0x2C
+; OBJ: CodeSize: 0x2E
; OBJ: PrologSize: 0x1
; OBJ: FrameFunc [
; OBJ-NEXT: $T0 .raSearch =
; OBJ: }
; OBJ: FrameData {
; OBJ: RvaStart: 0x9
-; OBJ: CodeSize: 0x2B
+; OBJ: CodeSize: 0x2D
; OBJ: PrologSize: 0x0
; OBJ: FrameFunc [
; OBJ-NEXT: $T0 .raSearch =
; CHECK: PtrParent: 0x0
; CHECK: PtrEnd: 0x0
; CHECK: PtrNext: 0x0
-; CHECK: CodeSize: 0x39
+; CHECK: CodeSize: 0x2A
; CHECK: DbgStart: 0x0
; CHECK: DbgEnd: 0x0
; CHECK: FunctionType: f (0x1002)
; CHECK: LocalVariableAddrRange {
; CHECK: OffsetStart: .text+0x6
; CHECK: ISectStart: 0x0
-; CHECK: Range: 0x33
+; CHECK: Range: 0x24
; CHECK: }
; CHECK: }
; CHECK: ProcEnd {