From 91f2876f143dd8b2304490c3e85f96aaaa5ac219 Mon Sep 17 00:00:00 2001 From: Ivailo Monev Date: Tue, 9 Aug 2016 20:19:41 +0000 Subject: [PATCH] get rid of some SIMD remains Signed-off-by: Ivailo Monev --- src/core/tools/qsimd.cpp | 415 --------------------------------------- src/core/tools/qsimd_p.h | 237 ---------------------- src/core/tools/qstring.cpp | 126 ------------ src/core/tools/tools.cmake | 2 - src/gui/image/qimage.cpp | 1 - src/gui/image/qjpeghandler.cpp | 1 - src/gui/image/qpixmap_raster.cpp | 20 +- src/gui/painting/qdrawhelper.cpp | 251 ----------------------- src/gui/painting/qdrawhelper_p.h | 1 - 9 files changed, 6 insertions(+), 1048 deletions(-) delete mode 100644 src/core/tools/qsimd.cpp delete mode 100644 src/core/tools/qsimd_p.h diff --git a/src/core/tools/qsimd.cpp b/src/core/tools/qsimd.cpp deleted file mode 100644 index 2b8045094..000000000 --- a/src/core/tools/qsimd.cpp +++ /dev/null @@ -1,415 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2015 The Qt Company Ltd. -** Contact: http://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see http://www.qt.io/terms-conditions. For further -** information use the contact form at http://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 2.1 or version 3 as published by the Free -** Software Foundation and appearing in the file LICENSE.LGPLv21 and -** LICENSE.LGPLv3 included in the packaging of this file. Please review the -** following information to ensure the GNU Lesser General Public License -** requirements will be met: https://www.gnu.org/licenses/lgpl.html and -** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. -** -** As a special exception, The Qt Company gives you certain additional -** rights. These rights are described in The Qt Company LGPL Exception -** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 3.0 as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL included in the -** packaging of this file. Please review the following information to -** ensure the GNU General Public License version 3.0 requirements will be -** met: http://www.gnu.org/copyleft/gpl.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -#include "qsimd_p.h" -#include -#include - -#if defined(Q_OS_WINCE) -#include -#endif - -#if defined(Q_OS_WIN64) -#include -#endif - -#if defined(Q_OS_LINUX) && defined(__arm__) -#include "qcore_unix_p.h" - -// the kernel header definitions for HWCAP_* -// (the ones we need/may need anyway) - -// copied from (ARM) -#define HWCAP_IWMMXT 512 -#define HWCAP_CRUNCH 1024 -#define HWCAP_THUMBEE 2048 -#define HWCAP_NEON 4096 -#define HWCAP_VFPv3 8192 -#define HWCAP_VFPv3D16 16384 - -// copied from -#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ - -#endif - -QT_BEGIN_NAMESPACE - -#if defined (Q_OS_NACL) -static inline uint detectProcessorFeatures() -{ - return 0; -} -#elif defined (Q_OS_WINCE) -static inline uint detectProcessorFeatures() -{ - uint features = 0; - -#if defined (ARM) - if (IsProcessorFeaturePresent(PF_ARM_INTEL_WMMX)) { - features = IWMMXT; - return features; - } -#elif defined(_X86_) - features = 0; -#if defined QT_HAVE_MMX - if (IsProcessorFeaturePresent(PF_MMX_INSTRUCTIONS_AVAILABLE)) - features |= MMX; -#endif -#if defined QT_HAVE_3DNOW - if (IsProcessorFeaturePresent(PF_3DNOW_INSTRUCTIONS_AVAILABLE)) - features |= MMX3DNOW; -#endif - return features; -#endif - features = 0; - return features; -} - -#elif defined(__arm__) || defined(__arm) || defined(QT_HAVE_IWMMXT) || defined(QT_HAVE_NEON) -static inline uint detectProcessorFeatures() -{ - uint features = 0; - -#if defined(Q_OS_LINUX) - int auxv = ::qt_safe_open("/proc/self/auxv", O_RDONLY); - if (auxv != -1) { - unsigned long vector[64]; - int nread; - while (features == 0) { - nread = ::qt_safe_read(auxv, (char *)vector, sizeof vector); - if (nread <= 0) { - // EOF or error - break; - } - - int max = nread / (sizeof vector[0]); - for (int i = 0; i < max; i += 2) - if (vector[i] == AT_HWCAP) { - if (vector[i+1] & HWCAP_IWMMXT) - features |= IWMMXT; - if (vector[i+1] & HWCAP_NEON) - features |= NEON; - break; - } - } - - ::qt_safe_close(auxv); - return features; - } - // fall back if /proc/self/auxv wasn't found -#endif - -#if defined(QT_HAVE_IWMMXT) - // runtime detection only available when running as a previlegied process - features = IWMMXT; -#elif defined(QT_ALWAYS_HAVE_NEON) - features = NEON; -#endif - - return features; -} - -#elif defined(__i386__) || defined(_M_IX86) -static inline uint detectProcessorFeatures() -{ - uint features = 0; - - unsigned int extended_result = 0; - unsigned int feature_result = 0; - uint result = 0; - /* see p. 118 of amd64 instruction set manual Vol3 */ -#if defined(Q_CC_GNU) - long cpuid_supported, tmp1; - asm ("pushf\n" - "pop %0\n" - "mov %0, %1\n" - "xor $0x00200000, %0\n" - "push %0\n" - "popf\n" - "pushf\n" - "pop %0\n" - "xor %1, %0\n" // %eax is now 0 if CPUID is not supported - : "=a" (cpuid_supported), "=r" (tmp1) - ); - if (cpuid_supported) { - asm ("xchg %%ebx, %2\n" - "cpuid\n" - "xchg %%ebx, %2\n" - : "=c" (feature_result), "=d" (result), "=&r" (tmp1) - : "a" (1)); - - asm ("xchg %%ebx, %1\n" - "cpuid\n" - "cmp $0x80000000, %%eax\n" - "jnbe 1f\n" - "xor %0, %0\n" - "jmp 2f\n" - "1:\n" - "mov $0x80000001, %%eax\n" - "cpuid\n" - "2:\n" - "xchg %%ebx, %1\n" - : "=d" (extended_result), "=&r" (tmp1) - : "a" (0x80000000) - : "%ecx" - ); - } - -#elif defined (Q_OS_WIN) - _asm { - push eax - push ebx - push ecx - push edx - pushfd - pop eax - mov ebx, eax - xor eax, 00200000h - push eax - popfd - pushfd - pop eax - mov edx, 0 - xor eax, ebx - jz skip - - mov eax, 1 - cpuid - mov result, edx - mov feature_result, ecx - skip: - pop edx - pop ecx - pop ebx - pop eax - } - - _asm { - push eax - push ebx - push ecx - push edx - pushfd - pop eax - mov ebx, eax - xor eax, 00200000h - push eax - popfd - pushfd - pop eax - mov edx, 0 - xor eax, ebx - jz skip2 - - mov eax, 80000000h - cpuid - cmp eax, 80000000h - jbe skip2 - mov eax, 80000001h - cpuid - mov extended_result, edx - skip2: - pop edx - pop ecx - pop ebx - pop eax - } -#endif - - - // result now contains the standard feature bits - if (result & (1u << 15)) - features |= CMOV; - if (result & (1u << 23)) - features |= MMX; - if (extended_result & (1u << 22)) - features |= MMXEXT; - if (extended_result & (1u << 31)) - features |= MMX3DNOW; - if (extended_result & (1u << 30)) - features |= MMX3DNOWEXT; - if (result & (1u << 25)) - features |= SSE; - if (result & (1u << 26)) - features |= SSE2; - if (feature_result & (1u)) - features |= SSE3; - if (feature_result & (1u << 9)) - features |= SSSE3; - if (feature_result & (1u << 19)) - features |= SSE4_1; - if (feature_result & (1u << 20)) - features |= SSE4_2; - if (feature_result & (1u << 28)) - features |= AVX; - - return features; -} - -#elif defined(__x86_64) || defined(Q_OS_WIN64) -static inline uint detectProcessorFeatures() -{ - uint features = MMX|SSE|SSE2|CMOV; - uint feature_result = 0; - -#if defined(Q_CC_GNU) - quint64 tmp; - asm ("xchg %%rbx, %1\n" - "cpuid\n" - "xchg %%rbx, %1\n" - : "=c" (feature_result), "=&r" (tmp) - : "a" (1) - : "%edx" - ); -#elif defined (Q_OS_WIN64) - { - int info[4]; - __cpuid(info, 1); - feature_result = info[2]; - } -#endif - - if (feature_result & (1u)) - features |= SSE3; - if (feature_result & (1u << 9)) - features |= SSSE3; - if (feature_result & (1u << 19)) - features |= SSE4_1; - if (feature_result & (1u << 20)) - features |= SSE4_2; - if (feature_result & (1u << 28)) - features |= AVX; - - return features; -} - -#elif defined(__ia64__) -static inline uint detectProcessorFeatures() -{ - return MMX|SSE|SSE2; -} - -#else -static inline uint detectProcessorFeatures() -{ - return 0; -} -#endif - -/* - * Use kdesdk/scripts/generate_string_table.pl to update the table below. - * Here's the data (don't forget the ONE leading space): - mmx - mmxext - mmx3dnow - mmx3dnowext - sse - sse2 - cmov - iwmmxt - neon - sse3 - ssse3 - sse4.1 - sse4.2 - avx - */ - -// begin generated -static const char features_string[] = - " mmx\0" - " mmxext\0" - " mmx3dnow\0" - " mmx3dnowext\0" - " sse\0" - " sse2\0" - " cmov\0" - " iwmmxt\0" - " neon\0" - " sse3\0" - " ssse3\0" - " sse4.1\0" - " sse4.2\0" - " avx\0" - "\0"; - -static const int features_indices[] = { - 0, 5, 13, 23, 36, 41, 47, 53, - 61, 67, 73, 80, 88, 96, -1 -}; -// end generated - -const int features_count = (sizeof features_indices - 1) / (sizeof features_indices[0]); - -uint qDetectCPUFeatures() -{ - static QBasicAtomicInt features = Q_BASIC_ATOMIC_INITIALIZER(-1); - if (features != -1) - return features; - - uint f = detectProcessorFeatures(); - QByteArray disable = qgetenv("QT_NO_CPU_FEATURE"); - if (disable == "all") { - f = 0; - } else if (!disable.isEmpty()) { - disable.prepend(' '); - for (int i = 0; i < features_count; ++i) { - if (disable.contains(features_string + features_indices[i])) - f &= ~(1 << i); - } - } - - features = f; - return features; -} - -void qDumpCPUFeatures() -{ - uint features = qDetectCPUFeatures(); - printf("Processor features: "); - for (int i = 0; i < features_count; ++i) { - if (features & (1 << i)) - printf("%s", features_string + features_indices[i]); - } - puts(""); -} - -QT_END_NAMESPACE diff --git a/src/core/tools/qsimd_p.h b/src/core/tools/qsimd_p.h deleted file mode 100644 index fbc34c108..000000000 --- a/src/core/tools/qsimd_p.h +++ /dev/null @@ -1,237 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2015 The Qt Company Ltd. -** Contact: http://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see http://www.qt.io/terms-conditions. For further -** information use the contact form at http://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 2.1 or version 3 as published by the Free -** Software Foundation and appearing in the file LICENSE.LGPLv21 and -** LICENSE.LGPLv3 included in the packaging of this file. Please review the -** following information to ensure the GNU Lesser General Public License -** requirements will be met: https://www.gnu.org/licenses/lgpl.html and -** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. -** -** As a special exception, The Qt Company gives you certain additional -** rights. These rights are described in The Qt Company LGPL Exception -** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 3.0 as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL included in the -** packaging of this file. Please review the following information to -** ensure the GNU General Public License version 3.0 requirements will be -** met: http://www.gnu.org/copyleft/gpl.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -#ifndef QSIMD_P_H -#define QSIMD_P_H - -#include - - -QT_BEGIN_HEADER - - -#if defined(QT_NO_MAC_XARCH) || (defined(Q_OS_DARWIN) && (defined(__ppc__) || defined(__ppc64__))) -// Disable MMX and SSE on Mac/PPC builds, or if the compiler -// does not support -Xarch argument passing -#undef QT_HAVE_SSE -#undef QT_HAVE_SSE2 -#undef QT_HAVE_SSE3 -#undef QT_HAVE_SSSE3 -#undef QT_HAVE_SSE4_1 -#undef QT_HAVE_SSE4_2 -#undef QT_HAVE_AVX -#undef QT_HAVE_3DNOW -#undef QT_HAVE_MMX -#endif - -// SSE intrinsics -#if defined(QT_HAVE_SSE2) && (defined(__SSE2__) || defined(Q_CC_MSVC)) -#if defined(QT_LINUXBASE) -/// this is an evil hack - the posix_memalign declaration in LSB -/// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431 -# define posix_memalign _lsb_hack_posix_memalign -# include -# undef posix_memalign -#else -# ifdef Q_CC_MINGW -# include -# endif -# include -#endif - -// SSE3 intrinsics -#if defined(QT_HAVE_SSE3) && (defined(__SSE3__) || defined(Q_CC_MSVC)) -#include -#endif - -// SSSE3 intrinsics -#if defined(QT_HAVE_SSSE3) && (defined(__SSSE3__) || defined(Q_CC_MSVC)) -#include -#endif - -// SSE4.1 intrinsics -#if defined(QT_HAVE_SSE4_1) && (defined(__SSE4_1__) || defined(Q_CC_MSVC)) -#include -#endif - -// SSE4.2 intrinsics -#if defined(QT_HAVE_SSE4_2) && (defined(__SSE4_2__) || defined(Q_CC_MSVC)) -#include - -// Add missing intrisics in some compilers (e.g. llvm-gcc) -#ifndef _SIDD_UBYTE_OPS -#define _SIDD_UBYTE_OPS 0x00 -#endif - -#ifndef _SIDD_UWORD_OPS -#define _SIDD_UWORD_OPS 0x01 -#endif - -#ifndef _SIDD_SBYTE_OPS -#define _SIDD_SBYTE_OPS 0x02 -#endif - -#ifndef _SIDD_SWORD_OPS -#define _SIDD_SWORD_OPS 0x03 -#endif - -#ifndef _SIDD_CMP_EQUAL_ANY -#define _SIDD_CMP_EQUAL_ANY 0x00 -#endif - -#ifndef _SIDD_CMP_RANGES -#define _SIDD_CMP_RANGES 0x04 -#endif - -#ifndef _SIDD_CMP_EQUAL_EACH -#define _SIDD_CMP_EQUAL_EACH 0x08 -#endif - -#ifndef _SIDD_CMP_EQUAL_ORDERED -#define _SIDD_CMP_EQUAL_ORDERED 0x0c -#endif - -#ifndef _SIDD_POSITIVE_POLARITY -#define _SIDD_POSITIVE_POLARITY 0x00 -#endif - -#ifndef _SIDD_NEGATIVE_POLARITY -#define _SIDD_NEGATIVE_POLARITY 0x10 -#endif - -#ifndef _SIDD_MASKED_POSITIVE_POLARITY -#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 -#endif - -#ifndef _SIDD_MASKED_NEGATIVE_POLARITY -#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 -#endif - -#ifndef _SIDD_LEAST_SIGNIFICANT -#define _SIDD_LEAST_SIGNIFICANT 0x00 -#endif - -#ifndef _SIDD_MOST_SIGNIFICANT -#define _SIDD_MOST_SIGNIFICANT 0x40 -#endif - -#ifndef _SIDD_BIT_MASK -#define _SIDD_BIT_MASK 0x00 -#endif - -#ifndef _SIDD_UNIT_MASK -#define _SIDD_UNIT_MASK 0x40 -#endif - -#endif - -// AVX intrinsics -#if defined(QT_HAVE_AVX) && (defined(__AVX__) || defined(Q_CC_MSVC)) -#include -#endif - - -#if !defined(QT_BOOTSTRAPPED) && (!defined(Q_CC_MSVC) || (defined(_M_X64) || _M_IX86_FP == 2)) -#define QT_ALWAYS_HAVE_SSE2 -#endif -#endif // defined(QT_HAVE_SSE2) && (defined(__SSE2__) || defined(Q_CC_MSVC)) - -// NEON intrinsics -#if defined __ARM_NEON__ -#define QT_ALWAYS_HAVE_NEON -#include -#endif - - -// IWMMXT intrinsics -#if defined(QT_HAVE_IWMMXT) -#include -#if defined(Q_OS_WINCE) -# include "qplatformdefs.h" -#endif -#endif - -#if defined(QT_HAVE_IWMMXT) -#if !defined(__IWMMXT__) && !defined(Q_OS_WINCE) -# include -#elif defined(Q_OS_WINCE_STD) && defined(_X86_) -# pragma warning(disable: 4391) -# include -#endif -#endif - -// 3D now intrinsics -#if defined(QT_HAVE_3DNOW) && (defined(__3dNOW__) || defined(Q_CC_MSVC)) -#include -#endif - -QT_BEGIN_NAMESPACE - - -enum CPUFeatures { - None = 0, - MMX = 0x1, - MMXEXT = 0x2, - MMX3DNOW = 0x4, - MMX3DNOWEXT = 0x8, - SSE = 0x10, - SSE2 = 0x20, - CMOV = 0x40, - IWMMXT = 0x80, - NEON = 0x100, - SSE3 = 0x200, - SSSE3 = 0x400, - SSE4_1 = 0x800, - SSE4_2 = 0x1000, - AVX = 0x2000 -}; - -Q_CORE_EXPORT uint qDetectCPUFeatures(); - - -#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ - for (; i < static_cast(qMin(static_cast(length), ((4 - ((reinterpret_cast(ptr) >> 2) & 0x3)) & 0x3))); ++i) - -QT_END_NAMESPACE - -QT_END_HEADER - -#endif // QSIMD_P_H diff --git a/src/core/tools/qstring.cpp b/src/core/tools/qstring.cpp index e74cae81a..a81acfb99 100644 --- a/src/core/tools/qstring.cpp +++ b/src/core/tools/qstring.cpp @@ -46,7 +46,6 @@ #include #endif #include -#include "qsimd_p.h" #include #include #include "qlocale.h" @@ -3535,61 +3534,6 @@ bool QString::endsWith(const QChar &c, Qt::CaseSensitivity cs) const Use toLocal8Bit() instead. */ -#if defined(QT_ALWAYS_HAVE_SSE2) -static inline __m128i mergeQuestionMarks(__m128i chunk) -{ - const __m128i questionMark = _mm_set1_epi16('?'); - -# ifdef __SSE4_2__ - // compare the unsigned shorts for the range 0x0100-0xFFFF - // note on the use of _mm_cmpestrm: - // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx) - // says for range search the following: - // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3 - // - // However, all examples on the Internet, including from Intel - // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/) - // put the range to be searched first - // - // Disassembly and instruction-level debugging with GCC and ICC show - // that they are doing the right thing. Inverting the arguments in the - // instruction does cause a bunch of test failures. - - const int mode = _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK; - const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100); - const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, mode); - - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // SSE has no compare instruction for unsigned comparison. - // The variables must be shiffted + 0x8000 to be compared - const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); - const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); - - const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); - const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); - -# ifdef __SSE4_1__ - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // offLimitQuestionMark contains '?' for each 16 bits that was off-limit - // the 16 bits that were correct contains zeros - const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); - - // correctBytes contains the bytes that were in limit - // the 16 bits that were off limits contains zeros - const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); - - // merge offLimitQuestionMark and correctBytes to have the result - chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); -# endif -# endif - return chunk; -} -#endif - static QByteArray toLatin1_helper(const QChar *data, int length) { QByteArray ba; @@ -3597,51 +3541,6 @@ static QByteArray toLatin1_helper(const QChar *data, int length) ba.resize(length); const ushort *src = reinterpret_cast(data); uchar *dst = (uchar*) ba.data(); -#if defined(QT_ALWAYS_HAVE_SSE2) - if (length >= 16) { - const int chunkCount = length >> 4; // divided by 16 - - for (int i = 0; i < chunkCount; ++i) { - __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load - chunk1 = mergeQuestionMarks(chunk1); - src += 8; - - __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load - chunk2 = mergeQuestionMarks(chunk2); - src += 8; - - // pack the two vector to 16 x 8bits elements - const __m128i result = _mm_packus_epi16(chunk1, chunk2); - - _mm_storeu_si128((__m128i*)dst, result); // store - dst += 16; - } - length = length % 16; - } -#elif defined(QT_ALWAYS_HAVE_NEON) - // Refer to the documentation of the SSE2 implementation - // this use eactly the same method as for SSE except: - // 1) neon has unsigned comparison - // 2) packing is done to 64 bits (8 x 8bits component). - if (length >= 16) { - const int chunkCount = length >> 3; // divided by 8 - const uint16x8_t questionMark = vdupq_n_u16('?'); // set - const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set - for (int i = 0; i < chunkCount; ++i) { - uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load - src += 8; - - const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask - const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark - const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk - chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark - const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing - vst1_u8(dst, result); // store - dst += 8; - } - length = length % 8; - } -#endif while (length--) { *dst++ = (*src>0xff) ? '?' : (uchar) *src; ++src; @@ -3783,31 +3682,6 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size) d->data = d->array; d->array[size] = '\0'; ushort *dst = d->data; - /* SIMD: - * Unpacking with SSE has been shown to improve performance on recent CPUs - * The same method gives no improvement with NEON. - */ -#if defined(QT_ALWAYS_HAVE_SSE2) - if (size >= 16) { - int chunkCount = size >> 4; // divided by 16 - const __m128i nullMask = _mm_set1_epi32(0); - for (int i = 0; i < chunkCount; ++i) { - const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load - str += 16; - - // unpack the first 8 bytes, padding with zeros - const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); - _mm_storeu_si128((__m128i*)dst, firstHalf); // store - dst += 8; - - // unpack the last 8 bytes, padding with zeros - const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); - _mm_storeu_si128((__m128i*)dst, secondHalf); // store - dst += 8; - } - size = size % 16; - } -#endif while (size--) *dst++ = (uchar)*str++; } diff --git a/src/core/tools/tools.cmake b/src/core/tools/tools.cmake index 0b0335e64..bfc4c8d9a 100644 --- a/src/core/tools/tools.cmake +++ b/src/core/tools/tools.cmake @@ -37,7 +37,6 @@ set(CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsharedpointer.h ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsharedpointer_impl.h ${CMAKE_CURRENT_SOURCE_DIR}/tools/qset.h - ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsimd_p.h ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsize.h ${CMAKE_CURRENT_SOURCE_DIR}/tools/qstack.h ${CMAKE_CURRENT_SOURCE_DIR}/tools/qstring.h @@ -80,7 +79,6 @@ set(CORE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/tools/qregexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/qshareddata.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsharedpointer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsimd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/qsize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/qstring.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/qstringbuilder.cpp diff --git a/src/gui/image/qimage.cpp b/src/gui/image/qimage.cpp index 001e5e5d7..2da71df69 100644 --- a/src/gui/image/qimage.cpp +++ b/src/gui/image/qimage.cpp @@ -57,7 +57,6 @@ #include #include #include -#include #include diff --git a/src/gui/image/qjpeghandler.cpp b/src/gui/image/qjpeghandler.cpp index cda6c0df2..a7f97896a 100644 --- a/src/gui/image/qjpeghandler.cpp +++ b/src/gui/image/qjpeghandler.cpp @@ -45,7 +45,6 @@ #include #include #include -#include #include // jpeglib needs this to be pre-included #include diff --git a/src/gui/image/qpixmap_raster.cpp b/src/gui/image/qpixmap_raster.cpp index e2c1db911..8e5c5b114 100644 --- a/src/gui/image/qpixmap_raster.cpp +++ b/src/gui/image/qpixmap_raster.cpp @@ -40,22 +40,18 @@ ****************************************************************************/ #include "qpixmap.h" - -#include - +#include "qfont_p.h" #include "qpixmap_raster_p.h" #include "qnativeimage_p.h" #include "qimage_p.h" #include "qpaintengine.h" - #include "qbitmap.h" #include "qimage.h" -#include -#include -#include -#include -#include -#include +#include "qbuffer.h" +#include "qimagereader.h" +#include "qimage_p.h" +#include "qwidget_p.h" +#include "qdrawhelper_p.h" QT_BEGIN_NAMESPACE @@ -163,7 +159,6 @@ void QRasterPixmapData::fill(const QColor &color) if (alpha != 255) { if (!image.hasAlphaChannel()) { QImage::Format toFormat; -#if !(defined(QT_HAVE_NEON) || defined(QT_ALWAYS_HAVE_SSE2)) if (image.format() == QImage::Format_RGB16) toFormat = QImage::Format_ARGB8565_Premultiplied; else if (image.format() == QImage::Format_RGB666) @@ -173,7 +168,6 @@ void QRasterPixmapData::fill(const QColor &color) else if (image.format() == QImage::Format_RGB444) toFormat = QImage::Format_ARGB4444_Premultiplied; else -#endif toFormat = QImage::Format_ARGB32_Premultiplied; if (!image.isNull() && qt_depthForFormat(image.format()) == qt_depthForFormat(toFormat)) { @@ -364,7 +358,6 @@ void QRasterPixmapData::createPixmapForImage(QImage &sourceImage, Qt::ImageConve QImage::Format opaqueFormat = QNativeImage::systemFormat(); QImage::Format alphaFormat = QImage::Format_ARGB32_Premultiplied; -#if !defined(QT_HAVE_NEON) && !defined(QT_ALWAYS_HAVE_SSE2) switch (opaqueFormat) { case QImage::Format_RGB16: alphaFormat = QImage::Format_ARGB8565_Premultiplied; @@ -372,7 +365,6 @@ void QRasterPixmapData::createPixmapForImage(QImage &sourceImage, Qt::ImageConve default: // We don't care about the others... break; } -#endif if (!sourceImage.hasAlphaChannel()) { format = opaqueFormat; diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index e54a45588..9c33cc019 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -625,76 +625,6 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00); } -#if defined(QT_ALWAYS_HAVE_SSE2) -#define interpolate_4_pixels_16_sse2(tl, tr, bl, br, distx, disty, colorMask, v_256, b) \ -{ \ - const __m128i dxdy = _mm_mullo_epi16 (distx, disty); \ - const __m128i distx_ = _mm_slli_epi16(distx, 4); \ - const __m128i disty_ = _mm_slli_epi16(disty, 4); \ - const __m128i idxidy = _mm_add_epi16(dxdy, _mm_sub_epi16(v_256, _mm_add_epi16(distx_, disty_))); \ - const __m128i dxidy = _mm_sub_epi16(distx_, dxdy); \ - const __m128i idxdy = _mm_sub_epi16(disty_, dxdy); \ - \ - __m128i tlAG = _mm_srli_epi16(tl, 8); \ - __m128i tlRB = _mm_and_si128(tl, colorMask); \ - __m128i trAG = _mm_srli_epi16(tr, 8); \ - __m128i trRB = _mm_and_si128(tr, colorMask); \ - __m128i blAG = _mm_srli_epi16(bl, 8); \ - __m128i blRB = _mm_and_si128(bl, colorMask); \ - __m128i brAG = _mm_srli_epi16(br, 8); \ - __m128i brRB = _mm_and_si128(br, colorMask); \ - \ - tlAG = _mm_mullo_epi16(tlAG, idxidy); \ - tlRB = _mm_mullo_epi16(tlRB, idxidy); \ - trAG = _mm_mullo_epi16(trAG, dxidy); \ - trRB = _mm_mullo_epi16(trRB, dxidy); \ - blAG = _mm_mullo_epi16(blAG, idxdy); \ - blRB = _mm_mullo_epi16(blRB, idxdy); \ - brAG = _mm_mullo_epi16(brAG, dxdy); \ - brRB = _mm_mullo_epi16(brRB, dxdy); \ - \ - /* Add the values, and shift to only keep 8 significant bits per colors */ \ - __m128i rAG =_mm_add_epi16(_mm_add_epi16(tlAG, trAG), _mm_add_epi16(blAG, brAG)); \ - __m128i rRB =_mm_add_epi16(_mm_add_epi16(tlRB, trRB), _mm_add_epi16(blRB, brRB)); \ - rAG = _mm_andnot_si128(colorMask, rAG); \ - rRB = _mm_srli_epi16(rRB, 8); \ - _mm_storeu_si128((__m128i*)(b), _mm_or_si128(rAG, rRB)); \ -} -#endif - -#if defined(QT_ALWAYS_HAVE_NEON) -#define interpolate_4_pixels_16_neon(tl, tr, bl, br, distx, disty, disty_, colorMask, invColorMask, v_256, b) \ -{ \ - const int16x8_t dxdy = vmulq_s16(distx, disty); \ - const int16x8_t distx_ = vshlq_n_s16(distx, 4); \ - const int16x8_t idxidy = vaddq_s16(dxdy, vsubq_s16(v_256, vaddq_s16(distx_, disty_))); \ - const int16x8_t dxidy = vsubq_s16(distx_, dxdy); \ - const int16x8_t idxdy = vsubq_s16(disty_, dxdy); \ - \ - int16x8_t tlAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(tl), 8)); \ - int16x8_t tlRB = vandq_s16(tl, colorMask); \ - int16x8_t trAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(tr), 8)); \ - int16x8_t trRB = vandq_s16(tr, colorMask); \ - int16x8_t blAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bl), 8)); \ - int16x8_t blRB = vandq_s16(bl, colorMask); \ - int16x8_t brAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(br), 8)); \ - int16x8_t brRB = vandq_s16(br, colorMask); \ - \ - int16x8_t rAG = vmulq_s16(tlAG, idxidy); \ - int16x8_t rRB = vmulq_s16(tlRB, idxidy); \ - rAG = vmlaq_s16(rAG, trAG, dxidy); \ - rRB = vmlaq_s16(rRB, trRB, dxidy); \ - rAG = vmlaq_s16(rAG, blAG, idxdy); \ - rRB = vmlaq_s16(rRB, blRB, idxdy); \ - rAG = vmlaq_s16(rAG, brAG, dxdy); \ - rRB = vmlaq_s16(rRB, brRB, dxdy); \ - \ - rAG = vandq_s16(invColorMask, rAG); \ - rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8)); \ - vst1q_s16((int16_t*)(b), vorrq_s16(rAG, rRB)); \ -} -#endif - template Q_STATIC_TEMPLATE_FUNCTION inline void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2) { @@ -801,70 +731,6 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator * } } - if (blendType != BlendTransformedBilinearTiled && - (format == QImage::Format_ARGB32_Premultiplied || format == QImage::Format_RGB32)) { -#if defined(QT_ALWAYS_HAVE_SSE2) - const __m128i disty_ = _mm_set1_epi16(disty); - const __m128i idisty_ = _mm_set1_epi16(idisty); - const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - - lim -= 3; - for (; f < lim; x += 4, f += 4) { - // Load 4 pixels from s1, and split the alpha-green and red-blue component - __m128i top = _mm_loadu_si128((__m128i*)((const uint *)(s1)+x)); - __m128i topAG = _mm_srli_epi16(top, 8); - __m128i topRB = _mm_and_si128(top, colorMask); - // Multiplies each colour component by idisty - topAG = _mm_mullo_epi16 (topAG, idisty_); - topRB = _mm_mullo_epi16 (topRB, idisty_); - - // Same for the s2 vector - __m128i bottom = _mm_loadu_si128((__m128i*)((const uint *)(s2)+x)); - __m128i bottomAG = _mm_srli_epi16(bottom, 8); - __m128i bottomRB = _mm_and_si128(bottom, colorMask); - bottomAG = _mm_mullo_epi16 (bottomAG, disty_); - bottomRB = _mm_mullo_epi16 (bottomRB, disty_); - - // Add the values, and shift to only keep 8 significant bits per colors - __m128i rAG =_mm_add_epi16(topAG, bottomAG); - rAG = _mm_srli_epi16(rAG, 8); - _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG); - __m128i rRB =_mm_add_epi16(topRB, bottomRB); - rRB = _mm_srli_epi16(rRB, 8); - _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB); - } -#elif defined(QT_ALWAYS_HAVE_NEON) - const int16x8_t disty_ = vdupq_n_s16(disty); - const int16x8_t idisty_ = vdupq_n_s16(idisty); - const int16x8_t colorMask = vdupq_n_s16(0x00ff); - - lim -= 3; - for (; f < lim; x += 4, f += 4) { - // Load 4 pixels from s1, and split the alpha-green and red-blue component - int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x)); - int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8)); - int16x8_t topRB = vandq_s16(top, colorMask); - // Multiplies each colour component by idisty - topAG = vmulq_s16(topAG, idisty_); - topRB = vmulq_s16(topRB, idisty_); - - // Same for the s2 vector - int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x)); - int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8)); - int16x8_t bottomRB = vandq_s16(bottom, colorMask); - bottomAG = vmulq_s16(bottomAG, disty_); - bottomRB = vmulq_s16(bottomRB, disty_); - - // Add the values, and shift to only keep 8 significant bits per colors - int16x8_t rAG = vaddq_s16(topAG, bottomAG); - rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8)); - vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG); - int16x8_t rRB = vaddq_s16(topRB, bottomRB); - rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8)); - vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB); - } -#endif - } for (; f < count; f++) { // Same as above but without sse2 if (blendType == BlendTransformedBilinearTiled) { if (x >= image_width) x -= image_width; @@ -931,123 +797,6 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator * const uchar *s2 = data->texture.scanLine(y2); int disty = (fy & 0x0000ffff) >> 12; - if (blendType != BlendTransformedBilinearTiled && - (format == QImage::Format_ARGB32_Premultiplied || format == QImage::Format_RGB32)) { - -#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \ - while (b < end) { \ - int x1 = (fx >> 16); \ - int x2; \ - fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); \ - if (x1 != x2) \ - break; \ - uint tl = fetch(s1, x1, data->texture.colorTable); \ - uint tr = fetch(s1, x2, data->texture.colorTable); \ - uint bl = fetch(s2, x1, data->texture.colorTable); \ - uint br = fetch(s2, x2, data->texture.colorTable); \ - int distx = (fx & 0x0000ffff) >> 12; \ - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \ - fx += fdx; \ - ++b; \ - } \ - uint *boundedEnd; \ - if (fdx > 0) \ - boundedEnd = qMin(end, buffer + uint((image_x2 - (fx >> 16)) / data->m11)); \ - else \ - boundedEnd = qMin(end, buffer + uint((image_x1 - (fx >> 16)) / data->m11)); \ - boundedEnd -= 3; - -#if defined(QT_ALWAYS_HAVE_SSE2) - BILINEAR_DOWNSCALE_BOUNDS_PROLOG - - const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - const __m128i v_256 = _mm_set1_epi16(256); - const __m128i v_disty = _mm_set1_epi16(disty); - __m128i v_fdx = _mm_set1_epi32(fdx*4); - - ptrdiff_t secondLine = reinterpret_cast(s2) - reinterpret_cast(s1); - - union Vect_buffer { __m128i vect; quint32 i[4]; }; - Vect_buffer v_fx; - - for (int i = 0; i < 4; i++) { - v_fx.i[i] = fx; - fx += fdx; - } - - while (b < boundedEnd) { - - Vect_buffer tl, tr, bl, br; - - for (int i = 0; i < 4; i++) { - int x1 = v_fx.i[i] >> 16; - const uint *addr_tl = reinterpret_cast(s1) + x1; - const uint *addr_tr = addr_tl + 1; - tl.i[i] = *addr_tl; - tr.i[i] = *addr_tr; - bl.i[i] = *(addr_tl+secondLine); - br.i[i] = *(addr_tr+secondLine); - } - __m128i v_distx = _mm_srli_epi16(v_fx.vect, 12); - v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - - interpolate_4_pixels_16_sse2(tl.vect, tr.vect, bl.vect, br.vect, v_distx, v_disty, colorMask, v_256, b); - b+=4; - v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx); - } - fx = v_fx.i[0]; -#elif defined(QT_ALWAYS_HAVE_NEON) - BILINEAR_DOWNSCALE_BOUNDS_PROLOG - - const int16x8_t colorMask = vdupq_n_s16(0x00ff); - const int16x8_t invColorMask = vmvnq_s16(colorMask); - const int16x8_t v_256 = vdupq_n_s16(256); - const int16x8_t v_disty = vdupq_n_s16(disty); - const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4); - int32x4_t v_fdx = vdupq_n_s32(fdx*4); - - ptrdiff_t secondLine = reinterpret_cast(s2) - reinterpret_cast(s1); - - union Vect_buffer { int32x4_t vect; quint32 i[4]; }; - Vect_buffer v_fx; - - for (int i = 0; i < 4; i++) { - v_fx.i[i] = fx; - fx += fdx; - } - - const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); - - while (b < boundedEnd) { - - Vect_buffer tl, tr, bl, br; - - Vect_buffer v_fx_shifted; - v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16); - - int32x4_t v_distx = vshrq_n_s32(vandq_s32(v_fx.vect, v_ffff_mask), 12); - - for (int i = 0; i < 4; i++) { - int x1 = v_fx_shifted.i[i]; - const uint *addr_tl = reinterpret_cast(s1) + x1; - const uint *addr_tr = addr_tl + 1; - tl.i[i] = *addr_tl; - tr.i[i] = *addr_tr; - bl.i[i] = *(addr_tl+secondLine); - br.i[i] = *(addr_tr+secondLine); - } - - v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); - - interpolate_4_pixels_16_neon(vreinterpretq_s16_s32(tl.vect), vreinterpretq_s16_s32(tr.vect), vreinterpretq_s16_s32(bl.vect), vreinterpretq_s16_s32(br.vect), vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, colorMask, invColorMask, v_256, b); - b+=4; - v_fx.vect = vaddq_s32(v_fx.vect, v_fdx); - } - fx = v_fx.i[0]; -#endif - } - while (b < end) { int x1 = (fx >> 16); int x2; diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index c758ba55f..9c535dca2 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -62,7 +62,6 @@ #define QT_FT_END_HEADER #endif #include "qrasterdefs_p.h" -#include #include QT_BEGIN_NAMESPACE -- 2.11.0