From 45e10e5c8d3df09c80a4d80483bff2712367f3fa Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Mon, 5 Aug 2013 13:12:48 +0100 Subject: [PATCH] arm: Add assembly version of h264_find_start_code_candidate MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Before After Mean StdDev Mean StdDev Change This function 508.8 23.4 185.4 9.0 +174.4% Overall 3068.5 31.7 2752.1 29.4 +11.5% In combination with the preceding patch: Before After Mean StdDev Mean StdDev Change Overall 2925.6 26.2 2752.1 29.4 +6.3% Signed-off-by: Martin Storsjö --- libavcodec/arm/Makefile | 1 + libavcodec/arm/h264dsp_armv6.S | 253 ++++++++++++++++++++++++++++++++++++++ libavcodec/arm/h264dsp_init_arm.c | 4 + 3 files changed, 258 insertions(+) create mode 100644 libavcodec/arm/h264dsp_armv6.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index e941aaa806..9c64b361f1 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -45,6 +45,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \ arm/simple_idct_armv6.o \ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o +ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ arm/hpeldsp_armv6.o ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S new file mode 100644 index 0000000000..c4f12a6375 --- /dev/null +++ b/libavcodec/arm/h264dsp_armv6.S @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * Author: Ben Avison + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +RESULT .req a1 +BUF .req a1 /* RESULT and BUF both name a1: the buffer start is only available in a1 until a result is written there */ +SIZE .req a2 +PATTERN .req a3 +PTR .req a4 +DAT0 .req v1 +DAT1 .req v2 +DAT2 .req v3 +DAT3 .req v4 +TMP0 .req v5 +TMP1 .req v6 +TMP2 .req ip +TMP3 .req lr /* lr is used as scratch; the entry push saves it and the exit pop loads that saved value into pc */ + +#define PRELOAD_DISTANCE 4 /* always used multiplied by 32, i.e. a count of 32-byte (8-word) cachelines */ + +.macro innerloop4 /* Test one word: post-increment load into DAT0, subtract 4 from SIZE, then TMP0 = (DAT0 - (PATTERN lsr 14)) AND NOT DAT0 AND PATTERN. Exits with ne when TMP0 is non-zero; callers branch to 91 on ne. */ + ldr DAT0, [PTR], #4 + subs SIZE, SIZE, #4 @ C flag survives rest of macro + sub TMP0, DAT0, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + ands TMP0, TMP0, PATTERN +.endm + +.macro innerloop16 decrement, do_preload /* Same test applied to four words loaded with one ldmia. Optional arguments: decrement = number of bytes to subtract from SIZE (omitted: SIZE untouched); do_preload = any non-empty text issues a pld PRELOAD_DISTANCE*32 bytes beyond the already advanced PTR. The andseq chain only runs while every earlier word tested zero, so on an ne exit the TMPn of the words after the first non-zero one have not been masked with PATTERN. */ + ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} + .ifnc "\do_preload","" + pld [PTR, #PRELOAD_DISTANCE*32] + .endif + .ifnc "\decrement","" + subs SIZE, SIZE, #\decrement @ C flag survives rest of macro + .endif + sub TMP0, DAT0, PATTERN, lsr #14 + sub TMP1, DAT1, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + bic TMP1, TMP1, DAT1 + sub TMP2, DAT2, PATTERN, lsr #14 + sub TMP3, DAT3, PATTERN, lsr #14 + ands TMP0, TMP0, PATTERN + bic TMP2, TMP2, DAT2 + it eq + andseq TMP1, TMP1, PATTERN + bic TMP3, TMP3, DAT3 + itt eq + andseq TMP2, TMP2, PATTERN + andseq TMP3, TMP3, PATTERN +.endm + +/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) -- buf arrives in a1 (BUF) and size in a2 (SIZE); whatever is left in a1 (RESULT) is the return value. Sizes below (PRELOAD_DISTANCE+3)*32 - 1 (unsigned compare) take the byte-at-a-time loop at 60; larger ones align PTR bytewise, then wordwise, then scan 16 or 32 bytes per iteration with preloading, and finish with 16-byte, 4-byte and single-byte tails. Exit labels: 13 and the end of loop 61 = no candidate, return PTR - BUF; 90 = candidate is the byte just read, return PTR - BUF - 1; 91 = candidate inside the last word tested; 93 to 97 = candidate inside the last four words tested, narrowed down one word at a time using TMP0, TMP1, TMP2; 98 = setend le for the exits reached while big-endian loads were active; 99 = pop v1-v6 and return. NOTE(review): with size == 0 the path at 60 branches straight to 99 while a1 still holds buf, so the value returned is not an offset -- confirm that callers never pass 0. */ +function ff_h264_find_start_code_candidate_armv6, export=1 + push {v1-v6,lr} + mov PTR, BUF + @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go + @ before using code that does preloads + cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 + blo 60f + + @ Get to word-alignment, 1 byte at a time + tst PTR, #3 + beq 2f +1: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq 
DAT0, #0 + beq 90f + tst PTR, #3 + bne 1b +2: @ Get to 4-word alignment, 1 word at a time + ldr PATTERN, =0x80008000 /* PATTERN lsr 14 is 0x00020002: the macros subtract 2 from each halfword and keep only bit 15 of each halfword. Presumably this flags an aligned halfword whose value is 0 or 1 -- TODO confirm */ + setend be /* data loads are big-endian from here on; undone by setend le at 11 or 98 */ + tst PTR, #12 + beq 4f +3: innerloop4 + bne 91f + tst PTR, #12 + bne 3b +4: @ Get to cacheline (8-word) alignment + tst PTR, #16 + beq 5f + innerloop16 16 + bne 93f +5: @ Check complete cachelines, with preloading + @ We need to stop when there are still (PRELOAD_DISTANCE+1) + @ complete cachelines to go + sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 /* bias SIZE so that the carry from the subs inside innerloop16 32 terminates loop 6; the bias is added back (less 16) at 7 */ +6: innerloop16 , do_preload /* first 16 bytes of the cacheline: SIZE untouched, pld issued */ + bne 93f + innerloop16 32 /* second 16 bytes: subtracts 32 from SIZE to account for the whole cacheline */ + bne 93f + bcs 6b + @ Preload trailing part-cacheline, if any + tst SIZE, #31 + beq 7f + pld [PTR, #(PRELOAD_DISTANCE+1)*32] + @ Check remaining data without doing any more preloads. First + @ do in chunks of 4 words: +7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 + bmi 9f +8: innerloop16 16 + bne 93f + bcs 8b + @ Then in words: +9: adds SIZE, SIZE, #16 - 4 + bmi 11f +10: innerloop4 + bne 91f + bcs 10b +11: setend le + @ Check second byte of final halfword + ldrb DAT0, [PTR, #-1] + teq DAT0, #0 + beq 90f + @ Check any remaining bytes + tst SIZE, #3 + beq 13f +12: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + tst SIZE, #3 + bne 12b + @ No candidate found +13: sub RESULT, PTR, BUF + b 99f + +60: @ Small buffer - simply check by looping over bytes + subs SIZE, SIZE, #1 + bcc 99f /* NOTE(review): taken when SIZE was 0 on entry; RESULT (a1) has not been written on this path */ +61: ldrb DAT0, [PTR], #1 + subs SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + bcs 61b + @ No candidate found + sub RESULT, PTR, BUF + b 99f + +90: @ Found a candidate at the preceding byte + sub RESULT, PTR, BUF + sub RESULT, RESULT, #1 + b 99f + +91: @ Found a candidate somewhere in the preceding 4 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #4 + sub TMP0, DAT0, #0x20000 + bics TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + bpl 92f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-5] +92: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f + 
+93: @ Found a candidate somewhere in the preceding 16 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #16 + teq TMP0, #0 + beq 95f @ not in first 4 bytes + sub TMP0, DAT0, #0x20000 + bics TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-15] + addpl RESULT, RESULT, #2 + bpl 94f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-17] +94: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +95: add RESULT, RESULT, #4 + teq TMP1, #0 + beq 96f @ not in next 4 bytes + sub TMP1, DAT1, #0x20000 + bics TMP1, TMP1, DAT1 + itee mi + ldrbmi DAT0, [PTR, #-13] + ldrbpl DAT0, [PTR, #-11] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +96: add RESULT, RESULT, #4 + teq TMP2, #0 + beq 97f @ not in next 4 bytes + sub TMP2, DAT2, #0x20000 + bics TMP2, TMP2, DAT2 + itee mi + ldrbmi DAT0, [PTR, #-9] + ldrbpl DAT0, [PTR, #-7] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +97: add RESULT, RESULT, #4 + sub TMP3, DAT3, #0x20000 + bics TMP3, TMP3, DAT3 + itee mi + ldrbmi DAT0, [PTR, #-5] + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + @ drop through to 98f +98: setend le +99: pop {v1-v6,pc} +.endfunc + + .unreq RESULT + .unreq BUF + .unreq SIZE + .unreq PATTERN + .unreq PTR + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq TMP0 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index bb8b3b98b6..b206a1b3ba 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -24,6 +24,8 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/h264dsp.h" +int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); + void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int 
alpha, @@ -102,6 +104,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, { int cpu_flags = av_get_cpu_flags(); + if (have_armv6(cpu_flags)) + c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; if (have_neon(cpu_flags)) h264dsp_init_neon(c, bit_depth, chroma_format_idc); } -- 2.11.0