Update bundled libpng to version 1.6.37

This commit is contained in:
Admiral H. Curtiss
2020-12-12 16:57:24 +01:00
parent d312495934
commit 3d8736f9d7
35 changed files with 5783 additions and 1731 deletions

136
Externals/libpng/arm/arm_init.c vendored Normal file
View File

@ -0,0 +1,136 @@
/* arm_init.c - NEON optimised filter functions
*
* Copyright (c) 2018 Cosmin Truta
* Copyright (c) 2014,2016 Glenn Randers-Pehrson
* Written by Mans Rullgard, 2011.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
* called.
*/
#define _POSIX_SOURCE 1
#include "../pngpriv.h"
#ifdef PNG_READ_SUPPORTED
#if PNG_ARM_NEON_OPT > 0
#ifdef PNG_ARM_NEON_CHECK_SUPPORTED /* Do run-time checks */
/* WARNING: it is strongly recommended that you do not build libpng with
* run-time checks for CPU features if at all possible. In the case of the ARM
* NEON instructions there is no processor-specific way of detecting the
* presence of the required support, therefore run-time detection is extremely
* OS specific.
*
* You may set the macro PNG_ARM_NEON_FILE to the file name of file containing
* a fragment of C source code which defines the png_have_neon function. There
* are a number of implementations in contrib/arm-neon, but the only one that
* has partial support is contrib/arm-neon/linux.c - a generic Linux
* implementation which reads /proc/cpufino.
*/
#ifndef PNG_ARM_NEON_FILE
# ifdef __linux__
# define PNG_ARM_NEON_FILE "contrib/arm-neon/linux.c"
# endif
#endif
#ifdef PNG_ARM_NEON_FILE
#include <signal.h> /* for sig_atomic_t */
static int png_have_neon(png_structp png_ptr);
#include PNG_ARM_NEON_FILE
#else /* PNG_ARM_NEON_FILE */
# error "PNG_ARM_NEON_FILE undefined: no support for run-time ARM NEON checks"
#endif /* PNG_ARM_NEON_FILE */
#endif /* PNG_ARM_NEON_CHECK_SUPPORTED */
#ifndef PNG_ALIGNED_MEMORY_SUPPORTED
# error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED"
#endif
void
png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
{
/* The switch statement is compiled in for ARM_NEON_API, the call to
* png_have_neon is compiled in for ARM_NEON_CHECK. If both are defined
* the check is only performed if the API has not set the NEON option on
* or off explicitly. In this case the check controls what happens.
*
* If the CHECK is not compiled in and the option is UNSET the behavior prior
* to 1.6.7 was to use the NEON code - this was a bug caused by having the
* wrong order of the 'ON' and 'default' cases. UNSET now defaults to OFF,
* as documented in png.h
*/
png_debug(1, "in png_init_filter_functions_neon");
#ifdef PNG_ARM_NEON_API_SUPPORTED
switch ((pp->options >> PNG_ARM_NEON) & 3)
{
case PNG_OPTION_UNSET:
/* Allow the run-time check to execute if it has been enabled -
* thus both API and CHECK can be turned on. If it isn't supported
* this case will fall through to the 'default' below, which just
* returns.
*/
#endif /* PNG_ARM_NEON_API_SUPPORTED */
#ifdef PNG_ARM_NEON_CHECK_SUPPORTED
{
static volatile sig_atomic_t no_neon = -1; /* not checked */
if (no_neon < 0)
no_neon = !png_have_neon(pp);
if (no_neon)
return;
}
#ifdef PNG_ARM_NEON_API_SUPPORTED
break;
#endif
#endif /* PNG_ARM_NEON_CHECK_SUPPORTED */
#ifdef PNG_ARM_NEON_API_SUPPORTED
default: /* OFF or INVALID */
return;
case PNG_OPTION_ON:
/* Option turned on */
break;
}
#endif
/* IMPORTANT: any new external functions used here must be declared using
* PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the
* 'prefix' option to configure works:
*
* ./configure --with-libpng-prefix=foobar_
*
* Verify you have got this right by running the above command, doing a build
* and examining pngprefix.h; it must contain a #define for every external
* function you add. (Notice that this happens automatically for the
* initialization function.)
*/
pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
if (bpp == 3)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
png_read_filter_row_paeth3_neon;
}
else if (bpp == 4)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_neon;
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
png_read_filter_row_paeth4_neon;
}
}
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* READ */

253
Externals/libpng/arm/filter_neon.S vendored Normal file
View File

@ -0,0 +1,253 @@
/* filter_neon.S - NEON optimised filter functions
*
* Copyright (c) 2018 Cosmin Truta
* Copyright (c) 2014,2017 Glenn Randers-Pehrson
* Written by Mans Rullgard, 2011.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
/* This is required to get the symbol renames, which are #defines, and the
* definitions (or not) of PNG_ARM_NEON_OPT and PNG_ARM_NEON_IMPLEMENTATION.
*/
#define PNG_VERSION_INFO_ONLY
#include "../pngpriv.h"
#if (defined(__linux__) || defined(__FreeBSD__)) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif
#ifdef PNG_READ_SUPPORTED
/* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for
* ARM64). The code in arm/filter_neon_intrinsics.c supports ARM64, however it
* only works if -mfpu=neon is specified on the GCC command line. See pngpriv.h
* for the logic which sets PNG_USE_ARM_NEON_ASM:
*/
#if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */
#if PNG_ARM_NEON_OPT > 0
#ifdef __ELF__
# define ELF
#else
# define ELF @
#endif
.arch armv7-a
.fpu neon
.macro func name, export=0
.macro endfunc
ELF .size \name, . - \name
.endfunc
.purgem endfunc
.endm
.text
/* Explicitly specifying alignment here because some versions of
* GAS don't align code correctly. This is harmless in correctly
* written versions of GAS.
*/
.align 2
.if \export
.global \name
.endif
ELF .type \name, STT_FUNC
.func \name
\name:
.endm
func png_read_filter_row_sub4_neon, export=1
ldr r3, [r0, #4] @ rowbytes
vmov.i8 d3, #0
1:
vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128]
vadd.u8 d0, d3, d4
vadd.u8 d1, d0, d5
vadd.u8 d2, d1, d6
vadd.u8 d3, d2, d7
vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
subs r3, r3, #16
bgt 1b
bx lr
endfunc
func png_read_filter_row_sub3_neon, export=1
ldr r3, [r0, #4] @ rowbytes
vmov.i8 d3, #0
mov r0, r1
mov r2, #3
mov r12, #12
vld1.8 {q11}, [r0], r12
1:
vext.8 d5, d22, d23, #3
vadd.u8 d0, d3, d22
vext.8 d6, d22, d23, #6
vadd.u8 d1, d0, d5
vext.8 d7, d23, d23, #1
vld1.8 {q11}, [r0], r12
vst1.32 {d0[0]}, [r1,:32], r2
vadd.u8 d2, d1, d6
vst1.32 {d1[0]}, [r1], r2
vadd.u8 d3, d2, d7
vst1.32 {d2[0]}, [r1], r2
vst1.32 {d3[0]}, [r1], r2
subs r3, r3, #12
bgt 1b
bx lr
endfunc
func png_read_filter_row_up_neon, export=1
ldr r3, [r0, #4] @ rowbytes
1:
vld1.8 {q0}, [r1,:128]
vld1.8 {q1}, [r2,:128]!
vadd.u8 q0, q0, q1
vst1.8 {q0}, [r1,:128]!
subs r3, r3, #16
bgt 1b
bx lr
endfunc
func png_read_filter_row_avg4_neon, export=1
ldr r12, [r0, #4] @ rowbytes
vmov.i8 d3, #0
1:
vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128]
vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]!
vhadd.u8 d0, d3, d16
vadd.u8 d0, d0, d4
vhadd.u8 d1, d0, d17
vadd.u8 d1, d1, d5
vhadd.u8 d2, d1, d18
vadd.u8 d2, d2, d6
vhadd.u8 d3, d2, d19
vadd.u8 d3, d3, d7
vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
subs r12, r12, #16
bgt 1b
bx lr
endfunc
func png_read_filter_row_avg3_neon, export=1
push {r4,lr}
ldr r12, [r0, #4] @ rowbytes
vmov.i8 d3, #0
mov r0, r1
mov r4, #3
mov lr, #12
vld1.8 {q11}, [r0], lr
1:
vld1.8 {q10}, [r2], lr
vext.8 d5, d22, d23, #3
vhadd.u8 d0, d3, d20
vext.8 d17, d20, d21, #3
vadd.u8 d0, d0, d22
vext.8 d6, d22, d23, #6
vhadd.u8 d1, d0, d17
vext.8 d18, d20, d21, #6
vadd.u8 d1, d1, d5
vext.8 d7, d23, d23, #1
vld1.8 {q11}, [r0], lr
vst1.32 {d0[0]}, [r1,:32], r4
vhadd.u8 d2, d1, d18
vst1.32 {d1[0]}, [r1], r4
vext.8 d19, d21, d21, #1
vadd.u8 d2, d2, d6
vhadd.u8 d3, d2, d19
vst1.32 {d2[0]}, [r1], r4
vadd.u8 d3, d3, d7
vst1.32 {d3[0]}, [r1], r4
subs r12, r12, #12
bgt 1b
pop {r4,pc}
endfunc
.macro paeth rx, ra, rb, rc
vaddl.u8 q12, \ra, \rb @ a + b
vaddl.u8 q15, \rc, \rc @ 2*c
vabdl.u8 q13, \rb, \rc @ pa
vabdl.u8 q14, \ra, \rc @ pb
vabd.u16 q15, q12, q15 @ pc
vcle.u16 q12, q13, q14 @ pa <= pb
vcle.u16 q13, q13, q15 @ pa <= pc
vcle.u16 q14, q14, q15 @ pb <= pc
vand q12, q12, q13 @ pa <= pb && pa <= pc
vmovn.u16 d28, q14
vmovn.u16 \rx, q12
vbsl d28, \rb, \rc
vbsl \rx, \ra, d28
.endm
func png_read_filter_row_paeth4_neon, export=1
ldr r12, [r0, #4] @ rowbytes
vmov.i8 d3, #0
vmov.i8 d20, #0
1:
vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128]
vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]!
paeth d0, d3, d16, d20
vadd.u8 d0, d0, d4
paeth d1, d0, d17, d16
vadd.u8 d1, d1, d5
paeth d2, d1, d18, d17
vadd.u8 d2, d2, d6
paeth d3, d2, d19, d18
vmov d20, d19
vadd.u8 d3, d3, d7
vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
subs r12, r12, #16
bgt 1b
bx lr
endfunc
func png_read_filter_row_paeth3_neon, export=1
push {r4,lr}
ldr r12, [r0, #4] @ rowbytes
vmov.i8 d3, #0
vmov.i8 d4, #0
mov r0, r1
mov r4, #3
mov lr, #12
vld1.8 {q11}, [r0], lr
1:
vld1.8 {q10}, [r2], lr
paeth d0, d3, d20, d4
vext.8 d5, d22, d23, #3
vadd.u8 d0, d0, d22
vext.8 d17, d20, d21, #3
paeth d1, d0, d17, d20
vst1.32 {d0[0]}, [r1,:32], r4
vext.8 d6, d22, d23, #6
vadd.u8 d1, d1, d5
vext.8 d18, d20, d21, #6
paeth d2, d1, d18, d17
vext.8 d7, d23, d23, #1
vld1.8 {q11}, [r0], lr
vst1.32 {d1[0]}, [r1], r4
vadd.u8 d2, d2, d6
vext.8 d19, d21, d21, #1
paeth d3, d2, d19, d18
vst1.32 {d2[0]}, [r1], r4
vmov d4, d19
vadd.u8 d3, d3, d7
vst1.32 {d3[0]}, [r1], r4
subs r12, r12, #12
bgt 1b
pop {r4,pc}
endfunc
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */
#endif /* READ */

View File

@ -0,0 +1,402 @@
/* filter_neon_intrinsics.c - NEON optimised filter functions
*
* Copyright (c) 2018 Cosmin Truta
* Copyright (c) 2014,2016 Glenn Randers-Pehrson
* Written by James Yu <james.yu at linaro.org>, October 2013.
* Based on filter_neon.S, written by Mans Rullgard, 2011.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
#include "../pngpriv.h"
#ifdef PNG_READ_SUPPORTED
/* This code requires -mfpu=neon on the command line: */
#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
#if defined(_MSC_VER) && defined(_M_ARM64)
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
/* libpng row pointers are not necessarily aligned to any particular boundary,
* however this code will only work with appropriate alignment. arm/arm_init.c
* checks for this (and will not compile unless it is done). This code uses
* variants of png_aligncast to avoid compiler warnings.
*/
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
/* The following relies on a variable 'temp_pointer' being declared with type
* 'type'. This is written this way just to hide the GCC strict aliasing
* warning; note that the code is safe because there never is an alias between
* the input and output pointers.
*
* When compiling with MSVC ARM64, the png_ldr macro can't be passed directly
* to vst4_lane_u32, because of an internal compiler error inside MSVC.
* To avoid this compiler bug, we use a temporary variable (vdest_val) to store
* the result of png_ldr.
*/
#define png_ldr(type,pointer)\
(temp_pointer = png_ptr(type,pointer), *temp_pointer)
#if PNG_ARM_NEON_OPT > 0
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_bytep rp_stop = row + row_info->rowbytes;
png_const_bytep pp = prev_row;
png_debug(1, "in png_read_filter_row_up_neon");
for (; rp < rp_stop; rp += 16, pp += 16)
{
uint8x16_t qrp, qpp;
qrp = vld1q_u8(rp);
qpp = vld1q_u8(pp);
qrp = vaddq_u8(qrp, qpp);
vst1q_u8(rp, qrp);
}
}
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_bytep rp_stop = row + row_info->rowbytes;
uint8x16_t vtmp = vld1q_u8(rp);
uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
uint8x8x2_t vrp = *vrpt;
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
png_debug(1, "in png_read_filter_row_sub3_neon");
for (; rp < rp_stop;)
{
uint8x8_t vtmp1, vtmp2;
uint32x2_t *temp_pointer;
vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
vtmp = vld1q_u8(rp + 12);
vrpt = png_ptr(uint8x8x2_t, &vtmp);
vrp = *vrpt;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
rp += 3;
}
PNG_UNUSED(prev_row)
}
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_bytep rp_stop = row + row_info->rowbytes;
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
png_debug(1, "in png_read_filter_row_sub4_neon");
for (; rp < rp_stop; rp += 16)
{
uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
uint8x8x4_t vrp = *vrpt;
uint32x2x4_t *temp_pointer;
uint32x2x4_t vdest_val;
vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
vdest_val = png_ldr(uint32x2x4_t, &vdest);
vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
}
PNG_UNUSED(prev_row)
}
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_const_bytep pp = prev_row;
png_bytep rp_stop = row + row_info->rowbytes;
uint8x16_t vtmp;
uint8x8x2_t *vrpt;
uint8x8x2_t vrp;
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
vtmp = vld1q_u8(rp);
vrpt = png_ptr(uint8x8x2_t,&vtmp);
vrp = *vrpt;
png_debug(1, "in png_read_filter_row_avg3_neon");
for (; rp < rp_stop; pp += 12)
{
uint8x8_t vtmp1, vtmp2, vtmp3;
uint8x8x2_t *vppt;
uint8x8x2_t vpp;
uint32x2_t *temp_pointer;
vtmp = vld1q_u8(pp);
vppt = png_ptr(uint8x8x2_t,&vtmp);
vpp = *vppt;
vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
vtmp = vld1q_u8(rp + 12);
vrpt = png_ptr(uint8x8x2_t,&vtmp);
vrp = *vrpt;
vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
rp += 3;
}
}
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_bytep rp_stop = row + row_info->rowbytes;
png_const_bytep pp = prev_row;
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
png_debug(1, "in png_read_filter_row_avg4_neon");
for (; rp < rp_stop; rp += 16, pp += 16)
{
uint32x2x4_t vtmp;
uint8x8x4_t *vrpt, *vppt;
uint8x8x4_t vrp, vpp;
uint32x2x4_t *temp_pointer;
uint32x2x4_t vdest_val;
vtmp = vld4_u32(png_ptr(uint32_t,rp));
vrpt = png_ptr(uint8x8x4_t,&vtmp);
vrp = *vrpt;
vtmp = vld4_u32(png_ptrc(uint32_t,pp));
vppt = png_ptr(uint8x8x4_t,&vtmp);
vpp = *vppt;
vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
vdest_val = png_ldr(uint32x2x4_t, &vdest);
vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
}
}
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
uint8x8_t d, e;
uint16x8_t p1, pa, pb, pc;
p1 = vaddl_u8(a, b); /* a + b */
pc = vaddl_u8(c, c); /* c * 2 */
pa = vabdl_u8(b, c); /* pa */
pb = vabdl_u8(a, c); /* pb */
pc = vabdq_u16(p1, pc); /* pc */
p1 = vcleq_u16(pa, pb); /* pa <= pb */
pa = vcleq_u16(pa, pc); /* pa <= pc */
pb = vcleq_u16(pb, pc); /* pb <= pc */
p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
d = vmovn_u16(pb);
e = vmovn_u16(p1);
d = vbsl_u8(d, b, c);
e = vbsl_u8(e, a, d);
return e;
}
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_const_bytep pp = prev_row;
png_bytep rp_stop = row + row_info->rowbytes;
uint8x16_t vtmp;
uint8x8x2_t *vrpt;
uint8x8x2_t vrp;
uint8x8_t vlast = vdup_n_u8(0);
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
vtmp = vld1q_u8(rp);
vrpt = png_ptr(uint8x8x2_t,&vtmp);
vrp = *vrpt;
png_debug(1, "in png_read_filter_row_paeth3_neon");
for (; rp < rp_stop; pp += 12)
{
uint8x8x2_t *vppt;
uint8x8x2_t vpp;
uint8x8_t vtmp1, vtmp2, vtmp3;
uint32x2_t *temp_pointer;
vtmp = vld1q_u8(pp);
vppt = png_ptr(uint8x8x2_t,&vtmp);
vpp = *vppt;
vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
vtmp = vld1q_u8(rp + 12);
vrpt = png_ptr(uint8x8x2_t,&vtmp);
vrp = *vrpt;
vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
vlast = vtmp2;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
rp += 3;
vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
rp += 3;
}
}
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
{
png_bytep rp = row;
png_bytep rp_stop = row + row_info->rowbytes;
png_const_bytep pp = prev_row;
uint8x8_t vlast = vdup_n_u8(0);
uint8x8x4_t vdest;
vdest.val[3] = vdup_n_u8(0);
png_debug(1, "in png_read_filter_row_paeth4_neon");
for (; rp < rp_stop; rp += 16, pp += 16)
{
uint32x2x4_t vtmp;
uint8x8x4_t *vrpt, *vppt;
uint8x8x4_t vrp, vpp;
uint32x2x4_t *temp_pointer;
uint32x2x4_t vdest_val;
vtmp = vld4_u32(png_ptr(uint32_t,rp));
vrpt = png_ptr(uint8x8x4_t,&vtmp);
vrp = *vrpt;
vtmp = vld4_u32(png_ptrc(uint32_t,pp));
vppt = png_ptr(uint8x8x4_t,&vtmp);
vpp = *vppt;
vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
vlast = vpp.val[3];
vdest_val = png_ldr(uint32x2x4_t, &vdest);
vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
}
}
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */

View File

@ -0,0 +1,149 @@
/* palette_neon_intrinsics.c - NEON optimised palette expansion functions
*
* Copyright (c) 2018-2019 Cosmin Truta
* Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
* Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
*
* This code is released under the libpng license.
* For conditions of distribution and use, see the disclaimer
* and license in png.h
*/
#include "../pngpriv.h"
#if PNG_ARM_NEON_IMPLEMENTATION == 1
#if defined(_MSC_VER) && defined(_M_ARM64)
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
/* Build an RGBA8 palette from the separate RGB and alpha palettes. */
void
png_riffle_palette_neon(png_structrp png_ptr)
{
png_const_colorp palette = png_ptr->palette;
png_bytep riffled_palette = png_ptr->riffled_palette;
png_const_bytep trans_alpha = png_ptr->trans_alpha;
int num_trans = png_ptr->num_trans;
int i;
png_debug(1, "in png_riffle_palette_neon");
/* Initially black, opaque. */
uint8x16x4_t w = {{
vdupq_n_u8(0x00),
vdupq_n_u8(0x00),
vdupq_n_u8(0x00),
vdupq_n_u8(0xff),
}};
/* First, riffle the RGB colours into an RGBA8 palette.
* The alpha component is set to opaque for now.
*/
for (i = 0; i < 256; i += 16)
{
uint8x16x3_t v = vld3q_u8((png_const_bytep)(palette + i));
w.val[0] = v.val[0];
w.val[1] = v.val[1];
w.val[2] = v.val[2];
vst4q_u8(riffled_palette + (i << 2), w);
}
/* Fix up the missing transparency values. */
for (i = 0; i < num_trans; i++)
riffled_palette[(i << 2) + 3] = trans_alpha[i];
}
/* Expands a palettized row into RGBA8. */
int
png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
{
png_uint_32 row_width = row_info->width;
const png_uint_32 *riffled_palette =
(const png_uint_32 *)png_ptr->riffled_palette;
const png_int_32 pixels_per_chunk = 4;
int i;
png_debug(1, "in png_do_expand_palette_rgba8_neon");
if (row_width < pixels_per_chunk)
return 0;
/* This function originally gets the last byte of the output row.
* The NEON part writes forward from a given position, so we have
* to seek this back by 4 pixels x 4 bytes.
*/
*ddp = *ddp - ((pixels_per_chunk * sizeof(png_uint_32)) - 1);
for (i = 0; i < row_width; i += pixels_per_chunk)
{
uint32x4_t cur;
png_bytep sp = *ssp - i, dp = *ddp - (i << 2);
cur = vld1q_dup_u32 (riffled_palette + *(sp - 3));
cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1);
cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2);
cur = vld1q_lane_u32(riffled_palette + *(sp - 0), cur, 3);
vst1q_u32((void *)dp, cur);
}
if (i != row_width)
{
/* Remove the amount that wasn't processed. */
i -= pixels_per_chunk;
}
/* Decrement output pointers. */
*ssp = *ssp - i;
*ddp = *ddp - (i << 2);
return i;
}
/* Expands a palettized row into RGB8. */
int
png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
{
png_uint_32 row_width = row_info->width;
png_const_bytep palette = (png_const_bytep)png_ptr->palette;
const png_uint_32 pixels_per_chunk = 8;
int i;
png_debug(1, "in png_do_expand_palette_rgb8_neon");
if (row_width <= pixels_per_chunk)
return 0;
/* Seeking this back by 8 pixels x 3 bytes. */
*ddp = *ddp - ((pixels_per_chunk * sizeof(png_color)) - 1);
for (i = 0; i < row_width; i += pixels_per_chunk)
{
uint8x8x3_t cur;
png_bytep sp = *ssp - i, dp = *ddp - ((i << 1) + i);
cur = vld3_dup_u8(palette + sizeof(png_color) * (*(sp - 7)));
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 6)), cur, 1);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 5)), cur, 2);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 4)), cur, 3);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 3)), cur, 4);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 2)), cur, 5);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 1)), cur, 6);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 0)), cur, 7);
vst3_u8((void *)dp, cur);
}
if (i != row_width)
{
/* Remove the amount that wasn't processed. */
i -= pixels_per_chunk;
}
/* Decrement output pointers. */
*ssp = *ssp - i;
*ddp = *ddp - ((i << 1) + i);
return i;
}
#endif /* PNG_ARM_NEON_IMPLEMENTATION */