Lines 1-90
-; libFLAC - Free Lossless Audio Codec library
-; Copyright (C) 2004 Josh Coalson
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-;
-; - Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-;
-; - Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in the
-; documentation and/or other materials provided with the distribution.
-;
-; - Neither the name of the Xiph.org Foundation nor the names of its
-; contributors may be used to endorse or promote products derived from
-; this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
-; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2004 Josh Coalson
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 .text
 .align 2
-.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
-.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
+.globl FLAC__lpc_restore_signal_asm_ppc_altivec_16
+.globl FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
 
-_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
-; r3: residual[]
-; r4: data_len
-; r5: qlp_coeff[]
-; r6: order
-; r7: lp_quantization
-; r8: data[]
-
-; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
-; these is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
-; bps<=15 for mid-side coding, since that uses an extra bit)
-
-; these should be fast; the inner loop is unrolled (it takes no more than
-; 3*(order%4) instructions, all of which are arithmetic), and all of the
-; coefficients and all relevant history stay in registers, so the outer loop
-; has only one load from memory (the residual)
-
-; I have not yet run this through simg4, so there may be some avoidable stalls,
-; and there may be a somewhat more clever way to do the outer loop
-
-; the branch mechanism may prevent dynamic loading; I still need to examine
-; this issue, and there may be a more elegant method
-
+FLAC__lpc_restore_signal_asm_ppc_altivec_16:
+/* r3: residual[]
+ * r4: data_len
+ * r5: qlp_coeff[]
+ * r6: order
+ * r7: lp_quantization
+ * r8: data[]
+ *
+ * see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
+ * this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
+ * bps<=15 for mid-side coding, since that uses an extra bit)
+ *
+ * this should be fast; the inner loop is unrolled (it takes no more than
+ * 3*(order%4) instructions, all of which are arithmetic), and all of the
+ * coefficients and all relevant history stay in registers, so the outer loop
+ * has only one load from memory (the residual)
+ *
+ * I have not yet run this through simg4, so there may be some avoidable stalls,
+ * and there may be a somewhat more clever way to do the outer loop
+ *
+ * the branch mechanism may prevent dynamic loading; I still need to examine
+ * this issue, and there may be a more elegant method
+ */
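
For reference, here is a scalar C sketch of what this routine computes, paraphrasing the src/libFLAC/lpc.c:FLAC__lpc_restore_signal() referenced above (names and types are simplified from libFLAC's exact declarations):

    #include <stdint.h>

    /* data[] must be preceded by at least `order` valid warm-up samples,
     * i.e. data[-order..-1] are readable history */
    void lpc_restore_signal_16(const int32_t residual[], unsigned data_len,
                               const int32_t qlp_coeff[], unsigned order,
                               int lp_quantization, int32_t data[])
    {
        for (unsigned i = 0; i < data_len; i++) {
            int32_t sum = 0;
            const int32_t *history = data;  /* walks backward over prior samples */
            for (unsigned j = 0; j < order; j++)
                sum += qlp_coeff[j] * *(--history);
            *data++ = *residual++ + (sum >> lp_quantization);
        }
    }

With bps<=16, every product and the running sum fit in 32 bits, which is what lets the vector code below build the sum out of 16x16->32 vmulosh multiplies.
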
 stmw r31,-4(r1)
 
 addi r9,r1,-28
 li r31,0xf
-andc r9,r9,r31 ; for quadword-aligned stack data
+andc r9,r9,r31 /* for quadword-aligned stack data */
 
-slwi r6,r6,2 ; adjust for word size
+slwi r6,r6,2 /* adjust for word size */
 slwi r4,r4,2
-add r4,r4,r8 ; r4 = data+data_len
+add r4,r4,r8 /* r4 = data+data_len */
 
-mfspr r0,256 ; cache old vrsave
-addis r31,0,hi16(0xfffffc00)
-ori r31,r31,lo16(0xfffffc00)
-mtspr 256,r31 ; declare VRs in vrsave
+mfspr r0,256 /* cache old vrsave */
+addis r31,0,0xfffffc00@h
+ori r31,r31,0xfffffc00@l
+mtspr 256,r31 /* declare VRs in vrsave */
 
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 4,0,L1400
 
-; load coefficients into v0-v7 and initial history into v8-v15
+/* load coefficients into v0-v7 and initial history into v8-v15 */
 li r31,0xf
-and r31,r8,r31 ; r31: data%4
+and r31,r8,r31 /* r31: data%4 */
 li r11,16
-subf r31,r31,r11 ; r31: 4-(data%4)
-slwi r31,r31,3 ; convert to bits for vsro
+subf r31,r31,r11 /* r31: 4-(data%4) */
+slwi r31,r31,3 /* convert to bits for vsro */
 li r10,-4
 stw r31,-4(r9)
 lvewx v0,r10,r9
 vspltisb v18,-1
-vsro v18,v18,v0 ; v18: mask vector
+vsro v18,v18,v0 /* v18: mask vector */
 
 li r31,0x8
 lvsl v0,0,r31
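
(Note on the relocation operators above: Darwin's hi16()/lo16() correspond to ELF's @h/@l. ELF also defines @ha, the high half adjusted for the sign extension that addi and the D-form loads/stores apply to their 16-bit operand; since this code merges the halves with ori, which does not sign-extend, the plain @h form is the correct partner. With @ha, the vrsave constant would assemble as 0xfffffc00@ha = 0, and the ori would then produce 0x0000fc00 rather than 0xfffffc00.)
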
Lines 94-107
 vspltisb v2,0
 vspltisb v3,-1
 vmrglw v2,v2,v3
-vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+vsel v0,v1,v0,v2 /* v0: reversal permutation vector */
 
 add r10,r5,r6
-lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
-vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
+lvsl v17,0,r5 /* v17: coefficient alignment permutation vector */
+vperm v17,v17,v17,v0 /* v17: reversal coefficient alignment permutation vector */
 
 mr r11,r8
-lvsl v16,0,r11 ; v16: history alignment permutation vector
+lvsl v16,0,r11 /* v16: history alignment permutation vector */
 
 lvx v0,0,r5
 addi r5,r5,16
Lines 114-121
 cmplw cr0,r5,r10
 bc 12,0,L1101
 vand v0,v0,v18
-addis r31,0,hi16(L1307)
-ori r31,r31,lo16(L1307)
+addis r31,0,L1307@h
+ori r31,r31,L1307@l
 b L1199
 
 L1101:
Lines 128-135
 cmplw cr0,r5,r10
 bc 12,0,L1102
 vand v1,v1,v18
-addis r31,0,hi16(L1306)
-ori r31,r31,lo16(L1306)
+addis r31,0,L1306@h
+ori r31,r31,L1306@l
 b L1199
 
 L1102:
Lines 142-149
 cmplw cr0,r5,r10
 bc 12,0,L1103
 vand v2,v2,v18
-addis r31,0,hi16(L1305)
-ori r31,r31,lo16(L1305)
+addis r31,0,L1305@h
+ori r31,r31,L1305@l
 b L1199
 
 L1103:
Lines 156-163
 cmplw cr0,r5,r10
 bc 12,0,L1104
 vand v3,v3,v18
-addis r31,0,hi16(L1304)
-ori r31,r31,lo16(L1304)
+addis r31,0,L1304@h
+ori r31,r31,L1304@l
 b L1199
 
 L1104:
Lines 170-177
 cmplw cr0,r5,r10
 bc 12,0,L1105
 vand v4,v4,v18
-addis r31,0,hi16(L1303)
-ori r31,r31,lo16(L1303)
+addis r31,0,L1303@h
+ori r31,r31,L1303@l
 b L1199
 
 L1105:
Lines 184-191
 cmplw cr0,r5,r10
 bc 12,0,L1106
 vand v5,v5,v18
-addis r31,0,hi16(L1302)
-ori r31,r31,lo16(L1302)
+addis r31,0,L1302@h
+ori r31,r31,L1302@l
 b L1199
 
 L1106:
Lines 198-205
 cmplw cr0,r5,r10
 bc 12,0,L1107
 vand v6,v6,v18
-addis r31,0,hi16(L1301)
-ori r31,r31,lo16(L1301)
+addis r31,0,L1301@h
+ori r31,r31,L1301@l
 b L1199
 
 L1107:
Lines 210-239
 lvx v19,0,r11
 vperm v15,v19,v15,v16
 vand v7,v7,v18
-addis r31,0,hi16(L1300)
-ori r31,r31,lo16(L1300)
+addis r31,0,L1300@h
+ori r31,r31,L1300@l
 
 L1199:
 mtctr r31
 
-; set up invariant vectors
-vspltish v16,0 ; v16: zero vector
+/* set up invariant vectors */
+vspltish v16,0 /* v16: zero vector */
 
 li r10,-12
-lvsr v17,r10,r8 ; v17: result shift vector
-lvsl v18,r10,r3 ; v18: residual shift back vector
+lvsr v17,r10,r8 /* v17: result shift vector */
+lvsl v18,r10,r3 /* v18: residual shift back vector */
 
 li r10,-4
 stw r7,-4(r9)
-lvewx v19,r10,r9 ; v19: lp_quantization vector
+lvewx v19,r10,r9 /* v19: lp_quantization vector */
 
 L1200:
-vmulosh v20,v0,v8 ; v20: sum vector
+vmulosh v20,v0,v8 /* v20: sum vector */
 bcctr 20,0
 
 L1300:
 vmulosh v21,v7,v15
-vsldoi v15,v15,v14,4 ; increment history
+vsldoi v15,v15,v14,4 /* increment history */
 vaddsws v20,v20,v21
 
 L1301:
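
The mtctr/bcctr pair above is the "branch mechanism" mentioned in the header comment: one of the labels L1300-L1307 is chosen up front according to the order, and every outer-loop iteration jumps into the unrolled multiply-accumulate chain at that depth, much like a Duff's device. A hypothetical scalar analog (illustrative only; the real code keeps coefficients and history in vector registers):

    #include <stdint.h>

    /* sketch for order <= 8; history points just past the newest sample */
    static int32_t lpc_sum_order_le8(const int32_t *qlp_coeff,
                                     const int32_t *history, unsigned order)
    {
        int32_t sum = 0;
        switch (order) {             /* enter the chain at the right depth */
        case 8: sum += qlp_coeff[7] * history[-8]; /* fall through */
        case 7: sum += qlp_coeff[6] * history[-7]; /* fall through */
        case 6: sum += qlp_coeff[5] * history[-6]; /* fall through */
        case 5: sum += qlp_coeff[4] * history[-5]; /* fall through */
        case 4: sum += qlp_coeff[3] * history[-4]; /* fall through */
        case 3: sum += qlp_coeff[2] * history[-3]; /* fall through */
        case 2: sum += qlp_coeff[1] * history[-2]; /* fall through */
        case 1: sum += qlp_coeff[0] * history[-1]; /* fall through */
        default: break;
        }
        return sum;
    }
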
Lines 267-339
 vaddsws v20,v20,v21
 
 L1307:
-vsumsws v20,v20,v16 ; v20[3]: sum
-vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
+vsumsws v20,v20,v16 /* v20[3]: sum */
+vsraw v20,v20,v19 /* v20[3]: sum >> lp_quantization */
 
-lvewx v21,0,r3 ; v21[n]: *residual
-vperm v21,v21,v21,v18 ; v21[3]: *residual
-vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
-vsldoi v18,v18,v18,4 ; increment shift vector
+lvewx v21,0,r3 /* v21[n]: *residual */
+vperm v21,v21,v21,v18 /* v21[3]: *residual */
+vaddsws v20,v21,v20 /* v20[3]: *residual + (sum >> lp_quantization) */
+vsldoi v18,v18,v18,4 /* increment shift vector */
 
-vperm v21,v20,v20,v17 ; v21[n]: shift for storage
-vsldoi v17,v17,v17,12 ; increment shift vector
+vperm v21,v20,v20,v17 /* v21[n]: shift for storage */
+vsldoi v17,v17,v17,12 /* increment shift vector */
 stvewx v21,0,r8
 
 vsldoi v20,v20,v20,12
-vsldoi v8,v8,v20,4 ; insert value onto history
+vsldoi v8,v8,v20,4 /* insert value onto history */
 
 addi r3,r3,4
 addi r8,r8,4
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 12,0,L1200
 
 L1400:
-mtspr 256,r0 ; restore old vrsave
+mtspr 256,r0 /* restore old vrsave */
 lmw r31,-4(r1)
 blr
 
-_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
-; r3: residual[]
-; r4: data_len
-; r5: qlp_coeff[]
-; r6: order
-; r7: lp_quantization
-; r8: data[]
-
-; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
-; this version assumes order<=8; it uses fewer vector registers, which should
-; save time in context switches, and has less code, which may improve
-; instruction caching
-
+FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
+/* r3: residual[]
+ * r4: data_len
+ * r5: qlp_coeff[]
+ * r6: order
+ * r7: lp_quantization
+ * r8: data[]
+ *
+ * see FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
+ * this version assumes order<=8; it uses fewer vector registers, which should
+ * save time in context switches, and has less code, which may improve
+ * instruction caching
+ */
 stmw r31,-4(r1)
 
 addi r9,r1,-28
 li r31,0xf
-andc r9,r9,r31 ; for quadword-aligned stack data
+andc r9,r9,r31 /* for quadword-aligned stack data */
 
-slwi r6,r6,2 ; adjust for word size
+slwi r6,r6,2 /* adjust for word size */
 slwi r4,r4,2
-add r4,r4,r8 ; r4 = data+data_len
+add r4,r4,r8 /* r4 = data+data_len */
 
-mfspr r0,256 ; cache old vrsave
-addis r31,0,hi16(0xffc00000)
-ori r31,r31,lo16(0xffc00000)
-mtspr 256,r31 ; declare VRs in vrsave
+mfspr r0,256 /* cache old vrsave */
+addis r31,0,0xffc00000@h
+ori r31,r31,0xffc00000@l
+mtspr 256,r31 /* declare VRs in vrsave */
 
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 4,0,L2400
 
-; load coefficients into v0-v1 and initial history into v2-v3
+/* load coefficients into v0-v1 and initial history into v2-v3 */
 li r31,0xf
-and r31,r8,r31 ; r31: data%4
+and r31,r8,r31 /* r31: data%4 */
 li r11,16
-subf r31,r31,r11 ; r31: 4-(data%4)
-slwi r31,r31,3 ; convert to bits for vsro
+subf r31,r31,r11 /* r31: 4-(data%4) */
+slwi r31,r31,3 /* convert to bits for vsro */
 li r10,-4
 stw r31,-4(r9)
 lvewx v0,r10,r9
 vspltisb v6,-1
-vsro v6,v6,v0 ; v6: mask vector
+vsro v6,v6,v0 /* v6: mask vector */
 
 li r31,0x8
 lvsl v0,0,r31
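
For context, a hypothetical C call site showing how a caller would split work between the two entry points (libFLAC's decoder makes a similar order-based choice; this dispatch helper is illustrative, not libFLAC's actual code):

    #include <stdint.h>

    extern void FLAC__lpc_restore_signal_asm_ppc_altivec_16(
        const int32_t residual[], unsigned data_len, const int32_t qlp_coeff[],
        unsigned order, int lp_quantization, int32_t data[]);
    extern void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(
        const int32_t residual[], unsigned data_len, const int32_t qlp_coeff[],
        unsigned order, int lp_quantization, int32_t data[]);

    static void restore_signal_16(const int32_t residual[], unsigned data_len,
                                  const int32_t qlp_coeff[], unsigned order,
                                  int lp_quantization, int32_t data[])
    {
        if (order <= 8)  /* smaller variant: fewer live VRs, less code */
            FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(
                residual, data_len, qlp_coeff, order, lp_quantization, data);
        else
            FLAC__lpc_restore_signal_asm_ppc_altivec_16(
                residual, data_len, qlp_coeff, order, lp_quantization, data);
    }
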
Lines 343-356
 vspltisb v2,0
 vspltisb v3,-1
 vmrglw v2,v2,v3
-vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+vsel v0,v1,v0,v2 /* v0: reversal permutation vector */
 
 add r10,r5,r6
-lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
-vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
+lvsl v5,0,r5 /* v5: coefficient alignment permutation vector */
+vperm v5,v5,v5,v0 /* v5: reversal coefficient alignment permutation vector */
 
 mr r11,r8
-lvsl v4,0,r11 ; v4: history alignment permutation vector
+lvsl v4,0,r11 /* v4: history alignment permutation vector */
 
 lvx v0,0,r5
 addi r5,r5,16
Lines 363-370
 cmplw cr0,r5,r10
 bc 12,0,L2101
 vand v0,v0,v6
-addis r31,0,hi16(L2301)
-ori r31,r31,lo16(L2301)
+addis r31,0,L2301@h
+ori r31,r31,L2301@l
 b L2199
 
 L2101:
Lines 375-399
 lvx v7,0,r11
 vperm v3,v7,v3,v4
 vand v1,v1,v6
-addis r31,0,hi16(L2300)
-ori r31,r31,lo16(L2300)
+addis r31,0,L2300@h
+ori r31,r31,L2300@l
 
 L2199:
 mtctr r31
 
-; set up invariant vectors
-vspltish v4,0 ; v4: zero vector
+/* set up invariant vectors */
+vspltish v4,0 /* v4: zero vector */
 
 li r10,-12
-lvsr v5,r10,r8 ; v5: result shift vector
-lvsl v6,r10,r3 ; v6: residual shift back vector
+lvsr v5,r10,r8 /* v5: result shift vector */
+lvsl v6,r10,r3 /* v6: residual shift back vector */
 
 li r10,-4
 stw r7,-4(r9)
-lvewx v7,r10,r9 ; v7: lp_quantization vector
+lvewx v7,r10,r9 /* v7: lp_quantization vector */
 
 L2200:
-vmulosh v8,v0,v2 ; v8: sum vector
+vmulosh v8,v0,v2 /* v8: sum vector */
 bcctr 20,0
 
 L2300:
Lines 402-428
 vaddsws v8,v8,v9
 
 L2301:
-vsumsws v8,v8,v4 ; v8[3]: sum
-vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
+vsumsws v8,v8,v4 /* v8[3]: sum */
+vsraw v8,v8,v7 /* v8[3]: sum >> lp_quantization */
 
-lvewx v9,0,r3 ; v9[n]: *residual
-vperm v9,v9,v9,v6 ; v9[3]: *residual
-vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
-vsldoi v6,v6,v6,4 ; increment shift vector
+lvewx v9,0,r3 /* v9[n]: *residual */
+vperm v9,v9,v9,v6 /* v9[3]: *residual */
+vaddsws v8,v9,v8 /* v8[3]: *residual + (sum >> lp_quantization) */
+vsldoi v6,v6,v6,4 /* increment shift vector */
 
-vperm v9,v8,v8,v5 ; v9[n]: shift for storage
-vsldoi v5,v5,v5,12 ; increment shift vector
+vperm v9,v8,v8,v5 /* v9[n]: shift for storage */
+vsldoi v5,v5,v5,12 /* increment shift vector */
 stvewx v9,0,r8
 
 vsldoi v8,v8,v8,12
-vsldoi v2,v2,v8,4 ; insert value onto history
+vsldoi v2,v2,v8,4 /* insert value onto history */
 
 addi r3,r3,4
 addi r8,r8,4
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 12,0,L2200
 
 L2400:
-mtspr 256,r0 ; restore old vrsave
+mtspr 256,r0 /* restore old vrsave */
 lmw r31,-4(r1)
 blr