[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Using PPC asm (from Linux kernel) in xine



	Dear people,

	As you probably know, I am trying what I can to play DVDs well
	in my (now outdated) :-) iBook 600MHz, combo, which features a
	G3 processor.

	I am starting to learn PPC assembly, and just from reading a
	bit of Motorola's MPC750 User's Guide I became convinced that
	optimizing code for this processor is not as easy as it may
	seem at first.

	Anyway, one of the first things I see is that xine uses a
	function called xine_fast_memcpy, which is an alternative
	memcpy function possibly written in assembly (if available) or
	the standard glibc, if no other version is available, as is
	the case with PPC.

	I saw that the Linux kernel has an assembly implementation of
	memcpy and decided to try that instead of the glibc version.

	After just a few adaptations and removals of unnecessary
	functions, I ended up with a string.S file with only
	cacheable_memcpy and memcpy, which seem to be the important
	parts of the file for my purposes.

	According to my tests, cacheable_memcpy is approximately 40%
	faster than the original glibc version, which is quite an
	improvement: with my tests, the glibc version took approx. 69s
	to run, while the cacheable_memcpy took only 42s (repeated
	many times to avoid noise errors).

	I can use it fine with code that I write (compiled statically)
	and it works quite well, but when I try to use it with xine
	(which is plugin-based), it doesn't work.  It compiles fine
	(it seems) and a debian package is successfully built.

	But upon initialization, when xine is loading its plugins, it
	complains with:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
xine: error while loading shared libraries: /usr/lib/libxineutils-0.9.9.so.0: unexpected reloc type 0x0bÿõøÿö
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

	Not good. :-)

	I am attaching the patch I created to this e-mail and would
	appreciate it if anybody could help here. I may be doing many
	stupid things...

	The first thing that crossed my mind was that the assembly
	version cannot be incorporated "as is" in a shared library,
	because it does not seem to take care of the stack of other
	functions and may clobber other registers.
	Is that correct?

	In that case, would wrapping it in an "asm" part of a C
	snippet be enough?

	Well, any help is welcome.


	Thanks, Roger...

-- 
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  Rogério Brito - rbrito@iname.com - http://www.ime.usp.br/~rbrito/
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
diff -urdN xine-lib-0.9.9/src/xine-utils/Makefile.am /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/Makefile.am
--- xine-lib-0.9.9/src/xine-utils/Makefile.am	Mon Mar 25 20:07:17 2002
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/Makefile.am	Sat May 18 17:26:38 2002
@@ -4,7 +4,7 @@
 
 lib_LTLIBRARIES = libxineutils.la
 
-libxineutils_la_SOURCES = utils.c memcpy.c monitor.c cpu_accel.c xine_mutex.c
+libxineutils_la_SOURCES = utils.c memcpy.c monitor.c cpu_accel.c xine_mutex.c string.S
 
 libxineutils_la_LDFLAGS =  \
 	-version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE) \
diff -urdN xine-lib-0.9.9/src/xine-utils/memcpy.c /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/memcpy.c
--- xine-lib-0.9.9/src/xine-utils/memcpy.c	Wed Nov 21 18:40:28 2001
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/memcpy.c	Sat May 18 18:25:40 2002
@@ -353,6 +353,21 @@
 
 #endif /* ARCH_X86 */
 
+#ifdef ARCH_PPC
+/* Assembly routine defined in string.S.  Declared here so the call below
+ * has a prototype in scope instead of an implicit int-returning one. */
+extern void *cacheable_kernel_memcpy(void *to, const void *from, size_t len);
+
+/*
+ * Adapter with the signature used by the memcpy method table: forwards
+ * its three arguments unchanged and returns the routine's result.
+ */
+static void *linux_kernel_memcpy(void *to, const void *from, size_t len)
+{
+  return cacheable_kernel_memcpy(to, from, len);
+}
+#endif
+
 static struct {
   char *name;
   void *(* function)(void *to, const void *from, size_t len);
@@ -368,6 +375,9 @@
   { "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
   { "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
 #endif /* ARCH_X86 */
+#ifdef ARCH_PPC
+  { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
+#endif /* ARCH_PPC */
   { NULL, NULL, 0, 0 }
 };
 
@@ -414,6 +424,9 @@
   static char *memcpy_methods[] = {"probe", "glibc",
 #ifdef ARCH_X86
      "kernel", "mmx", "mmxext", "sse", 
+#endif
+#ifdef ARCH_PPC
+     "kernel",
 #endif
      NULL};
   
diff -urdN xine-lib-0.9.9/src/xine-utils/string.S /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/string.S
--- xine-lib-0.9.9/src/xine-utils/string.S	Wed Dec 31 21:00:00 1969
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/string.S	Sat May 18 16:11:05 2002
@@ -0,0 +1,211 @@
+/*
+ * BK Id: SCCS/s.string.S 1.10 11/04/01 22:58:20 paulus
+ */
+/*
+ * String handling functions for PowerPC.
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Shamelessly torn apart from the Linux kernel sources by Rogerio Brito
+ * for the xine project.
+ */
+
+#include "config.h"
+
+#ifdef ARCH_PPC
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+
+/* General Purpose Registers (GPRs) */
+
+#define	r0	0
+#define	r1	1
+#define	r2	2
+#define	r3	3
+#define	r4	4
+#define	r5	5
+#define	r6	6
+#define	r7	7
+#define	r8	8
+#define	r9	9
+#define	r10	10
+#define	r11	11
+#define	r12	12
+#define	r13	13
+#define	r14	14
+#define	r15	15
+#define	r16	16
+#define	r17	17
+#define	r18	18
+#define	r19	19
+#define	r20	20
+#define	r21	21
+#define	r22	22
+#define	r23	23
+#define	r24	24
+#define	r25	25
+#define	r26	26
+#define	r27	27
+#define	r28	28
+#define	r29	29
+#define	r30	30
+#define	r31	31
+
+#define L1_CACHE_LINE_SIZE  32
+#define LG_L1_CACHE_LINE_SIZE   5
+#define MAX_L1_COPY_PREFETCH    4
+
+#define COPY_16_BYTES		/* copy 16 bytes from 4(r4) to 4(r6) through r7-r10 */ \
+	lwz	r7,4(r4);	\
+	lwz	r8,8(r4);	\
+	lwz	r9,12(r4);	\
+	lwzu	r10,16(r4);	/* update form: r4 advances by 16 */ \
+	stw	r7,4(r6);	\
+	stw	r8,8(r6);	\
+	stw	r9,12(r6);	\
+	stwu	r10,16(r6)	/* update form: r6 advances by 16 */
+
+	.text
+
+CACHELINE_BYTES = L1_CACHE_LINE_SIZE
+LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE
+CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1)
+
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic.  This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ */
+	.global	cacheable_kernel_memcpy
+cacheable_kernel_memcpy:		/* r3 = dest, r4 = src, r5 = byte count */
+	add	r7,r3,r5		/* test if the src & dst overlap */
+	add	r8,r4,r5
+	cmplw	0,r4,r7			/* cr0: src < dest + count ? */
+	cmplw	1,r3,r8			/* cr1: dest < src + count ? */
+	crand	0,0,4			/* cr0.lt &= cr1.lt */
+	blt	.Lkernel_memcpy_local	/* if regions overlap */
+	/* local alias, not the global symbol: a branch to the exported name leaves an R_PPC_REL14 (0x0b) reloc in the .so */
+	addi	r4,r4,-4		/* bias src by -4 for the 4(r4) / update-form loads */
+	addi	r6,r3,-4		/* r6 walks the dest; r3 is never written */
+	neg	r0,r3
+	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
+	beq	58f			/* dest already cache-line aligned */
+
+	cmplw	0,r5,r0			/* is this more than total to do? */
+	blt	63f			/* if not much to do */
+	andi.	r8,r0,3			/* get it word-aligned first */
+	subf	r5,r0,r5		/* r5 = count left after the alignment prologue */
+	mtctr	r8
+	beq+	61f			/* no odd bytes (tests the andi. result above) */
+70:	lbz	r9,4(r4)		/* do some bytes */
+	stb	r9,4(r6)
+	addi	r4,r4,1
+	addi	r6,r6,1
+	bdnz	70b
+61:	srwi.	r0,r0,2			/* # whole words up to the cache line */
+	mtctr	r0
+	beq	58f
+72:	lwzu	r9,4(r4)		/* do some words */
+	stwu	r9,4(r6)
+	bdnz	72b
+
+58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes left after the full lines */
+	li	r11,4			/* offset so dcbz addresses r6+4 */
+	mtctr	r0
+	beq	63f
+53:
+#if !defined(CONFIG_8xx)
+	dcbz	r11,r6			/* zero the dest line at r6+4 before filling it */
+#endif
+	COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 32
+	COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 64
+	COPY_16_BYTES
+	COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 128
+	COPY_16_BYTES
+	COPY_16_BYTES
+	COPY_16_BYTES
+	COPY_16_BYTES
+#endif
+#endif
+#endif
+	bdnz	53b
+
+63:	srwi.	r0,r5,2			/* remaining whole words */
+	mtctr	r0
+	beq	64f
+30:	lwzu	r0,4(r4)
+	stwu	r0,4(r6)
+	bdnz	30b
+
+64:	andi.	r0,r5,3			/* remaining 0-3 bytes */
+	mtctr	r0
+	beq+	65f
+40:	lbz	r0,4(r4)
+	stb	r0,4(r6)
+	addi	r4,r4,1
+	addi	r6,r6,1
+	bdnz	40b
+65:	blr
+.Lkernel_memcpy_local:		/* file-local alias of kernel_memcpy, target of the overlap branch */
+	.globl	kernel_memcpy
+kernel_memcpy:			/* r3 = dest, r4 = src, r5 = byte count */
+	srwi.	r7,r5,3			/* r7 = # of 8-byte chunks */
+	addi	r6,r3,-4		/* r6 walks the dest; r3 is never written */
+	addi	r4,r4,-4
+	beq	2f			/* if less than 8 bytes to do */
+	andi.	r0,r6,3			/* get dest word aligned */
+	mtctr	r7
+	bne	5f
+1:	lwz	r7,4(r4)		/* copy 8 bytes per iteration */
+	lwzu	r8,8(r4)
+	stw	r7,4(r6)
+	stwu	r8,8(r6)
+	bdnz	1b
+	andi.	r5,r5,7			/* bytes left over after the 8-byte loop */
+2:	cmplwi	0,r5,4
+	blt	3f
+	lwzu	r0,4(r4)		/* one more word if at least 4 bytes remain */
+	addi	r5,r5,-4
+	stwu	r0,4(r6)
+3:	cmpwi	0,r5,0
+	beqlr				/* nothing left: return */
+	mtctr	r5
+	addi	r4,r4,3			/* re-bias to -1 for the lbzu/stbu loop */
+	addi	r6,r6,3
+4:	lbzu	r0,1(r4)
+	stbu	r0,1(r6)
+	bdnz	4b
+	blr
+5:	subfic	r0,r0,4			/* r0 = 4 - (r6 & 3): bytes until dest is word aligned */
+	mtctr	r0
+6:	lbz	r7,4(r4)
+	addi	r4,r4,1
+	stb	r7,4(r6)
+	addi	r6,r6,1
+	bdnz	6b
+	subf	r5,r0,r5
+	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3: recompute the 8-byte chunk count */
+	beq	2b
+	mtctr	r7
+	b	1b
+
+/*	.section __ex_table,"a"
+	.align	2
+	.long	1b,99b
+*/
+
+#endif

Reply to: