Using PPC asm (from Linux kernel) in xine
Dear people,
As you probably know, I am doing what I can to play DVDs well
on my (now outdated) :-) iBook 600MHz combo, which features a
G3 processor.
I am starting to learn PPC assembly and, just from reading a bit
of Motorola's MPC750 User's Guide, I have become convinced that
optimizing code for this processor is not as easy as it may
seem at first.
Anyway, one of the first things I noticed is that xine uses a
function called xine_fast_memcpy, which is either an alternative
memcpy written in assembly (where one is available) or the
standard glibc memcpy when no other version is available, as is
the case on PPC.
I saw that the Linux kernel has an assembly implementation of
memcpy and decided to try that instead of the glibc version.
After just a few adaptations and removals of unnecessary
functions, I ended up with a string.S file with only
cacheable_memcpy and memcpy, which seem to be the important
parts of the file for my purposes.
According to my tests, cacheable_memcpy takes approximately 40%
less time than the original glibc version, which is quite an
improvement: in my tests, the glibc version took approx. 69s
to run, while cacheable_memcpy took only 42s (each run was
repeated many times to average out measurement noise).
I can use it fine with code that I write (compiled statically)
and it works quite well, but when I try to use it with xine
(which is plugin-based), it doesn't work. It compiles fine
(it seems) and a debian package is successfully built.
But upon initialization, when xine is loading its plugins, it
complains with:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
xine: error while loading shared libraries: /usr/lib/libxineutils-0.9.9.so.0: unexpected reloc type 0x0bÿõøÿö
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Not good. :-)
I am attaching the patch I created to this e-mail and would
appreciate if anybody could help here. I may be doing many
stupid things...
The first thing that crossed my mind was that the assembly
version cannot be incorporated "as is" in a shared library,
because it does not seem to take care of the stack of other
functions and may clobber other registers.
Is that correct?
In that case, would wrapping it in an "asm" part of a C
snippet be enough?
Well, any help is welcome.
Thanks, Roger...
--
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rogério Brito - rbrito@iname.com - http://www.ime.usp.br/~rbrito/
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
diff -urdN xine-lib-0.9.9/src/xine-utils/Makefile.am /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/Makefile.am
--- xine-lib-0.9.9/src/xine-utils/Makefile.am Mon Mar 25 20:07:17 2002
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/Makefile.am Sat May 18 17:26:38 2002
@@ -4,7 +4,7 @@
lib_LTLIBRARIES = libxineutils.la
-libxineutils_la_SOURCES = utils.c memcpy.c monitor.c cpu_accel.c xine_mutex.c
+libxineutils_la_SOURCES = utils.c memcpy.c monitor.c cpu_accel.c xine_mutex.c string.S
libxineutils_la_LDFLAGS = \
-version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE) \
diff -urdN xine-lib-0.9.9/src/xine-utils/memcpy.c /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/memcpy.c
--- xine-lib-0.9.9/src/xine-utils/memcpy.c Wed Nov 21 18:40:28 2001
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/memcpy.c Sat May 18 18:25:40 2002
@@ -353,6 +353,23 @@
#endif /* ARCH_X86 */
+#ifdef ARCH_PPC
+/*
+ * Assembly routine defined in string.S. It is declared here so that the
+ * call below does not rely on an implicit (int-returning) declaration.
+ */
+extern void *cacheable_kernel_memcpy(void *to, const void *from, size_t len);
+
+/*
+ * memcpy()-style wrapper that forwards its three arguments unchanged to
+ * cacheable_kernel_memcpy() and returns whatever that routine returns.
+ */
+static void *linux_kernel_memcpy(void *to, const void *from, size_t len)
+{
+  return cacheable_kernel_memcpy(to, from, len);
+}
+#endif
+
static struct {
char *name;
void *(* function)(void *to, const void *from, size_t len);
@@ -368,6 +375,9 @@
{ "MMXEXT optimized memcpy()", mmx2_memcpy, 0, MM_MMXEXT },
{ "SSE optimized memcpy()", sse_memcpy, 0, MM_MMXEXT|MM_SSE },
#endif /* ARCH_X86 */
+#ifdef ARCH_PPC
+ { "linux kernel memcpy()", linux_kernel_memcpy, 0, 0 },
+#endif /* ARCH_PPC */
{ NULL, NULL, 0, 0 }
};
@@ -414,6 +424,9 @@
static char *memcpy_methods[] = {"probe", "glibc",
#ifdef ARCH_X86
"kernel", "mmx", "mmxext", "sse",
+#endif
+#ifdef ARCH_PPC
+ "kernel",
#endif
NULL};
diff -urdN xine-lib-0.9.9/src/xine-utils/string.S /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/string.S
--- xine-lib-0.9.9/src/xine-utils/string.S Wed Dec 31 21:00:00 1969
+++ /home/rbrito/src/xine/xine-lib-0.9.9.modified/src/xine-utils/string.S Sat May 18 16:11:05 2002
@@ -0,0 +1,211 @@
+/*
+ * BK Id: SCCS/s.string.S 1.10 11/04/01 22:58:20 paulus
+ */
+/*
+ * String handling functions for PowerPC.
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Shamelessly torn apart from the Linux kernel sources by Rogerio Brito
+ * for the xine project.
+ */
+
+#include "config.h"
+
+#ifdef ARCH_PPC
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+
+/* General Purpose Registers (GPRs) */
+
+#define r0 0
+#define r1 1
+#define r2 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+#define r13 13
+#define r14 14
+#define r15 15
+#define r16 16
+#define r17 17
+#define r18 18
+#define r19 19
+#define r20 20
+#define r21 21
+#define r22 22
+#define r23 23
+#define r24 24
+#define r25 25
+#define r26 26
+#define r27 27
+#define r28 28
+#define r29 29
+#define r30 30
+#define r31 31
+
+#define L1_CACHE_LINE_SIZE 32
+#define LG_L1_CACHE_LINE_SIZE 5
+#define MAX_L1_COPY_PREFETCH 4
+
+#define COPY_16_BYTES /* copy the 16 bytes at 4(r4)..19(r4) to 4(r6)..19(r6) */ \
+ lwz r7,4(r4); /* four word loads into scratch r7-r10 */ \
+ lwz r8,8(r4); \
+ lwz r9,12(r4); \
+ lwzu r10,16(r4); /* update form: r4 advances by 16 */ \
+ stw r7,4(r6); /* four word stores from r7-r10 */ \
+ stw r8,8(r6); \
+ stw r9,12(r6); \
+ stwu r10,16(r6) /* update form: r6 advances by 16 */
+
+ .text
+
+CACHELINE_BYTES = L1_CACHE_LINE_SIZE
+LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE
+CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1)
+
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic. This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ */
+ .global cacheable_kernel_memcpy /* called from C as void *(void *to, const void *from, size_t len) */
+cacheable_kernel_memcpy: /* in: r3 = dest, r4 = src, r5 = len; r3 is never written, so dest is returned */
+ add r7,r3,r5 /* test if the src & dst overlap */
+ add r8,r4,r5 /* r7 = dest end, r8 = src end */
+ cmplw 0,r4,r7 /* cr0.lt = src < dest end */
+ cmplw 1,r3,r8 /* cr1.lt = dest < src end */
+ crand 0,0,4 /* cr0.lt &= cr1.lt */
+ blt 80f /* if regions overlap; local label (not the global symbol) so no dynamic reloc is left in a shared lib */
+
+ addi r4,r4,-4
+ addi r6,r3,-4 /* r4/r6 run 4 bytes behind so the 4(rX) forms address the next byte/word */
+ neg r0,r3
+ andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
+ beq 58f /* dest already cache-line aligned */
+
+ cmplw 0,r5,r0 /* is this more than total to do? */
+ blt 63f /* if not much to do */
+ andi. r8,r0,3 /* get it word-aligned first */
+ subf r5,r0,r5 /* r5 = len - alignment bytes */
+ mtctr r8
+ beq+ 61f
+70: lbz r9,4(r4) /* do some bytes */
+ stb r9,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 70b
+61: srwi. r0,r0,2 /* rest of the alignment run, counted in words */
+ mtctr r0
+ beq 58f
+72: lwzu r9,4(r4) /* do some words */
+ stwu r9,4(r6)
+ bdnz 72b
+
+58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+ clrlwi r5,r5,32-LG_CACHELINE_BYTES /* r5 = bytes left after the full lines */
+ li r11,4 /* dcbz offset: r6 + 4 is the next dest byte */
+ mtctr r0
+ beq 63f
+53:
+#if !defined(CONFIG_8xx)
+ dcbz r11,r6 /* zero the dest cache line before overwriting it */
+#endif
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 32
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 64
+ COPY_16_BYTES
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 128
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+#endif
+#endif
+#endif
+ bdnz 53b
+
+63: srwi. r0,r5,2 /* remaining whole words */
+ mtctr r0
+ beq 64f
+30: lwzu r0,4(r4)
+ stwu r0,4(r6)
+ bdnz 30b
+
+64: andi. r0,r5,3 /* remaining bytes */
+ mtctr r0
+ beq+ 65f
+40: lbz r0,4(r4)
+ stb r0,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 40b
+65: blr
+
+ .globl kernel_memcpy /* plain copy without dcbz; same r3/r4/r5 arguments, r3 left untouched */
+kernel_memcpy:
+80: srwi. r7,r5,3 /* r7 = len / 8; 80: is the local entry branched to from cacheable_kernel_memcpy */
+ addi r6,r3,-4
+ addi r4,r4,-4
+ beq 2f /* if less than 8 bytes to do */
+ andi. r0,r6,3 /* get dest word aligned */
+ mtctr r7
+ bne 5f
+1: lwz r7,4(r4) /* copy 8 bytes per iteration */
+ lwzu r8,8(r4)
+ stw r7,4(r6)
+ stwu r8,8(r6)
+ bdnz 1b
+ andi. r5,r5,7 /* bytes left over (< 8) */
+2: cmplwi 0,r5,4
+ blt 3f
+ lwzu r0,4(r4) /* one more word if at least 4 bytes remain */
+ addi r5,r5,-4
+ stwu r0,4(r6)
+3: cmpwi 0,r5,0
+ beqlr /* nothing left: return */
+ mtctr r5
+ addi r4,r4,3 /* pointers now run 1 byte behind for lbzu/stbu */
+ addi r6,r6,3
+4: lbzu r0,1(r4)
+ stbu r0,1(r6)
+ bdnz 4b
+ blr
+5: subfic r0,r0,4 /* r0 = 4 - (dest & 3): bytes needed to word-align dest */
+ mtctr r0
+6: lbz r7,4(r4)
+ addi r4,r4,1
+ stb r7,4(r6)
+ addi r6,r6,1
+ bdnz 6b
+ subf r5,r0,r5 /* r5 = len - alignment bytes */
+ rlwinm. r7,r5,32-3,3,31 /* r7 = r5 / 8 */
+ beq 2b
+ mtctr r7
+ b 1b
+
+/* .section __ex_table,"a"
+ .align 2
+ .long 1b,99b
+*/
+
+#endif
Reply to: