[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Call for help: altivec-enabled vlc



Hi all,

> I've sat down a bit ;-) and came up with an Altivec-optimised IDCT
> implementation in vlc (well, I integrated Motorola's Altivec IDCT).

New release: this time it actually works ;-)

> This is in fact the same code that already exists in vlc for MacOS X,
> but it uses the Motorola-published assembler code (you can find it on
> their site).

... plus the final matrix transpose which they forgot :-)

I'll be on holiday for a week; I figured I'd send this off since it is
in a working state ;-) Diff should apply to both 0.2.82 and 0.2.83.

Have fun!

Michel

PS: This is not Paul's IDCT, which I plan on integrating later on and
comparing against Motorola's for performance and accuracy.

-------------------------------------------------------------------------
Michel Lanners                 |  " Read Philosophy.  Study Art.
23, Rue Paul Henkes            |    Ask Questions.  Make Mistakes.
L-1710 Luxembourg              |
email   mlan@cpu.lu            |
http://www.cpu.lu/~mlan        |                     Learn Always. "
diff -ur vlc-0.2.82/Makefile vlc-0.2.82-altivec/Makefile
--- vlc-0.2.82/Makefile	Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/Makefile	Sun Aug 26 10:04:20 2001
@@ -18,7 +18,7 @@
 #
 # All possible plugin objects
 #
-PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx
+PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext idct/idctaltivec imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx
 
 #
 # C Objects
diff -ur vlc-0.2.82/Makefile.opts.in vlc-0.2.82-altivec/Makefile.opts.in
--- vlc-0.2.82/Makefile.opts.in	Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/Makefile.opts.in	Sun Aug 26 15:12:53 2001
@@ -45,7 +45,7 @@
 # Build environment
 # 
 CC = @CC@
-CFLAGS = @CFLAGS@
+CFLAGS = -Wa,-m7400 @CFLAGS@
 SHELL = @SHELL@
 RANLIB = @RANLIB@
 WINDRES = @WINDRES@
diff -ur vlc-0.2.82/configure vlc-0.2.82-altivec/configure
--- vlc-0.2.82/configure	Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/configure	Sun Aug 26 10:06:10 2001
@@ -3675,7 +3675,8 @@
   enableval="$enable_altivec"
    if test x$enableval = xyes; then ARCH="${ARCH} altivec"
     BUILTINS="${BUILTINS} idctaltivec"
-    LIB_IDCTALTIVEC="-framework vecLib"
+#    LIB_IDCTALTIVEC="-framework vecLib"
+    LIB_IDCTALTIVEC=""
   fi 
 fi
 
diff -ur vlc-0.2.82/include/vdec_ext-plugins.h vlc-0.2.82-altivec/include/vdec_ext-plugins.h
--- vlc-0.2.82/include/vdec_ext-plugins.h	Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/include/vdec_ext-plugins.h	Tue Aug 28 21:05:03 2001
@@ -103,6 +103,7 @@
 
     /* IDCT iformations */
     void *              p_idct_data;
+    void *              p_idct_data_raw;
 
     /* Input properties */
     struct vdec_pool_s * p_pool;
diff -ur vlc-0.2.82/plugins/idct/idctaltivec.c vlc-0.2.82-altivec/plugins/idct/idctaltivec.c
--- vlc-0.2.82/plugins/idct/idctaltivec.c	Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/plugins/idct/idctaltivec.c	Tue Aug 28 00:32:21 2001
@@ -23,6 +23,8 @@
 
 #define MODULE_NAME idctaltivec
 
+#undef DEBUG
+
 /*****************************************************************************
  * Preamble
  *****************************************************************************/
@@ -47,10 +49,14 @@
 #include "vdec_block.h"
 #include "vdec_idct.h"
 
-#include "idctaltivec.h"
+//#include "idctaltivec.h"
+//extern void IDCT(short *input, short *output);
+#include "idctaltivecasm.h"
 
 #include "modules_export.h"
 
+#include "testdata.h"
+//int dummy, dummy2;
 /*****************************************************************************
  * Local prototypes.
  *****************************************************************************/
@@ -115,7 +121,8 @@
     }
 
     /* The Altivec iDCT is deactivated until it really works */
-    return( 0 /* 200 */ );
+    //return( 0 /* 200 */ );
+    return( 200 );
 }
 
 /*****************************************************************************
@@ -130,6 +137,25 @@
  *****************************************************************************/
 void _M( vdec_IDCT )( void * p_idct_data, dctelem_t * p_block, int i_idontcare )
 {
+#ifdef DEBUG
+    int i;  /* only needed by the self-test below */
+
+    for( i = 0; i < 64; i++ )  /* overwrite the block with the fixed pattern from testdata.h */
+        p_block[i] = testdata[i];
+
+    fprintf(stderr, "p_block alignment: %p\n", (void *)p_block);  /* %p takes void * and prints its own prefix */
+    fprintf(stderr, "p_block before IDCT: 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx\n", *p_block, *(p_block+1), *(p_block+2), *(p_block+3), *(p_block+4), *(p_block+5), *(p_block+6), *(p_block+7));
+#endif
+    /* Transform the 64-element block in place; p_idct_data and i_idontcare are not used here. */
     IDCT( p_block, p_block );
+
+#ifdef DEBUG
+    fprintf(stderr, "p_block after IDCT:\n");
+    for( i = 0; i < 8; i++ ) {  /* dump the result as 8 rows of 8 values */
+        dctelem_t *p_row = p_block + (i * 8);
+        fprintf(stderr, " % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi\n", *p_row, *(p_row+1), *(p_row+2), *(p_row+3), *(p_row+4), *(p_row+5), *(p_row+6), *(p_row+7));
+    }
+    exit(0);  /* self-test mode: stop after the first block */
+#endif
 }
 
diff -ur vlc-0.2.82/plugins/idct/vdec_idct.c vlc-0.2.82-altivec/plugins/idct/vdec_idct.c
--- vlc-0.2.82/plugins/idct/vdec_idct.c	Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/plugins/idct/vdec_idct.c	Tue Aug 28 21:27:23 2001
@@ -57,7 +57,14 @@
     int i;
     dctelem_t * p_pre;
 
-    p_vdec->p_idct_data = malloc( sizeof(dctelem_t) * 64 * 64 );
+    /* the IDCT data buffer needs to meet certain alignment constraints
+     * (currently 16 bytes for Altivec vector ops)
+     */
+#define align 16
+
+    p_vdec->p_idct_data_raw = malloc( sizeof(dctelem_t) * 64 * 64 + align );
+    p_vdec->p_idct_data =
+	    (void *)(((unsigned long)p_vdec->p_idct_data_raw + align - 1) & -align);
     p_pre = (dctelem_t *) p_vdec->p_idct_data;
     memset( p_pre, 0, 64 * 64 * sizeof(dctelem_t) );
 
diff -ur vlc-0.2.82/src/interface/main.c vlc-0.2.82-altivec/src/interface/main.c
--- vlc-0.2.82/src/interface/main.c	Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/src/interface/main.c	Sun Aug 26 13:26:16 2001
@@ -1031,6 +1031,7 @@
 {
     volatile int i_capabilities = CPU_CAPABILITY_NONE;
 
+        i_capabilities |= CPU_CAPABILITY_ALTIVEC;
 #if defined( SYS_BEOS )
     i_capabilities |= CPU_CAPABILITY_486
                       | CPU_CAPABILITY_586
diff -ur vlc-0.2.82/src/video_decoder/video_decoder.c vlc-0.2.82-altivec/src/video_decoder/video_decoder.c
--- vlc-0.2.82/src/video_decoder/video_decoder.c	Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/src/video_decoder/video_decoder.c	Tue Aug 28 21:08:38 2001
@@ -166,9 +166,9 @@
 {
     intf_DbgMsg("vdec debug: EndThread(%p)", p_vdec);
 
-    if( p_vdec->p_idct_data != NULL )
+    if( p_vdec->p_idct_data_raw != NULL )
     {
-        free( p_vdec->p_idct_data );
+        free( p_vdec->p_idct_data_raw );
     }
 
     free( p_vdec );
diff -uNr vlc-0.2.82/plugins/idct/idctaltivecasm.h vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h
--- vlc-0.2.82/plugins/idct/idctaltivecasm.h	Thu Jan  1 01:00:00 1970
+++ vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h	Tue Aug 28 00:00:07 2001
@@ -0,0 +1,211 @@
+/* IDCT ASM function from Motorola
+ *
+ * The original Motorola implementation lacks a matrix transpose
+ * operation on the result. Duh...
+ */
+
+/***************************************************************
+ *
+ * Copyright:   (c) Copyright Motorola Inc. 1998
+ *
+ * Date:        April 17, 1998
+ *
+ * Function:    IDCT
+ *
+ * Description: Scaled Chen (III) algorithm for IDCT
+ *              Arithmetic is 16-bit fixed point.
+ *
+ * Inputs:      input - Pointer to input data (short), which
+ *                      must be between -2048 to +2047.
+ *                      It is assumed that the allocated array
+ *                      has been 128-bit aligned and contains
+ *                      8x8 short elements.
+ *
+ * Outputs:     output - Pointer to output area for the transformed
+ *                       data. The output values are between -255
+ *                       and 255 . It is assumed that a 128-bit
+ *                       aligned 8x8 array of short has been
+ *                       pre-allocated.
+ *
+ * Return:      None
+ *
+ ***************************************************************/
+
+static const signed short SpecialConstants[8] __attribute__ ((aligned (16))) = {	/* one 16-byte vector; IDCT splats elements 0..5 (vsplth 0x0-0x5), the last two are padding */
+			23170, 13573, 6518, 21895, -23170, -21895, 0, 0 };	/* presumably x32768 fixed-point cos(pi/4), tan(pi/8), tan(pi/16), tan(3pi/16) and negations -- TODO confirm */
+
+static const signed short PreScale[64] __attribute__ ((aligned (16))) = {	/* read-only: IDCT loads one 8-entry row per lvx and multiplies it into the matching input row (vmhraddshs) */
+			4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+			5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880,
+			5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+			4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+			4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+			4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+			5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+			5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 };
+
+static __inline__ void IDCT(short *input, short *output) {
+	/* 8x8 IDCT on 16-bit data.  In-place use (input == output) is safe: all eight
+	 * input rows are loaded before the first stvx.  Both buffers must be 128-bit
+	 * aligned (see the Motorola header comment above).  The code uses r5-r11 and
+	 * v0-v19; NOTE(review): v0-v19 are not in the clobber list -- confirm this is OK. */
+
+	asm volatile ( "
+		addi    11,0,16
+		addi    9,0,32
+		lvx     0,%1,11
+		addi    7,0,48
+		lvx     1,%3,11
+		vspltisw        2,0
+		lvx     3,0,%2
+		addi    10,0,80
+		lvx     4,%1,9
+		vmhraddshs      5,0,1,2
+		lvx     6,%3,9
+		addi    6,0,112
+		lvx     7,%1,7
+		vsplth  8,3,0x2
+		lvx     9,%3,7
+		vmhraddshs      10,4,6,2
+		lvx     11,%1,10
+		vsplth  12,3,0x1
+		lvx     13,%3,10
+		vsplth  14,3,0x3
+		lvx     15,%1,6
+		vmhraddshs      16,7,9,2
+		lvx     17,0,%1
+		vsplth  18,3,0x5
+		lvx     19,%3,6
+		addi    5,0,64
+		lvx     1,0,%3
+		vmhraddshs      0,11,13,2
+		addi    8,0,96
+		lvx     6,%1,5
+		lvx     4,%3,5
+		vmhraddshs      9,15,19,2
+		vmhraddshs      7,17,1,2
+		lvx     13,%1,8
+		lvx     11,%3,8
+		vmhraddshs      19,6,4,2
+		vmhraddshs      15,8,5,2
+		vsplth  1,3,0x0
+		vmhraddshs      17,13,11,2
+		vsplth  4,3,0x4
+		vmhraddshs      6,8,9,5
+		vmhraddshs      11,14,0,16
+		vmhraddshs      13,18,16,0
+		vmhraddshs      3,12,10,2
+		vsubshs 5,15,9
+		vsubshs 0,7,19
+		vsubshs 16,3,17
+		vmhraddshs      15,12,17,10
+		vsubshs 9,5,13
+		vsubshs 3,6,11
+		vaddshs 17,7,19
+		vaddshs 10,0,16
+		vsubshs 19,0,16
+		vsubshs 7,3,9
+		vaddshs 16,3,9
+		vaddshs 0,5,13
+		vmhraddshs      3,1,16,10
+		vaddshs 9,6,11
+		vmhraddshs      5,4,16,10
+		vaddshs 13,17,15
+		vmhraddshs      11,1,7,19
+		vsubshs 6,17,15
+		vmhraddshs      16,4,7,19
+		vaddshs 10,13,9
+		vmrghh  17,11,5
+		vsubshs 15,13,9
+		vmrglh  7,11,5
+		vaddshs 19,6,0
+		vmrghh  13,3,16
+		vsubshs 9,6,0
+		vmrghh  11,19,15
+		vmrghh  5,10,9
+		vmrglh  6,10,9
+		vmrglh  0,3,16
+		vmrglh  9,19,15
+		vmrghh  10,5,17
+		vmrghh  16,13,11
+		vmrglh  3,5,17
+		vmrghh  19,6,7
+		vmrglh  15,6,7
+		vmrglh  5,13,11
+		vmrghh  17,0,9
+		vmrglh  6,0,9
+		vmrglh  7,10,16
+		vmrghh  11,3,5
+		vmhraddshs      13,8,7,2
+		vmrglh  9,3,5
+		vmhraddshs      0,12,11,2
+		vmrglh  5,19,17
+		vmrglh  3,15,6
+		vmhraddshs      2,14,5,9
+		vmhraddshs      14,8,3,7
+		vmrghh  7,10,16
+		vmhraddshs      8,18,9,5
+		vmrghh  16,19,17
+		vmrghh  10,15,6
+		vsubshs 5,13,3
+		vsubshs 9,7,16
+		vsubshs 18,0,10
+		vmhraddshs      17,12,10,11
+		vsubshs 19,5,8
+		vsubshs 6,14,2
+		vaddshs 15,7,16
+		vaddshs 3,9,18
+		vsubshs 13,9,18
+		vsubshs 0,6,19
+		vaddshs 10,6,19
+		vaddshs 11,5,8
+		vmhraddshs	12,1,10,3
+		vaddshs		16,14,2
+		vmhraddshs	7,4,10,3
+		vaddshs		18,15,17
+		vmhraddshs	9,1,0,13
+		vsubshs		6,15,17
+		vmhraddshs	19,4,0,13
+		vaddshs		5,18,16
+		vsubshs		8,18,16
+		vaddshs		14,6,11
+		vsubshs		2,6,11
+		vmrghh		0,5,2
+		vmrglh		1,5,2
+		vmrghh		3,12,19
+		vmrglh		4,12,19
+		vmrghh		10,9,7
+		vmrglh		13,9,7
+		vmrghh		15,14,8
+		vmrglh		17,14,8
+		vmrghh		5,0,10
+		vmrglh		12,0,10
+		vmrghh		9,1,13
+		vmrglh		14,1,13
+		vmrghh		2,3,15
+		vmrglh		19,3,15
+		vmrghh		7,4,17
+		vmrglh		8,4,17
+		vmrghh		0,5,2
+		vmrglh		1,5,2
+		stvx		0,0,%0
+		vmrghh		3,12,19
+		stvx		1,%0,11
+		vmrglh		4,12,19
+		stvx		3,%0,9
+		vmrghh		10,9,7
+		stvx		4,%0,7
+		vmrglh		13,9,7
+		stvx		10,%0,5
+		vmrghh		15,14,8
+		stvx		13,%0,10
+		vmrglh		17,14,8
+		stvx		15,%0,8
+		stvx		17,%0,6
+		"
+		: /* no output operands: the result is stored through operand 0, covered by the "memory" clobber */
+		: "b" (output), "b" (input), "r" (SpecialConstants), "b" (PreScale)
+		: "cc", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory" );
+	/* "b" (base register, never r0) on operands 0, 1 and 3: they sit in the rA slot of lvx/stvx, where r0 would be read as literal 0. */
+
+}
diff -uNr vlc-0.2.82/plugins/idct/testdata.h vlc-0.2.82-altivec/plugins/idct/testdata.h
--- vlc-0.2.82/plugins/idct/testdata.h	Thu Jan  1 01:00:00 1970
+++ vlc-0.2.82-altivec/plugins/idct/testdata.h	Mon Aug 27 22:01:57 2001
@@ -0,0 +1,12 @@
+/* Test data for IDCT: a fixed block of 64 coefficients (8 per line below).
+ * vdec_IDCT copies it over p_block when idctaltivec.c is built with DEBUG defined. */
+dctelem_t testdata[] = {
+   131,     0,     0,   131,   -51,     0,     0,     0,
+     0,   -51,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,   -51,     0,     0,     0,     0,
+     0,     0,   101,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,   -51,     0,     0,
+     0,     0,     0,     0,   101,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,   101,     0,     0,     0,   101 };
+

Reply to: