
Bug#723793: Bug#723587: release.debian.org: Non-free file in PyOpenCL - new version upload to stable and oldstable



On Wed, 2013-10-02 at 01:12 +0200, Cyril Brulebois wrote:
> Hi again Tomasz,
> 
> Cyril Brulebois <kibi@debian.org> (2013-09-23):
> > The squeeze.diff one seems to have unrelated noise in patches,
> > presumably because something refreshed them while you were preparing
> > the diff? Having a targeted patch like the first one would be nice,
> > so please follow up to 723793@bugs.debian.org with a cleaner debdiff.
> 
> Kind reminder: the o-p-u NEW freeze is less than two weeks from now. Fixing
> this issue in a later point release is of course perfectly OK, I just
> thought I'd give you a heads-up.
> 

Thanks for the reminder.
I was aware of this, but other matters, completely unrelated
to the software world, took up my time and energy.

I am attaching a new patch with the proposed changes. It only touches
the changelog, debian/rules (removing the offending file when the
tarball is repacked), and the offending file itself; a quick
verification is sketched below.
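
For completeness, the removal can be double-checked against the
repacked tarball with something like this (a rough sketch only: it
assumes the rules hunk below lives in the usual get-orig-source
target and that DEB_UPSTREAM_VERSION expands to 0.92.dfsg, neither
of which is shown in the diff):

    # Rebuild the repacked orig tarball (target name assumed),
    # then list its contents; grep printing nothing means the
    # non-free example was really excluded.
    $ debian/rules get-orig-source
    $ tar tzf pyopencl_0.92.dfsg.orig.tar.gz | grep matrix-multiply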

Best regards.

-- 
Tomasz Rybak  GPG/PGP key ID: 2AD5 9860
Fingerprint A481 824E 7DD3 9C0E C40A  488E C654 FB33 2AD5 9860
http://member.acm.org/~tomaszrybak

diff -Nru pyopencl-0.92/debian/changelog pyopencl-0.92.dfsg/debian/changelog
--- pyopencl-0.92/debian/changelog	2010-11-11 23:10:57.000000000 +0100
+++ pyopencl-0.92.dfsg/debian/changelog	2013-10-04 18:00:49.000000000 +0200
@@ -1,3 +1,9 @@
+pyopencl (0.92.dfsg-1) oldstable; urgency=low
+
+  * Remove non-free file from examples (#722014, #723793).
+
+ -- Tomasz Rybak <tomasz.rybak@post.pl>  Fri, 04 Oct 2013 17:54:19 +0200
+
 pyopencl (0.92-1) unstable; urgency=high
 
   * New upstream release
diff -Nru pyopencl-0.92/debian/rules pyopencl-0.92.dfsg/debian/rules
--- pyopencl-0.92/debian/rules	2010-11-11 13:30:27.000000000 +0100
+++ pyopencl-0.92.dfsg/debian/rules	2013-10-04 18:00:49.000000000 +0200
@@ -36,6 +36,7 @@
 	git clone $(GIT_URL) $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)
 	cd $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION) && git checkout $(GIT_REVISION)
 	rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.git $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.gitignore $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/.gitmodules
+	rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)/examples/matrix-multiply.py
 	tar czf $(MODULE_NAME)_$(DEB_UPSTREAM_VERSION).orig.tar.gz $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)
 	rm -rf $(MODULE_NAME)-$(DEB_UPSTREAM_VERSION)
 
diff -Nru pyopencl-0.92/examples/matrix-multiply.py pyopencl-0.92.dfsg/examples/matrix-multiply.py
--- pyopencl-0.92/examples/matrix-multiply.py	2010-10-21 19:10:19.000000000 +0200
+++ pyopencl-0.92.dfsg/examples/matrix-multiply.py	1970-01-01 01:00:00.000000000 +0100
@@ -1,241 +0,0 @@
-# example provided by Eilif Muller
-
-from __future__ import division
-
-KERNEL_CODE = """
-
-// Thread block size
-#define BLOCK_SIZE %(block_size)d
-
-// Matrix dimensions
-// (chosen as multiples of the thread block size for simplicity)
-#define WA %(w_a)d // Matrix A width
-#define HA %(h_a)d // Matrix A height
-#define WB %(w_b)d // Matrix B width
-#define HB WA  // Matrix B height
-#define WC WB  // Matrix C width
-#define HC HA  // Matrix C height
-
-
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and
- * proprietary rights in and to this software and related documentation.
- * Any use, reproduction, disclosure, or distribution of this software
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
- *
- */
-
-/* Matrix multiplication: C = A * B.
- * Device code.
- */
-
-#define AS(j, i) As[i + j * BLOCK_SIZE]
-#define BS(j, i) Bs[i + j * BLOCK_SIZE]
-
-////////////////////////////////////////////////////////////////////////////////
-//! Matrix multiplication on the device: C = A * B
-//! WA is A's width and WB is B's width
-////////////////////////////////////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1))) 
-void
-matrixMul( __global float* C, __global float* A, __global float* B)
-{
-    __local float As[BLOCK_SIZE*BLOCK_SIZE];
-    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
-
-    // Block index
-    int bx = get_group_id(0);
-    int by = get_group_id(1);
-
-    // Thread index
-    int tx = get_local_id(0);
-    int ty = get_local_id(1);
-
-    // Index of the first sub-matrix of A processed by the block
-    int aBegin = WA * BLOCK_SIZE * by;
-
-    // Index of the last sub-matrix of A processed by the block
-    int aEnd   = aBegin + WA - 1;
-
-    // Step size used to iterate through the sub-matrices of A
-    int aStep  = BLOCK_SIZE;
-
-    // Index of the first sub-matrix of B processed by the block
-    int bBegin = BLOCK_SIZE * bx;
-
-    // Step size used to iterate through the sub-matrices of B
-    int bStep  = BLOCK_SIZE * WB;
-
-    // Csub is used to store the element of the block sub-matrix
-    // that is computed by the thread
-    float Csub = 0.0f;
-
-    // Loop over all the sub-matrices of A and B
-    // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin;
-             a <= aEnd;
-             a += aStep, b += bStep) {
-
-        // Load the matrices from device memory
-        // to shared memory; each thread loads
-        // one element of each matrix
-        AS(ty, tx) = A[a + WA * ty + tx];
-        BS(ty, tx) = B[b + WB * ty + tx];
-
-        // Synchronize to make sure the matrices are loaded
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Multiply the two matrices together;
-        // each thread computes one element
-        // of the block sub-matrix
-        for (int k = 0; k < BLOCK_SIZE; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
-
-        // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    // Write the block sub-matrix to device memory;
-    // each thread writes one element
-    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
-
-}
-
-"""
-
-import pyopencl as cl
-from time import time
-import numpy
-
-block_size = 16
-
-ctx = cl.create_some_context()
-
-for dev in ctx.devices:
-    assert dev.local_mem_size > 0
-
-queue = cl.CommandQueue(ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-#queue = cl.CommandQueue(ctx)
-
-if False:
-    a_height = 4096
-    #a_height = 1024
-    a_width = 2048
-    #a_width = 256
-    #b_height == a_width
-    b_width = a_height
-
-elif False:
-    # like PyCUDA
-    a_height = 2516
-    a_width = 1472
-    b_height = a_width
-    b_width = 2144
-
-else:
-    # CL SDK
-    a_width = 50*block_size
-    a_height = 100*block_size
-    b_width = 50*block_size
-    b_height = a_width
-
-c_width = b_width
-c_height = a_height
-
-h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
-h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
-h_c = numpy.empty((c_height, c_width)).astype(numpy.float32)
-
-
-kernel_params = {"block_size": block_size,
-        "w_a":a_width, "h_a":a_height, "w_b":b_width}
-
-if "NVIDIA" in queue.device.vendor:
-    options = "-cl-mad-enable -cl-fast-relaxed-math"
-else:
-    options = None
-prg = cl.Program(ctx, KERNEL_CODE % kernel_params,
-        ).build(options=options)
-kernel = prg.matrixMul
-#print prg.binaries[0]
-
-assert a_width % block_size == 0
-assert a_height % block_size == 0
-assert b_width % block_size == 0
-
-# transfer host -> device -----------------------------------------------------
-mf = cl.mem_flags
-
-t1 = time()
-
-d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
-d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
-d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)
-
-push_time = time()-t1
-
-# warmup ----------------------------------------------------------------------
-for i in range(5):
-    event = kernel(queue, h_c.shape, (block_size, block_size), 
-            d_c_buf, d_a_buf, d_b_buf)
-    event.wait()
-
-queue.finish()
-
-# actual benchmark ------------------------------------------------------------
-t1 = time()
-
-count = 20
-for i in range(count):
-    event = kernel(queue, h_c.shape, (block_size, block_size),
-            d_c_buf, d_a_buf, d_b_buf)
-
-event.wait()
-
-gpu_time = (time()-t1)/count
-
-# transfer device -> host -----------------------------------------------------
-t1 = time()
-cl.enqueue_read_buffer(queue, d_c_buf, h_c).wait()
-pull_time = time()-t1
-
-# timing output ---------------------------------------------------------------
-gpu_total_time = gpu_time+push_time+pull_time
-
-print "GPU push+compute+pull total [s]:", gpu_total_time
-print "GPU push [s]:", push_time
-print "GPU pull [s]:", pull_time
-print "GPU compute (host-timed) [s]:", gpu_time
-print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9
-
-gflop = h_c.size * (a_width * 2.) / (1000**3.)
-gflops = gflop / gpu_time
-
-print
-print "GFlops/s:", gflops
-
-# cpu comparison --------------------------------------------------------------
-t1 = time()
-h_c_cpu = numpy.dot(h_a,h_b)
-cpu_time = time()-t1
-
-print
-print "GPU==CPU:",numpy.allclose(h_c, h_c_cpu)
-print
-print "CPU time (s)", cpu_time
-print
-
-print "GPU speedup (with transfer): ", cpu_time/gpu_total_time
-print "GPU speedup (without transfer): ", cpu_time/gpu_time
-
