"""Tests for CUDA synchronization primitives, shared memory and fences."""

import numpy as np

from numba import cuda, int32, float32
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
from numba.core.config import ENABLE_CUDASIM


def useless_syncthreads(ary):
    """Kernel: call ``syncthreads`` and then store each thread's index.

    The barrier protects no shared data; the kernel only checks that the
    call compiles and does not disturb the result.
    """
    i = cuda.grid(1)
    cuda.syncthreads()
    ary[i] = i


def useless_syncwarp(ary):
    """Kernel: call ``syncwarp`` with no mask, then store the thread index."""
    i = cuda.grid(1)
    cuda.syncwarp()
    ary[i] = i


def useless_syncwarp_with_mask(ary):
    """Kernel: call ``syncwarp`` with mask ``0xFFFF``, then store the index."""
    i = cuda.grid(1)
    cuda.syncwarp(0xFFFF)
    ary[i] = i


def coop_syncwarp(res):
    """Kernel: sum the thread indices with a shared-memory tree reduction.

    Written for a launch of a single block of 32 threads (the shared array
    has 32 slots and is indexed by ``cuda.grid(1)``). The total is written
    to ``res[0]``.
    """
    sm = cuda.shared.array(32, int32)
    i = cuda.grid(1)

    sm[i] = i
    cuda.syncwarp()

    # Each step halves the number of active threads. The partial-mask
    # syncwarp is issued only by the threads named in its mask, so it sits
    # inside the guard together with the accumulation it orders.
    if i < 16:
        sm[i] = sm[i] + sm[i + 16]
        cuda.syncwarp(0xFFFF)

    if i < 8:
        sm[i] = sm[i] + sm[i + 8]
        cuda.syncwarp(0xFF)

    if i < 4:
        sm[i] = sm[i] + sm[i + 4]
        cuda.syncwarp(0xF)

    if i < 2:
        sm[i] = sm[i] + sm[i + 2]
        cuda.syncwarp(0x3)

    if i == 0:
        res[0] = sm[0] + sm[1]


def simple_smem(ary):
    """Kernel: thread 0 fills shared memory, then every thread copies a slot.

    The ``syncthreads`` barrier makes thread 0's writes visible to the other
    threads before they read.
    """
    N = 100
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]


def coop_smem2d(ary):
    """Kernel: round-trip ``(i + 1) / (j + 1)`` through 2D shared memory."""
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]


def dyn_shared_memory(ary):
    """Kernel: round-trip ``i * 2`` through a shared array declared with size 0.

    The caller supplies the shared memory size in the launch configuration.
    """
    i = cuda.grid(1)
    sm = cuda.shared.array(0, float32)
    sm[i] = i * 2
    cuda.syncthreads()
    ary[i] = sm[i]


def use_threadfence(ary):
    """Kernel: two increments of ``ary[0]`` separated by ``threadfence``."""
    ary[0] += 123
    cuda.threadfence()
    ary[0] += 321


def use_threadfence_block(ary):
    """Kernel: two increments of ``ary[0]`` separated by ``threadfence_block``."""
    ary[0] += 123
    cuda.threadfence_block()
    ary[0] += 321


def use_threadfence_system(ary):
    """Kernel: two increments of ``ary[0]`` separated by ``threadfence_system``."""
    ary[0] += 123
    cuda.threadfence_system()
    ary[0] += 321


def use_syncthreads_count(ary_in, ary_out):
    """Kernel: store the result of ``syncthreads_count`` for each thread."""
    i = cuda.grid(1)
    ary_out[i] = cuda.syncthreads_count(ary_in[i])


def use_syncthreads_and(ary_in, ary_out):
    """Kernel: store the result of ``syncthreads_and`` for each thread."""
    i = cuda.grid(1)
    ary_out[i] = cuda.syncthreads_and(ary_in[i])


def use_syncthreads_or(ary_in, ary_out):
    """Kernel: store the result of ``syncthreads_or`` for each thread."""
    i = cuda.grid(1)
    ary_out[i] = cuda.syncthreads_or(ary_in[i])


def _safe_cc_check(cc):
    """Return whether the current device has compute capability >= ``cc``.

    Always returns True when the CUDA simulator is enabled, in which case
    the device is not queried at all.
    """
    if ENABLE_CUDASIM:
        return True
    else:
        return cuda.get_current_device().compute_capability >= cc


class TestCudaSync(CUDATestCase):
    """Tests for barriers, warp sync, shared memory and memory fences."""

    def _test_useless(self, kernel):
        """Launch ``kernel`` on 10 threads and expect ``arange(10)`` back."""
        compiled = cuda.jit("void(int32[::1])")(kernel)
        nelem = 10
        ary = np.empty(nelem, dtype=np.int32)
        exp = np.arange(nelem, dtype=np.int32)
        compiled[1, nelem](ary)
        np.testing.assert_equal(ary, exp)

    def _test_threadfence_codegen(self, kernel, instruction):
        """Run a fence kernel and check its result and generated PTX.

        Does not test the runtime behavior of the fence, just the code
        generation: ``instruction`` must appear in the PTX for the compiled
        signature. The PTX check is skipped under the simulator.
        """
        sig = (int32[:],)
        compiled = cuda.jit(sig)(kernel)
        ary = np.zeros(10, dtype=np.int32)
        compiled[1, 1](ary)
        self.assertEqual(123 + 321, ary[0])
        if not ENABLE_CUDASIM:
            self.assertIn(instruction, compiled.ptx[sig])

    def test_useless_syncthreads(self):
        self._test_useless(useless_syncthreads)

    @skip_on_cudasim("syncwarp not implemented on cudasim")
    def test_useless_syncwarp(self):
        self._test_useless(useless_syncwarp)

    @skip_on_cudasim("syncwarp not implemented on cudasim")
    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Partial masks require CC 7.0 or greater")
    def test_useless_syncwarp_with_mask(self):
        self._test_useless(useless_syncwarp_with_mask)

    @skip_on_cudasim("syncwarp not implemented on cudasim")
    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Partial masks require CC 7.0 or greater")
    def test_coop_syncwarp(self):
        # coop_syncwarp computes the sum of all integers from 0 to 31 (496)
        # using a single warp
        expected = 496
        nthreads = 32
        nblocks = 1

        compiled = cuda.jit("void(int32[::1])")(coop_syncwarp)
        res = np.zeros(1, dtype=np.int32)
        compiled[nblocks, nthreads](res)
        np.testing.assert_equal(expected, res[0])

    def test_simple_smem(self):
        compiled = cuda.jit("void(int32[::1])")(simple_smem)
        nelem = 100
        ary = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary)
        # assert_array_equal reports the mismatching elements on failure.
        np.testing.assert_array_equal(ary, np.arange(nelem, dtype=np.int32))

    def test_coop_smem2d(self):
        compiled = cuda.jit("void(float32[:,::1])")(coop_smem2d)
        shape = 10, 20
        ary = np.empty(shape, dtype=np.float32)
        compiled[1, shape](ary)

        # Broadcast a column of row indices against a row of column indices
        # to build the same (i + 1) / (j + 1) table the kernel computes.
        rows = np.arange(shape[0])[:, np.newaxis]
        cols = np.arange(shape[1])[np.newaxis, :]
        exp = ((rows + 1) / (cols + 1)).astype(ary.dtype)

        # Tolerances match the np.allclose defaults.
        np.testing.assert_allclose(ary, exp, rtol=1e-5, atol=1e-8)

    def test_dyn_shared_memory(self):
        compiled = cuda.jit("void(float32[::1])")(dyn_shared_memory)
        nelem = 50
        ary = np.empty(nelem, dtype=np.float32)
        # The fourth launch parameter is the dynamic shared memory size in
        # bytes: one element of ary's dtype per thread.
        compiled[1, nelem, 0, ary.size * ary.itemsize](ary)
        np.testing.assert_array_equal(
            ary, 2 * np.arange(ary.size, dtype=np.int32)
        )

    def test_threadfence_codegen(self):
        self._test_threadfence_codegen(use_threadfence, "membar.gl;")

    def test_threadfence_block_codegen(self):
        self._test_threadfence_codegen(use_threadfence_block, "membar.cta;")

    def test_threadfence_system_codegen(self):
        self._test_threadfence_codegen(use_threadfence_system, "membar.sys;")

    def test_syncthreads_count(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_syncthreads_count)
        ary_in = np.ones(72, dtype=np.int32)
        ary_out = np.zeros(72, dtype=np.int32)
        # Two of the 72 predicates are zero, so every thread should see 70.
        ary_in[31] = 0
        ary_in[42] = 0
        compiled[1, 72](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 70)

    def test_syncthreads_and(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_syncthreads_and)
        nelem = 100
        ary_in = np.ones(nelem, dtype=np.int32)
        ary_out = np.zeros(nelem, dtype=np.int32)

        # All predicates non-zero: every thread gets 1.
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)

        # A single zero predicate flips the result for every thread.
        ary_in[31] = 0
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)

    def test_syncthreads_or(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_syncthreads_or)
        nelem = 100
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.zeros(nelem, dtype=np.int32)

        # All predicates zero: every thread gets 0.
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)

        # A single non-zero predicate flips the result for every thread.
        ary_in[31] = 1
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)


if __name__ == '__main__':
    unittest.main()