from __future__ import print_function import numpy as np from numba import config, cuda, int32 from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim, skip_unless_cc_60, skip_if_cudadevrt_missing) @cuda.jit def this_grid(A): cuda.cg.this_grid() A[0] = 1.0 @cuda.jit def sync_group(A): g = cuda.cg.this_grid() g.sync() A[0] = 1.0 @cuda.jit def no_sync(A): A[0] = cuda.grid(1) def sequential_rows(M): # The grid writes rows one at a time. Each thread reads an element from # the previous row written by its "opposite" thread. # # A failure to sync the grid at each row would result in an incorrect # result as some threads could run ahead of threads in other blocks, or # fail to see the update to the previous row from their opposite thread. col = cuda.grid(1) g = cuda.cg.this_grid() rows = M.shape[0] cols = M.shape[1] for row in range(1, rows): opposite = cols - col - 1 M[row, col] = M[row - 1, opposite] + 1 g.sync() @skip_if_cudadevrt_missing class TestCudaCooperativeGroups(CUDATestCase): @skip_unless_cc_60 def test_this_grid(self): A = np.full(1, fill_value=np.nan) this_grid[1, 1](A) # Ensure the kernel executed beyond the call to cuda.this_grid() self.assertFalse(np.isnan(A[0]), 'Value was not set') @skip_unless_cc_60 @skip_on_cudasim("Simulator doesn't differentiate between normal and " "cooperative kernels") def test_this_grid_is_cooperative(self): A = np.full(1, fill_value=np.nan) this_grid[1, 1](A) # this_grid should have been determined to be cooperative for key, overload in this_grid.overloads.items(): self.assertTrue(overload.cooperative) @skip_unless_cc_60 def test_sync_group(self): A = np.full(1, fill_value=np.nan) sync_group[1, 1](A) # Ensure the kernel executed beyond the call to cuda.sync_group() self.assertFalse(np.isnan(A[0]), 'Value was not set') @skip_unless_cc_60 @skip_on_cudasim("Simulator doesn't differentiate between normal and " "cooperative kernels") def test_sync_group_is_cooperative(self): A = np.full(1, fill_value=np.nan) sync_group[1, 1](A) # sync_group should have been determined to be cooperative for key, overload in sync_group.overloads.items(): self.assertTrue(overload.cooperative) @skip_on_cudasim("Simulator does not implement linking") def test_false_cooperative_doesnt_link_cudadevrt(self): """ We should only mark a kernel as cooperative and link cudadevrt if the kernel uses grid sync. Here we ensure that one that doesn't use grid synsync isn't marked as such. """ A = np.full(1, fill_value=np.nan) no_sync[1, 1](A) for key, overload in no_sync.overloads.items(): self.assertFalse(overload.cooperative) for link in overload._codelibrary._linking_files: self.assertNotIn('cudadevrt', link) @skip_unless_cc_60 def test_sync_at_matrix_row(self): if config.ENABLE_CUDASIM: # Use a small matrix to compute using a single block in a # reasonable amount of time shape = (32, 32) else: shape = (1024, 1024) A = np.zeros(shape, dtype=np.int32) blockdim = 32 griddim = A.shape[1] // blockdim sig = (int32[:,::1],) c_sequential_rows = cuda.jit(sig)(sequential_rows) overload = c_sequential_rows.overloads[sig] mb = overload.max_cooperative_grid_blocks(blockdim) if griddim > mb: unittest.skip("GPU cannot support enough cooperative grid blocks") c_sequential_rows[griddim, blockdim](A) reference = np.tile(np.arange(shape[0]), (shape[1], 1)).T np.testing.assert_equal(A, reference) @skip_unless_cc_60 def test_max_cooperative_grid_blocks(self): # The maximum number of blocks will vary based on the device so we # can't test for an expected value, but we can check that the function # doesn't error, and that varying the number of dimensions of the block # whilst keeping the total number of threads constant doesn't change # the maximum to validate some of the logic. sig = (int32[:,::1],) c_sequential_rows = cuda.jit(sig)(sequential_rows) overload = c_sequential_rows.overloads[sig] blocks1d = overload.max_cooperative_grid_blocks(256) blocks2d = overload.max_cooperative_grid_blocks((16, 16)) blocks3d = overload.max_cooperative_grid_blocks((16, 4, 4)) self.assertEqual(blocks1d, blocks2d) self.assertEqual(blocks1d, blocks3d) if __name__ == '__main__': unittest.main()