# -*- coding: utf-8 -*- from __future__ import print_function, absolute_import, division import sys import os import re import multiprocessing import unittest import numpy as np from numba import (njit, set_num_threads, get_num_threads, prange, config, threading_layer, guvectorize) from numba.np.ufunc.parallel import _get_thread_id from numba.core.errors import TypingError from numba.tests.support import TestCase, skip_parfors_unsupported, tag from numba.tests.test_parallel_backend import TestInSubprocess class TestNumThreads(TestCase): _numba_parallel_test_ = False def setUp(self): # Make sure the num_threads is set to the max. This also makes sure # the threads are launched. set_num_threads(config.NUMBA_NUM_THREADS) def check_mask(self, expected, result): # There's no guarantee that TBB will use a full mask worth of # threads if it deems it inefficient to do so if threading_layer() == 'tbb': self.assertTrue(np.all(result <= expected)) elif threading_layer() in ('omp', 'workqueue'): np.testing.assert_equal(expected, result) else: assert 0, 'unreachable' @skip_parfors_unsupported def test_set_num_threads_type(self): @njit def foo(): set_num_threads('wrong_type') expected = "The number of threads specified must be an integer" for fn, errty in ((foo, TypingError), (foo.py_func, TypeError)): with self.assertRaises(errty) as raises: fn() self.assertIn(expected, str(raises.exception)) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_basic(self): max_threads = config.NUMBA_NUM_THREADS self.assertEqual(get_num_threads(), max_threads) set_num_threads(2) self.assertEqual(get_num_threads(), 2) set_num_threads(max_threads) self.assertEqual(get_num_threads(), max_threads) with self.assertRaises(ValueError): set_num_threads(0) with self.assertRaises(ValueError): set_num_threads(max_threads + 1) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_basic_jit(self): max_threads = config.NUMBA_NUM_THREADS @njit def get_n(): return get_num_threads() self.assertEqual(get_n(), max_threads) set_num_threads(2) self.assertEqual(get_n(), 2) set_num_threads(max_threads) self.assertEqual(get_n(), max_threads) @njit def set_get_n(n): set_num_threads(n) return get_num_threads() self.assertEqual(set_get_n(2), 2) self.assertEqual(set_get_n(max_threads), max_threads) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_basic_guvectorize(self): max_threads = config.NUMBA_NUM_THREADS @guvectorize(['void(int64[:])'], '(n)', nopython=True, target='parallel') def get_n(x): x[:] = get_num_threads() x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, max_threads) set_num_threads(2) x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, 2) set_num_threads(max_threads) x = np.zeros((5000000,), dtype=np.int64) get_n(x) np.testing.assert_equal(x, max_threads) @guvectorize(['void(int64[:])'], '(n)', nopython=True, target='parallel') def set_get_n(n): set_num_threads(n[0]) n[:] = get_num_threads() x = np.zeros((5000000,), dtype=np.int64) x[0] = 2 set_get_n(x) np.testing.assert_equal(x, 2) x = np.zeros((5000000,), dtype=np.int64) x[0] = max_threads set_get_n(x) np.testing.assert_equal(x, max_threads) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_outside_jit(self): # Test set_num_threads outside a jitted function set_num_threads(2) @njit(parallel=True) def test_func(): x = 5 buf = np.empty((x,)) for i in prange(x): buf[i] = get_num_threads() return buf @guvectorize(['void(int64[:])'], '(n)', nopython=True, target='parallel') def test_gufunc(x): x[:] = get_num_threads() out = test_func() np.testing.assert_equal(out, 2) x = np.zeros((5000000,), dtype=np.int64) test_gufunc(x) np.testing.assert_equal(x, 2) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_inside_jit(self): # Test set_num_threads inside a jitted function @njit(parallel=True) def test_func(nthreads): x = 5 buf = np.empty((x,)) set_num_threads(nthreads) for i in prange(x): buf[i] = get_num_threads() return buf mask = 2 out = test_func(mask) np.testing.assert_equal(out, mask) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_set_num_threads_inside_guvectorize(self): # Test set_num_threads inside a jitted guvectorize function @guvectorize(['void(int64[:])'], '(n)', nopython=True, target='parallel') def test_func(x): set_num_threads(x[0]) x[:] = get_num_threads() x = np.zeros((5000000,), dtype=np.int64) mask = 2 x[0] = mask test_func(x) np.testing.assert_equal(x, mask) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_get_num_threads_truth_outside_jit(self): for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): set_num_threads(mask) # a lot of work, hopefully will trigger "mask" count of threads to # join the parallel region (for those backends with dynamic threads) @njit(parallel=True) def test_func(): x = 5000000 buf = np.empty((x,)) for i in prange(x): buf[i] = _get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() self.check_mask((mask, mask), out) @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', nopython=True, target='parallel') def test_gufunc(x, out): x[:] = _get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) self.check_mask(mask, out) self.check_mask(mask, len(np.unique(x))) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_get_num_threads_truth_inside_jit(self): for mask in range(2, min(6, config.NUMBA_NUM_THREADS + 1)): # a lot of work, hopefully will trigger "mask" count of threads to # join the parallel region (for those backends with dynamic threads) @njit(parallel=True) def test_func(): set_num_threads(mask) x = 5000000 buf = np.empty((x,)) for i in prange(x): buf[i] = _get_thread_id() return len(np.unique(buf)), get_num_threads() out = test_func() self.check_mask((mask, mask), out) @guvectorize(['void(int64[:], int64[:])'], '(n), (m)', nopython=True, target='parallel') def test_gufunc(x, out): set_num_threads(mask) x[:] = _get_thread_id() out[0] = get_num_threads() # Reshape to force parallelism x = np.full((5000000,), -1, dtype=np.int64).reshape((100, 50000)) out = np.zeros((1,), dtype=np.int64) test_gufunc(x, out) self.check_mask(mask, out) self.check_mask(mask, len(np.unique(x))) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_nested_parallelism_1(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") # check that get_num_threads is ok in nesting mask = config.NUMBA_NUM_THREADS - 1 N = config.NUMBA_NUM_THREADS M = 2 * config.NUMBA_NUM_THREADS @njit(parallel=True) def child_func(buf, fid): M, N = buf.shape for i in prange(N): buf[fid, i] = get_num_threads() def get_test(test_type): if test_type == 'njit': def test_func(nthreads, py_func=False): @njit(parallel=True) def _test_func(nthreads): acc = 0 buf = np.zeros((M, N)) set_num_threads(nthreads) for i in prange(M): local_mask = 1 + i % mask # set threads in parent function set_num_threads(local_mask) if local_mask < N: child_func(buf, local_mask) acc += get_num_threads() return acc, buf if py_func: return _test_func.py_func(nthreads) else: return _test_func(nthreads) elif test_type == 'guvectorize': def test_func(nthreads, py_func=False): def _test_func(acc, buf, local_mask): set_num_threads(nthreads) # set threads in parent function set_num_threads(local_mask[0]) if local_mask[0] < N: child_func(buf, local_mask[0]) acc[0] += get_num_threads() buf = np.zeros((M, N), dtype=np.int64) acc = np.zeros((M, 1), dtype=np.int64) local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) sig = ['void(int64[:], int64[:, :], int64[:])'] layout = '(p), (n, m), (p)' if not py_func: _test_func = guvectorize(sig, layout, nopython=True, target='parallel')(_test_func) else: _test_func = guvectorize(sig, layout, forceobj=True)(_test_func) _test_func(acc, buf, local_mask) return acc, buf return test_func for test_type in ['njit', 'guvectorize']: test_func = get_test(test_type) got_acc, got_arr = test_func(mask) exp_acc, exp_arr = test_func(mask, py_func=True) np.testing.assert_equal(exp_acc, got_acc) np.testing.assert_equal(exp_arr, got_arr) # check the maths reconciles, guvectorize does not reduce, njit does math_acc_exp = 1 + np.arange(M) % mask if test_type == 'guvectorize': math_acc = math_acc_exp.reshape((M, 1)) else: math_acc = np.sum(math_acc_exp) np.testing.assert_equal(math_acc, got_acc) math_arr = np.zeros((M, N)) for i in range(1, N): # there's branches on 1, ..., num_threads - 1 math_arr[i, :] = i np.testing.assert_equal(math_arr, got_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") def _test_nested_parallelism_2(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") # check that get_num_threads is ok in nesting N = config.NUMBA_NUM_THREADS + 1 M = 4 * config.NUMBA_NUM_THREADS + 1 def get_impl(child_type, test_type): if child_type == 'parallel': child_dec = njit(parallel=True) elif child_type == 'njit': child_dec = njit(parallel=False) elif child_type == 'none': def child_dec(x): return x @child_dec def child(buf, fid): M, N = buf.shape set_num_threads(fid) # set threads in child function for i in prange(N): buf[fid, i] = get_num_threads() if test_type in ['parallel', 'njit', 'none']: if test_type == 'parallel': test_dec = njit(parallel=True) elif test_type == 'njit': test_dec = njit(parallel=False) elif test_type == 'none': def test_dec(x): return x @test_dec def test_func(nthreads): buf = np.zeros((M, N)) set_num_threads(nthreads) for i in prange(M): local_mask = 1 + i % mask # when the threads exit the child functions they should # have a TLS slot value of the local mask as it was set # in child if local_mask < config.NUMBA_NUM_THREADS: child(buf, local_mask) assert get_num_threads() == local_mask return buf else: if test_type == 'guvectorize': test_dec = guvectorize(['int64[:,:], int64[:]'], '(n, m), (k)', nopython=True, target='parallel') elif test_type == 'guvectorize-obj': test_dec = guvectorize(['int64[:,:], int64[:]'], '(n, m), (k)', forceobj=True) def test_func(nthreads): @test_dec def _test_func(buf, local_mask): set_num_threads(nthreads) # when the threads exit the child functions they should # have a TLS slot value of the local mask as it was set # in child if local_mask[0] < config.NUMBA_NUM_THREADS: child(buf, local_mask[0]) assert get_num_threads() == local_mask[0] buf = np.zeros((M, N), dtype=np.int64) local_mask = (1 + np.arange(M) % mask).reshape((M, 1)) _test_func(buf, local_mask) return buf return test_func mask = config.NUMBA_NUM_THREADS - 1 res_arrays = {} for test_type in ['parallel', 'njit', 'none', 'guvectorize', 'guvectorize-obj']: for child_type in ['parallel', 'njit', 'none']: if child_type == 'none' and test_type != 'none': continue set_num_threads(mask) res_arrays[test_type, child_type] = get_impl( child_type, test_type)(mask) py_arr = res_arrays['none', 'none'] for arr in res_arrays.values(): np.testing.assert_equal(arr, py_arr) # check the maths reconciles math_arr = np.zeros((M, N)) # there's branches on modulo mask but only NUMBA_NUM_THREADS funcs for i in range(1, config.NUMBA_NUM_THREADS): math_arr[i, :] = i np.testing.assert_equal(math_arr, py_arr) # this test can only run on OpenMP (providing OMP_MAX_ACTIVE_LEVELS is not # set or >= 2) and TBB backends # This test needs at least 3 threads to run, N>=2 for the launch, M>=N+1 for # the nested function @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 3, "Not enough CPU cores") def _test_nested_parallelism_3(self): if threading_layer() == 'workqueue': self.skipTest("workqueue is not threadsafe") # check that the right number of threads are present in nesting # this relies on there being a load of cores present BIG = 1000000 @njit(parallel=True) def work(local_nt): # arg is value 3 tid = np.zeros(BIG) acc = 0 set_num_threads(local_nt) # set to 3 threads for i in prange(BIG): acc += 1 tid[i] = _get_thread_id() return acc, np.unique(tid) @njit(parallel=True) def test_func_jit(nthreads): set_num_threads(nthreads) # set to 2 threads lens = np.zeros(nthreads) total = 0 for i in prange(nthreads): my_acc, tids = work(nthreads + 1) # call with value 3 lens[i] = len(tids) total += my_acc return total, np.unique(lens) NT = 2 expected_acc = BIG * NT expected_thread_count = NT + 1 got_acc, got_tc = test_func_jit(NT) self.assertEqual(expected_acc, got_acc) self.check_mask(expected_thread_count, got_tc) def test_guvectorize(nthreads): @guvectorize(['int64[:], int64[:]'], '(n), (n)', nopython=True, target='parallel') def test_func_guvectorize(total, lens): my_acc, tids = work(nthreads + 1) lens[0] = len(tids) total[0] += my_acc total = np.zeros((nthreads, 1), dtype=np.int64) lens = np.zeros(nthreads, dtype=np.int64).reshape((nthreads, 1)) test_func_guvectorize(total, lens) # vectorize does not reduce, so total is summed return total.sum(), np.unique(lens) got_acc, got_tc = test_guvectorize(NT) self.assertEqual(expected_acc, got_acc) self.check_mask(expected_thread_count, got_tc) @skip_parfors_unsupported @unittest.skipIf(config.NUMBA_NUM_THREADS < 2, "Not enough CPU cores") @unittest.skipIf(not sys.platform.startswith('linux'), "Linux only") def _test_threadmask_across_fork(self): forkctx = multiprocessing.get_context('fork') @njit def foo(): return get_num_threads() def wrap(queue): queue.put(foo()) mask = 1 self.assertEqual(foo(), config.NUMBA_NUM_THREADS) set_num_threads(mask) self.assertEqual(foo(), mask) shared_queue = forkctx.Queue() # check TLS slot inheritance in fork p = forkctx.Process(target=wrap, args=(shared_queue,)) p.start() p.join() self.assertEqual(shared_queue.get(), mask) def tearDown(self): set_num_threads(config.NUMBA_NUM_THREADS) class TestNumThreadsBackends(TestInSubprocess, TestCase): _class = TestNumThreads _DEBUG = False # 1 is mainly here to ensure tests skip correctly num_threads = [i for i in [1, 2, 4, 8, 16] if i <= config.NUMBA_NUM_THREADS] def run_test_in_separate_process(self, test, threading_layer, num_threads): env_copy = os.environ.copy() env_copy['NUMBA_THREADING_LAYER'] = str(threading_layer) env_copy['NUMBA_NUM_THREADS'] = str(num_threads) cmdline = [sys.executable, "-m", "numba.runtests", "-v", test] return self.run_cmd(cmdline, env_copy) @classmethod def _inject(cls, name, backend, backend_guard, num_threads): themod = cls.__module__ thecls = cls._class.__name__ injected_method = '%s.%s.%s' % (themod, thecls, name) def test_template(self): o, e = self.run_test_in_separate_process(injected_method, backend, num_threads) if self._DEBUG: print('stdout:\n "%s"\n stderr:\n "%s"' % (o, e)) self.assertIn('OK', e) self.assertTrue('FAIL' not in e) self.assertTrue('ERROR' not in e) m = re.search(r"\.\.\. skipped '(.*?)'", e) if m: self.skipTest(m.group(1)) injected_test = "%s_%s_%s_threads" % (name[1:], backend, num_threads) setattr(cls, injected_test, tag('long_running')(backend_guard(test_template))) @classmethod def generate(cls): for name in cls._class.__dict__.copy(): for backend, backend_guard in cls.backends.items(): for num_threads in cls.num_threads: if not name.startswith('_test_'): continue cls._inject(name, backend, backend_guard, num_threads) TestNumThreadsBackends.generate() if __name__ == '__main__': unittest.main()