Source code for pysb.simulator.opencl_ssa

from pysb.simulator.base import SimulationResult
from pysb.bng import generate_equations
from pysb.simulator.ssa_base import SSABase
import numpy as np
import os
import random
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial


try:
    import pyopencl as cl
    from pyopencl import array as ocl_array
    from pyopencl import device_type
    from mako.template import Template
except ImportError:
    cl = None
    device_type = None
    ocl_array = None


class OpenCLSSASimulator(SSABase):
    """
    OpenCL SSA simulator

    This simulator is capable of using either a GPU or a multi-core CPU.
    The simulator will detect the available devices and ask which one you
    would like to use. Alternatively, you can set the device with the
    environment variable `PYOPENCL_CTX`.

    Requires `OpenCL`_ and `PyOpenCL`_.

    .. _OpenCL :
        https://www.khronos.org/opencl/
    .. _PyOpenCL :
        https://documen.tician.de/pyopencl/

    Parameters
    ----------
    model : pysb.Model
        Model to simulate.
    tspan : vector-like, optional
        Time values over which to simulate. The first and last values define
        the time range. Returned trajectories are sampled at every value
        unless the simulation is interrupted for some reason, e.g., due to
        satisfaction of a logical stopping criterion (see 'tout' below).
    initials : vector-like or dict, optional
        Values to use for the initial condition of all species. Ordering is
        determined by the order of model.species. If not specified, initial
        conditions will be taken from model.initial_conditions (with initial
        condition parameter values taken from `param_values` if specified).
    param_values : vector-like or dict, optional
        Values to use for every parameter in the model. Ordering is
        determined by the order of model.parameters. If passed as a
        dictionary, keys must be parameter names. If not specified, parameter
        values will be taken directly from model.parameters.
    verbose : bool, optional (default: False)
        Verbose output.
    precision : (np.float64, np.float32)
        Precision for the SSA simulation. Default is np.float64. float32
        should be used with caution.

    Attributes
    ----------
    verbose : bool
        Verbosity flag passed to the constructor.
    model : pysb.Model
        Model passed to the constructor.
    tspan : vector-like
        Time values passed to the constructor.
    """
    _supports = {'multi_initials': True, 'multi_param_values': True}

    def __init__(self, model, verbose=False, tspan=None,
                 precision=np.float64, **kwargs):
        if cl is None:
            raise ImportError('pyopencl library required for {}'
                              ''.format(self.__class__.__name__))
        super(OpenCLSSASimulator, self).__init__(model, verbose, **kwargs)

        generate_equations(self._model)

        self.tspan = tspan
        self.verbose = verbose

        # private attribute
        self._step_0 = True
        self._dtype = precision
        template_code = Template(
            filename=os.path.join(os.path.dirname(__file__),
                                  'templates', 'opencl_ssa.cl')
        )
        args = self._get_template_args()

        # Single precision seems to work for most cases, with little
        # difference in trajectory distributions. However, if there are
        # reactions on very small time scales, tau can fall below the level
        # of decimal precision and time does not progress. Using double for
        # time seems to fix that for EARM 1.0; however, if there are
        # reactions whose propensities make a0 too large, the same stall
        # (no change in time, perpetual running on the GPU) would likely
        # occur. Keeping this warning for now. For all other models that I
        # have simulated, single precision produces non-significantly
        # different traces.
        if self._dtype == np.float32:
            args['prec'] = '#define USE_SINGLE_PRECISION'
            self._logger.warn("Should be cautious using single precision.")
        else:
            args['prec'] = '#define USE_DOUBLE_PRECISION'

        # So far, int32 seems to handle all situations.
        # For boolean simulations, short or possibly char could be used.
        # Keeping the option here for reference, in the hope of being able
        # to calculate the required precision prior to simulating.
        _d = {np.uint32: "uint", np.int32: 'int', np.int16: 'ushort',
              np.int64: 'long', np.uint64: 'unsigned long'}
        self._dtype_species = np.int32
        args['spc_type'] = _d[self._dtype_species]

        if verbose == 2:
            args['verbose'] = '#define VERBOSE'
        elif verbose > 3:
            args['verbose'] = '#define VERBOSE_MAX'
        else:
            args['verbose'] = ''

        self._logger.info("Initialized OpenCLSSASimulator class")
        self._code = template_code.render(**args)

    def _compile(self):
        if self.verbose:
            self._logger.info("Output OpenCL file to ssa_opencl_code.cl")
            with open("ssa_opencl_code.cl", "w") as source_file:
                source_file.write(self._code)

        # allow users to select platform and devices
        self.context = cl.create_some_context(True)
        devices = self.context.devices
        self._device_name = devices[0].name
        self._n_devices = self.context.num_devices

        # CPUs can run independently, without any need to sync (simulations
        # are independent of one another), so the work size can be set to 1.
        # NVIDIA (CUDA) uses a warp size of 32, whereas AMD/Intel use a
        # wavefront size of 64.
        # https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html#warpsize
        # Set the local work group size: CPU = 1, CUDA = 32, otherwise 64.
        if devices[0].type == device_type.CPU:
            self._local_work_size = (1, 1)
        elif 'CUDA' in devices[0].platform.name.upper():
            self._local_work_size = (32, 1)
        else:
            self._local_work_size = (64, 1)

        self._logger.info(f"Using device {self._device_name}")
        self.devices = devices

        # There are a lot of optimization options that I explored. The ones
        # commented out speed up the code but reduce precision. The ones
        # remaining are typical optimizations. Ideally one could use -O2 or
        # -O3, but different compilers make different assumptions about
        # branching that cripple the code for AMD GPUs.
        # https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
        # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compiler-reference/compiler-options/optimization-options/o.html
        self.program = cl.Program(self.context, self._code).build(
            options=[
                # '-O3',
                # '-cl-std=2.0',
                # '-cl-uniform-work-group-size',
                # '-cl-fast-relaxed-math',
                # '-cl-single-precision-constant',
                '-cl-denorms-are-zero',
                '-cl-no-signed-zeros',
                '-cl-finite-math-only',
                '-cl-mad-enable',
                '-I {}'.format(os.path.join(os.path.dirname(__file__)))
            ]
        )
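
    # Illustrative construction sketch (a minimal example; the PySB example
    # model ``pysb.examples.michment`` and the device index '0' are
    # placeholder assumptions for your own model and OpenCL setup):
    #
    #     import os
    #     import numpy as np
    #     from pysb.examples.michment import model
    #
    #     os.environ['PYOPENCL_CTX'] = '0'   # optional: skip the device prompt
    #     sim = OpenCLSSASimulator(model, tspan=np.linspace(0, 100, 101),
    #                              precision=np.float32)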

    def run(self, tspan=None, param_values=None, initials=None, number_sim=0,
            random_seed=0):
        """
        Run a simulation and return the result (trajectories).

        .. note::
            In early versions of the Simulator class, ``tspan``, ``initials``
            and ``param_values`` supplied to this method persisted to future
            :func:`run` calls. This is no longer the case.

        Parameters
        ----------
        tspan
        initials
        param_values
            See parameter definitions in :class:`ScipyOdeSimulator`.
        number_sim : int
            Number of simulations to perform.
        random_seed : int
            Seed used to generate the random numbers passed to the device.

        Returns
        -------
        A :class:`SimulationResult` object
        """
        super(OpenCLSSASimulator, self).run(tspan=tspan, initials=initials,
                                            param_values=param_values,
                                            number_sim=number_sim)
        if tspan is None:
            if self.tspan is None:
                raise Exception("Please provide tspan")
            else:
                tspan = self.tspan

        # tspan for each simulation
        t_out = np.array(tspan, dtype=self._dtype)

        # compile kernel and send parameters to GPU
        if self._step_0:
            self._setup()

        self._logger.info("Creating content on device")

        # If there is more than one device, split the number of simulations
        # equally over all devices
        n_sim_per_device = self.num_sim // self._n_devices
        local_work_size = self._local_work_size

        # The number of blocks refers to how many warps/wavefronts run at the
        # same time; threads is how many simulations per warp/wavefront.
        # Together they cover the total number of simulations, rounded up to
        # the next whole number of blocks
        # (e.g., 100 simulations at 32 threads/block -> 4 blocks).
        # Ideally one would run a number of simulations that is a multiple of
        # local_work_size[0].
        blocks, threads = self.get_blocks(n_sim_per_device,
                                          local_work_size[0])
        total_threads = int(blocks * threads)

        # retrieve and store results
        timer_start = time.time()

        # allows multiple GPUs to be used
        with ThreadPoolExecutor(self._n_devices) as executor:
            sim_partial = partial(call, ocl_instance=self,
                                  n_sim_per_device=n_sim_per_device,
                                  t_out=t_out, total_threads=total_threads,
                                  local_work_size=local_work_size,
                                  random_seed=random_seed)
            results = [executor.submit(sim_partial, i)
                       for i in range(self._n_devices)]
            traj = [r.result() for r in results]

        traj = [r.reshape((total_threads, len(t_out), self._n_species))
                for r in traj]
        traj = np.vstack(traj)
        traj = traj[:self.num_sim]

        self._time = time.time() - timer_start
        self._logger.info("{} simulations "
                          "in {:.4f}s".format(self.num_sim, self._time))

        tout = np.array([tspan] * self.num_sim)
        return SimulationResult(self, tout, traj)
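
    # Illustrative run sketch (continues the construction sketch above; the
    # simulation count and seed are arbitrary). ``run`` returns a
    # :class:`SimulationResult`, so trajectories can be pulled from it
    # directly:
    #
    #     res = sim.run(number_sim=1000, random_seed=123)
    #     traj = res.all          # per-simulation record arrays
    #     df = res.dataframe      # pandas view indexed by (simulation, time)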

    def _setup(self):
        self._compile()
        self._step_0 = False


def call(device_number, ocl_instance, n_sim_per_device, t_out, total_threads,
         local_work_size, random_seed):
    """Run the simulations assigned to one device (allows multi-device
    function calls)."""
    # used to split the full simulation array into equal slices per device
    start = device_number * n_sim_per_device
    end = start + n_sim_per_device

    # If no seed was given, generate one (used below to generate per-thread
    # seeds). The device number is added just to ensure the seeds differ
    # across devices, although they should already differ because they are
    # drawn from different processes.
    if random_seed is None:
        r_seed = np.random.randint(0, 2 ** 32, 1, np.uint)
        rng = np.random.default_rng(r_seed + device_number)
    else:
        # generate the rng state based on the given seed plus the device
        # number
        rng = np.random.default_rng(random_seed + device_number)

    # create a command queue per device
    with cl.CommandQueue(ocl_instance.context,
                         ocl_instance.devices[device_number]) as queue:
        # transfer the array of time points to the device
        time_points_gpu = ocl_array.to_device(
            queue, np.array(t_out, dtype=ocl_instance._dtype)
        )

        # transfer the array of seeds to the device
        random_seeds_gpu = ocl_array.to_device(
            queue, rng.integers(0, 2 ** 32, total_threads, np.uint32)
        )

        # transfer the (number of simulations x parameter sets) data
        # structure to each device
        param_array_gpu = ocl_array.to_device(
            queue,
            _create_gpu_array(ocl_instance.param_values[start:end],
                              total_threads,
                              ocl_instance._dtype).flatten(order='C')
        )

        species_matrix_gpu = ocl_array.to_device(
            queue,
            _create_gpu_array(ocl_instance.initials[start:end],
                              total_threads,
                              ocl_instance._dtype_species).flatten(order='C')
        )

        result_gpu = ocl_array.zeros(
            queue,
            order='C',
            shape=(total_threads * t_out.shape[0] * ocl_instance._n_species,),
            dtype=ocl_instance._dtype_species
        )

        global_work_size = (total_threads, 1)

        ocl_instance._logger.debug(
            f"Starting {ocl_instance.num_sim} simulations with "
            f"{global_work_size} workers and {local_work_size} steps"
        )

        # perform simulation
        complete_event = ocl_instance.program.Gillespie_all_steps(
            queue,
            global_work_size,
            local_work_size,
            species_matrix_gpu.data,
            result_gpu.data,
            time_points_gpu.data,
            param_array_gpu.data,
            random_seeds_gpu.data,
            np.int32(len(t_out)),
            np.int32(ocl_instance.num_sim)
        )
        complete_event.wait()

        return result_gpu.get(queue, async_=True)


def _create_gpu_array(values, total_threads, prec):
    # Create the species matrix to send to the device, sized according to the
    # total number of threads rather than the number of simulations.
    gpu_array = np.zeros((total_threads, values.shape[1]), dtype=prec)
    # Fill the species matrix. Note that this might not fill the entire array
    # that was created; the remaining rows stay zero to pad out the device
    # work size.
    gpu_array[:len(values)] = values
    return gpu_array
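

# Illustrative sketch of the zero-padding done by _create_gpu_array: the
# (n_simulations x n_species) slice for a device is padded up to
# total_threads rows so every work item has a row to read (values made up):
#
#     vals = np.array([[100, 0], [200, 5]])    # 2 simulations, 2 species
#     _create_gpu_array(vals, total_threads=4, prec=np.int32)
#     # -> array([[100,   0],
#     #           [200,   5],
#     #           [  0,   0],
#     #           [  0,   0]], dtype=int32)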