# Source code for pysb.simulator.cuda_ssa

from pysb.simulator.base import SimulationResult
from pysb.simulator.ssa_base import SSABase
import numpy as np
import os
import time
from pysb.pathfinder import get_path


try:
    import pycuda
    import pycuda.autoinit
    from pycuda.autoinit import device
    import pycuda as cuda
    import pycuda.compiler
    import pycuda.tools as tools
    import pycuda.driver as driver
    import pycuda.gpuarray as gpuarray
    from mako.template import Template
except ImportError:
    pycuda = None


class CudaSSASimulator(SSABase):
    """
    SSA simulator for NVIDIA gpus

    Use `CUDA_VISIBLE_DEVICES` to set the device.

    Requires `PyCUDA`_, `CUDA`_, and a CUDA compatible NVIDIA gpu.

    .. _PyCUDA : https://documen.tician.de/pycuda/
    .. _CUDA : https://developer.nvidia.com/cuda-zone

    Parameters
    ----------
    model : pysb.Model
        Model to simulate.
    tspan : vector-like, optional
        Time values over which to simulate. The first and last values define
        the time range. Returned trajectories are sampled at every value
        unless the simulation is interrupted for some reason, e.g., due to
        satisfaction of a logical stopping criterion (see 'tout' below).
    initials : vector-like or dict, optional
        Values to use for the initial condition of all species. Ordering is
        determined by the order of model.species. If not specified, initial
        conditions will be taken from model.initial_conditions (with initial
        condition parameter values taken from `param_values` if specified).
    param_values : vector-like or dict, optional
        Values to use for every parameter in the model. Ordering is
        determined by the order of model.parameters. If passed as a
        dictionary, keys must be parameter names. If not specified, parameter
        values will be taken directly from model.parameters.
    verbose : bool, optional (default: False)
        Verbose output.
    precision : (np.float64, np.float32)
        Precision for ssa simulation. Default is np.float64. float32 should
        be used with caution.

    Attributes
    ----------
    verbose: bool
        Verbosity flag passed to the constructor.
    model : pysb.Model
        Model passed to the constructor.
    tspan : vector-like
        Time values passed to the constructor.
    """
    _supports = {'multi_initials': True, 'multi_param_values': True}

    def __init__(self, model, verbose=False, tspan=None,
                 precision=np.float64, **kwargs):
        # Fail early with a clear message if the optional pycuda dependency
        # was not importable at module load time.
        if pycuda is None:
            raise ImportError('pycuda library required for {}'
                              ''.format(self.__class__.__name__))
        super(CudaSSASimulator, self).__init__(model, verbose, **kwargs)

        self._device = device.name()
        self.tspan = tspan
        self.verbose = verbose

        # private attribute
        self._step_0 = True
        self._dtype = precision

        # Mako template containing the CUDA SSA kernel source; rendered
        # below with model-specific arguments.
        template_code = Template(
            filename=os.path.join(os.path.dirname(__file__), 'templates',
                                  'cuda_ssa.cu')
        )
        args = self._get_template_args()

        # The use of single precision seems to work for most cases with
        # little differences in trajectory distributions, however sometimes
        # if there are really small time scale reactions, tau is below the
        # level of decimal precision time doesn't progress. Using double for
        # time seems to fix that for EARM 1.0, however, if there are
        # reactions who propensities make a0 too large, i think the no
        # change in time, perpetual running on gpu would occur. Keeping this
        # warning for now. For all other models that I have simulated,
        # single produces non-significantly different traces.
        if self._dtype == np.float32:
            args['prec'] = '#define USE_SINGLE_PRECISION'
            # Logger.warn is a deprecated alias; use warning() instead.
            self._logger.warning("Should be cautious using single precision.")
        else:
            args['prec'] = '#define USE_DOUBLE_PRECISION'

        # So far, int 32 seems to handle all situations.
        # For boolean simulations, can use short or possible char.
        # Keeping option for reference now in hopes of being able to
        # calculate precision factor prior to simulating.
        _d = {np.uint32: "uint", np.int32: 'int', np.int16: 'ushort',
              np.int64: 'long', np.uint64: 'unsigned long'}
        self._dtype_species = np.int32
        args['spc_type'] = _d[self._dtype_species]

        # Verbosity levels toggle printf-style debugging inside the kernel.
        # NOTE(review): verbose == 3 intentionally(?) falls through to the
        # quiet branch — confirm against the template's expectations.
        if verbose == 2:
            args['verbose'] = '#define VERBOSE'
        elif verbose > 3:
            args['verbose'] = '#define VERBOSE_MAX'
        else:
            args['verbose'] = ''

        self._code = template_code.render(**args)
        self._kernel = None
        self._param_tex = None
        self._ssa = None
        self._logger.info("Initialized CudaSSASimulator class")

    def _compile(self):
        """Compile the rendered CUDA source and cache the kernel handle.

        Sets ``self._kernel`` and ``self._ssa`` and clears ``self._step_0``
        so subsequent :func:`run` calls skip recompilation.
        """
        if self.verbose:
            # Dump the generated source to disk for inspection/debugging.
            self._logger.info("Output cuda file to ssa_cuda_code.cu")
            with open("ssa_cuda_code.cu", "w") as source_file:
                source_file.write(self._code)
        nvcc_bin = get_path('nvcc')
        self._logger.debug("Compiling CUDA code")
        opts = ['-O3', '--use_fast_math']
        self._kernel = pycuda.compiler.SourceModule(
            self._code, nvcc=nvcc_bin, options=opts, no_extern_c=True,
        )
        self._ssa = self._kernel.get_function("Gillespie_all_steps")
        self._logger.debug("Compiled CUDA code")
        self._step_0 = False
    def run(self, tspan=None, param_values=None, initials=None, number_sim=0,
            threads_per_block=None, random_seed=None):
        """
        Run a simulation and returns the result (trajectories)

        .. note::
            In early versions of the Simulator class, ``tspan``, ``initials``
            and ``param_values`` supplied to this method persisted to future
            :func:`run` calls. This is no longer the case.

        Parameters
        ----------
        tspan
        initials
        param_values
            See parameter definitions in :class:`ScipyOdeSimulator`.
        number_sim: int
            Number of simulations to perform
        threads_per_block: int
            Number of threads per block. Optimal value is generally 32
        random_seed: int
            Seed used for generate random numbers that will then be seeds on
            the device (seed of seeds).

        Returns
        -------
        A :class:`SimulationResult` object
        """
        # Base class validates/normalizes tspan, initials, param_values and
        # sets self.num_sim, self.initials, self.param_values.
        super(CudaSSASimulator, self).run(tspan=tspan, initials=initials,
                                          param_values=param_values,
                                          number_sim=number_sim)
        self._logger.info("Using device {}".format(device.name()))
        if tspan is None:
            tspan = self.tspan
        # One identical time vector per simulation, as SimulationResult
        # expects a per-trajectory tout.
        tout = [tspan] * self.num_sim
        t_out = np.array(tspan, dtype=self._dtype)

        # set default threads per block
        if threads_per_block is None:
            threads_per_block = 32

        blocks, threads = self.get_blocks(self.num_sim, threads_per_block)

        # compile kernel and send parameters to GPU
        if self._step_0:
            self._compile()
        if self.verbose:
            self._print_verbose(threads)

        # Note, this number will be larger than n_simulations if the gpu grid
        # is not filled. The rest will be filled with zeros and not impact
        # results. They are trimmed right before passing to simulation
        # results.
        total_threads = int(blocks * threads)

        self._logger.info("Creating content on device")
        timer_start = time.time()

        # Per-thread parameter matrix, zero-padded up to total_threads.
        param_array_gpu = gpuarray.to_gpu(
            self._create_gpu_array(self.param_values, total_threads,
                                   self._dtype)
        )

        # Species counts use a fixed int32 regardless of self._dtype.
        species_dtype = np.int32
        species_matrix_gpu = gpuarray.to_gpu(
            self._create_gpu_array(self.initials, total_threads,
                                   species_dtype)
        )

        # generate rng state based on given seed plus device number
        # (one uint32 seed per thread, derived from the host-side generator)
        rng = np.random.default_rng(random_seed)
        random_seeds_gpu = gpuarray.to_gpu(
            self._create_gpu_array(
                rng.integers(0, 2 ** 32, total_threads,
                             np.uint32)[:, np.newaxis],
                total_threads, np.uint32
            )
        )

        # allocate and upload time to GPU
        time_points_gpu = gpuarray.to_gpu(np.array(t_out, dtype=self._dtype))

        # allocate space on GPU for results; managed (unified) memory lets
        # the host slice the array directly after synchronization.
        result = driver.managed_zeros(
            shape=(total_threads, len(t_out), self._n_species),
            dtype=species_dtype,
            mem_flags=driver.mem_attach_flags.GLOBAL
        )

        elasped_t = time.time() - timer_start
        self._logger.info("Completed transfer in: {:.4f}s".format(elasped_t))

        self._logger.info("Starting {} simulations on {} blocks"
                          "".format(self.num_sim, blocks))
        timer_start = time.time()

        # perform simulation
        self._ssa(
            species_matrix_gpu, result, time_points_gpu,
            np.int32(len(t_out)), param_array_gpu, random_seeds_gpu,
            block=(threads, 1, 1), grid=(blocks, 1)
        )

        # Wait for kernel completion before host access
        pycuda.autoinit.context.synchronize()

        self._time = time.time() - timer_start
        self._logger.info("{} simulations "
                          "in {:.4f}s".format(self.num_sim, self._time))

        # retrieve and store results, only keeping num_sim (desired quantity)
        return SimulationResult(self, tout, result[:self.num_sim, :, :])
def _print_verbose(self, threads): # Beyond this point is just pretty printing self._logger.debug("Attributes for device {}".format(device.name())) for (key, value) in device.get_attributes().items(): self._logger.debug("{}:{}".format(key, value)) self._logger.debug("threads = {}".format(threads)) kern = self._ssa self._logger.debug("Local memory = {}".format(kern.local_size_bytes)) self._logger.debug("Shared memory = {}".format(kern.shared_size_bytes)) self._logger.debug("Registers = {}".format(kern.num_regs)) occ = tools.OccupancyRecord(tools.DeviceData(), threads=threads, shared_mem=kern.shared_size_bytes, registers=kern.num_regs) self._logger.debug("tb_per_mp = {}".format(occ.tb_per_mp)) self._logger.debug("limited by = {}".format(occ.limited_by)) self._logger.debug("occupancy = {}".format(occ.occupancy)) self._logger.debug("tb/mp limits = {}".format(occ.tb_per_mp_limits)) @staticmethod def _create_gpu_array(values, total_threads, prec): # Create species matrix on GPU # will make according to number of total threads, not n_simulations gpu_array = np.zeros((total_threads, values.shape[1]), dtype=prec) # Filling species matrix # Note that this might not fill entire array that was created. # The rest of the array will be zeros to fill up GPU. gpu_array[:len(values)] = values return gpu_array