Source code for pysb.simulator.cuda_ssa

from pysb.simulator.base import SimulationResult
from pysb.simulator.ssa_base import SSABase
import numpy as np
import os
import time
from pysb.pathfinder import get_path


try:
    import pycuda
    import pycuda.autoinit
    from pycuda.autoinit import device
    import pycuda as cuda
    import pycuda.compiler
    import pycuda.tools as tools
    import pycuda.driver as driver
    import pycuda.gpuarray as gpuarray
    from mako.template import Template
except ImportError:
    pycuda = None


[docs]class CudaSSASimulator(SSABase):
    """
    SSA simulator for NVIDIA gpus

    Use `CUDA_VISIBLE_DEVICES` to set the device.

    Requires `PyCUDA`_, `CUDA`_, and a CUDA compatible NVIDIA gpu.

    .. _PyCUDA :
        https://documen.tician.de/pycuda/
    .. _CUDA :
        https://developer.nvidia.com/cuda-zone

    Parameters
    ----------
    model : pysb.Model
        Model to simulate.
    tspan : vector-like, optional
        Time values over which to simulate. The first and last values define
        the time range. Returned trajectories are sampled at every value unless
        the simulation is interrupted for some reason, e.g., due to
        satisfaction
        of a logical stopping criterion (see 'tout' below).
    initials : vector-like or dict, optional
        Values to use for the initial condition of all species. Ordering is
        determined by the order of model.species. If not specified, initial
        conditions will be taken from model.initial_conditions (with
        initial condition parameter values taken from `param_values` if
        specified).
    param_values : vector-like or dict, optional
        Values to use for every parameter in the model. Ordering is
        determined by the order of model.parameters.
        If passed as a dictionary, keys must be parameter names.
        If not specified, parameter values will be taken directly from
        model.parameters.
    verbose : bool, optional (default: False)
        Verbose output.
    precision : (np.float64, np.float32)
        Precision for ssa simulation. Default is np.float64. float32 should
        be used with caution.

    Attributes
    ----------
    verbose: bool
        Verbosity flag passed to the constructor.
    model : pysb.Model
        Model passed to the constructor.
    tspan : vector-like
        Time values passed to the constructor.

    """
    _supports = {'multi_initials': True, 'multi_param_values': True}

    def __init__(self, model, verbose=False, tspan=None, precision=np.float64,
                 **kwargs):
        if pycuda is None:
            raise ImportError('pycuda library required for {}'
                              ''.format(self.__class__.__name__))
        super(CudaSSASimulator, self).__init__(model, verbose, **kwargs)
        self._device = device.name()
        self.tspan = tspan
        self.verbose = verbose
        # private attribute
        self._step_0 = True
        self._dtype = precision

        template_code = Template(
            filename=os.path.join(os.path.dirname(__file__),
                                  'templates', 'cuda_ssa.cu')
        )
        args = self._get_template_args()
        # The use of single precision seems to work for most cases with little
        # differences in trajectory distributions, however sometimes if there
        # are really small time scale reactions, tau is below the level of
        # decimal precision time doesn't progress. Using double for time
        # seems to fix that for EARM 1.0, however, if there are reactions
        # who propensities make a0 too large, i think the no change in
        # time, perpetual running on gpu would occur. Keeping this warning
        # for now. For all other models that I have simulated, single produces
        # non-significantly different traces.
        if self._dtype == np.float32:
            args['prec'] = '#define USE_SINGLE_PRECISION'
            self._logger.warn("Should be cautious using single precision.")
        else:
            args['prec'] = '#define USE_DOUBLE_PRECISION'
        # So far, int 32 seems to handle all situations.
        # For boolean simulations, can use short or possible char.
        # Keeping option for reference now in hopes of being able to
        # calculate precision factor prior to simulating.
        _d = {np.uint32: "uint",
              np.int32: 'int',
              np.int16: 'ushort',
              np.int64: 'long',
              np.uint64: 'unsigned long'}
        self._dtype_species = np.int32
        args['spc_type'] = _d[self._dtype_species]
        if verbose == 2:
            args['verbose'] = '#define VERBOSE'
        elif verbose > 3:
            args['verbose'] = '#define VERBOSE_MAX'
        else:
            args['verbose'] = ''
        self._code = template_code.render(**args)
        self._kernel = None
        self._param_tex = None
        self._ssa = None
        self._logger.info("Initialized CudaSSASimulator class")

    def _compile(self):

        if self.verbose:
            self._logger.info("Output cuda file to ssa_cuda_code.cu")
            with open("ssa_cuda_code.cu", "w") as source_file:
                source_file.write(self._code)
        nvcc_bin = get_path('nvcc')
        self._logger.debug("Compiling CUDA code")
        opts = ['-O3',  '--use_fast_math']
        self._kernel = pycuda.compiler.SourceModule(
            self._code, nvcc=nvcc_bin, options=opts, no_extern_c=True,
        )

        self._ssa = self._kernel.get_function("Gillespie_all_steps")
        self._logger.debug("Compiled CUDA code")
        self._step_0 = False

[docs]    def run(self, tspan=None, param_values=None, initials=None, number_sim=0,
            threads_per_block=None, random_seed=None):
        """
        Run a simulation and returns the result (trajectories)

        .. note::
            In early versions of the Simulator class, ``tspan``, ``initials``
            and ``param_values`` supplied to this method persisted to future
            :func:`run` calls. This is no longer the case.

        Parameters
        ----------
        tspan
        initials
        param_values
            See parameter definitions in :class:`ScipyOdeSimulator`.
        number_sim: int
            Number of simulations to perform
        threads_per_block: int
            Number of threads per block. Optimal value is generally 32
        random_seed: int
            Seed used for generate random numbers that will then be seeds on
            the device (seed of seeds).
        Returns
        -------
        A :class:`SimulationResult` object
        """

        super(CudaSSASimulator, self).run(tspan=tspan, initials=initials,
                                          param_values=param_values,
                                          number_sim=number_sim)

        self._logger.info("Using device {}".format(device.name()))

        if tspan is None:
            tspan = self.tspan

        tout = [tspan] * self.num_sim
        t_out = np.array(tspan, dtype=self._dtype)

        # set default threads per block
        if threads_per_block is None:
            threads_per_block = 32

        blocks, threads = self.get_blocks(self.num_sim, threads_per_block)

        # compile kernel and send parameters to GPU
        if self._step_0:
            self._compile()
        if self.verbose:
            self._print_verbose(threads)

        #  Note, this number will be larger than n_simulations if the gpu grid
        #  is not filled. The rest will be filled with zeros and not impact
        #  results. They are trimmed right before passing to simulation results
        total_threads = int(blocks * threads)

        self._logger.info("Creating content on device")
        timer_start = time.time()
        param_array_gpu = gpuarray.to_gpu(
            self._create_gpu_array(self.param_values, total_threads,
                                   self._dtype)
        )
        species_dtype = np.int32
        species_matrix_gpu = gpuarray.to_gpu(
            self._create_gpu_array(self.initials, total_threads, species_dtype)
        )
        # generate rng state based on given seed plus device number
        rng = np.random.default_rng(random_seed)
        random_seeds_gpu = gpuarray.to_gpu(
            self._create_gpu_array(
                rng.integers(
                    0, 2 ** 32, total_threads, np.uint32
                )[:, np.newaxis],
                total_threads,
                np.uint32
            )
        )

        # allocate and upload time to GPU
        time_points_gpu = gpuarray.to_gpu(np.array(t_out, dtype=self._dtype))

        # allocate space on GPU for results
        result = driver.managed_zeros(
            shape=(total_threads, len(t_out), self._n_species),
            dtype=species_dtype, mem_flags=driver.mem_attach_flags.GLOBAL
        )
        elasped_t = time.time() - timer_start
        self._logger.info("Completed transfer in: {:.4f}s".format(elasped_t))

        self._logger.info("Starting {} simulations on {} blocks"
                          "".format(self.num_sim, blocks))

        timer_start = time.time()
        # perform simulation
        self._ssa(
            species_matrix_gpu,
            result,
            time_points_gpu,
            np.int32(len(t_out)),
            param_array_gpu,
            random_seeds_gpu,
            block=(threads, 1, 1),
            grid=(blocks, 1)
        )

        # Wait for kernel completion before host access
        pycuda.autoinit.context.synchronize()

        self._time = time.time() - timer_start
        self._logger.info("{} simulations "
                          "in {:.4f}s".format(self.num_sim, self._time))

        # retrieve and store results, only keeping num_sim (desired quantity)
        return SimulationResult(self, tout, result[:self.num_sim, :, :])

    def _print_verbose(self, threads):
        # Beyond this point is just pretty printing
        self._logger.debug("Attributes for device {}".format(device.name()))
        for (key, value) in device.get_attributes().items():
            self._logger.debug("{}:{}".format(key, value))
        self._logger.debug("threads = {}".format(threads))
        kern = self._ssa
        self._logger.debug("Local memory  = {}".format(kern.local_size_bytes))
        self._logger.debug("Shared memory = {}".format(kern.shared_size_bytes))
        self._logger.debug("Registers  = {}".format(kern.num_regs))

        occ = tools.OccupancyRecord(tools.DeviceData(),
                                    threads=threads,
                                    shared_mem=kern.shared_size_bytes,
                                    registers=kern.num_regs)
        self._logger.debug("tb_per_mp  = {}".format(occ.tb_per_mp))
        self._logger.debug("limited by = {}".format(occ.limited_by))
        self._logger.debug("occupancy  = {}".format(occ.occupancy))
        self._logger.debug("tb/mp limits  = {}".format(occ.tb_per_mp_limits))

    @staticmethod
    def _create_gpu_array(values, total_threads, prec):

        # Create species matrix on GPU
        # will make according to number of total threads, not n_simulations
        gpu_array = np.zeros((total_threads, values.shape[1]), dtype=prec)
        # Filling species matrix
        # Note that this might not fill entire array that was created.
        # The rest of the array will be zeros to fill up GPU.
        gpu_array[:len(values)] = values
        return gpu_array