accel.F90

!! Copyright (C) 2010-2016 X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"
#endif

#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
#define HAVE_ACCEL 1
#endif

module accel_oct_m
  use alloc_cache_oct_m
#ifdef HAVE_OPENCL
  use cl
#endif
#ifdef HAVE_CLBLAS
  use clblas
#endif
  use cuda_oct_m
#ifdef HAVE_CLFFT
  use clfft
#endif
  use global_oct_m
  use iso_c_binding
  use loct_oct_m
  use messages_oct_m
  use mpi_oct_m
  use namespace_oct_m
  use types_oct_m
  use parser_oct_m
  use profiling_oct_m
  use unit_system_oct_m

  implicit none 

  private
  
  public ::                       &
    accel_context_t,              &
    accel_device_t,               &
    accel_mem_t,                  &
    accel_kernel_t,               &
    accel_t,                      &
    accel_is_enabled,             &
    accel_init,                   &
    accel_end,                    &
    accel_padded_size,            &
    accel_mem_nullify,            &
    accel_kernel_start_call,      &
    accel_kernel_build,           &
    accel_create_buffer,          &
    accel_write_buffer,           &
    accel_read_buffer,            &
    accel_release_buffer,         &
    accel_buffer_is_allocated,    &
    accel_finish,                 &
    accel_set_kernel_arg,         &
    accel_max_workgroup_size,     &
    accel_kernel_workgroup_size,  &
    accel_kernel_run,             &
    accel_set_buffer_to_zero,     &
    accel_use_shared_mem,         &
    clblas_print_error,           &
    clfft_print_error,            &
    accel_local_memory_size,      &
    accel_global_memory_size,     &
    accel_max_size_per_dim,       &
    accel_get_device_pointer
  
#ifdef HAVE_OPENCL
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = CL_MEM_READ_ONLY,    &
    ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE,   &
    ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
#else
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = 0,                   &
    ACCEL_MEM_READ_WRITE = 1,                   &
    ACCEL_MEM_WRITE_ONLY = 2
#endif

  !> Wrapper around the backend context handle. Exactly one component exists,
  !! chosen at compile time by the HAVE_OPENCL / HAVE_CUDA macros.
  type accel_context_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    ! OpenCL context, created with clCreateContext in accel_init
    type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
    ! opaque CUDA context pointer, filled in by cuda_init in accel_init
    type(c_ptr)      :: cuda_context
#else
    ! placeholder so that the type is not empty in builds without accelerator support
    integer          :: dummy
#endif
  end type accel_context_t

  !> Wrapper around the backend device handle. Exactly one component exists,
  !! chosen at compile time by the HAVE_OPENCL / HAVE_CUDA macros.
  type accel_device_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    ! OpenCL device id selected in accel_init from the list returned by clGetDeviceIDs
    type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
    ! opaque CUDA device pointer, filled in by cuda_init in accel_init
    type(c_ptr)      :: cuda_device
#else
    ! placeholder so that the type is not empty in builds without accelerator support
    integer         :: dummy
#endif
  end type accel_device_t

  !> Global state of the accelerator layer. A single instance (the module variable
  !! accel) is filled in by accel_init and released by accel_end.
  type accel_t
    ! Components are public by default
    ! backend context and device handles
    type(accel_context_t)  :: context
    type(accel_device_t)   :: device
#ifdef HAVE_OPENCL
    ! OpenCL command queue, created in accel_init with profiling enabled
    type(cl_command_queue) :: command_queue
#endif
    ! handle passed to cublas_init / cublas_end (CUDA builds)
    type(c_ptr)            :: cublas_handle
    ! stream returned by cuda_init and handed to cublas_init (CUDA builds)
    type(c_ptr)            :: cuda_stream
    ! NOTE(review): not touched in the visible part of the file; presumably the
    ! registry of compiled CUDA modules - confirm against the kernel build code
    type(c_ptr)            :: module_map
    ! maximum work-group size (OpenCL) or maximum threads per block (CUDA)
    integer                :: max_workgroup_size
    ! local memory size (OpenCL) or shared memory size (CUDA) reported by the device
    integer(8)             :: local_memory_size
    ! total device memory reported by the device
    integer(8)             :: global_memory_size
    ! .true. unless the input variable DisableAccel is set
    logical                :: enabled
    ! set in accel_init: .true. for CUDA and for OpenCL GPU devices, .false. otherwise
    logical                :: shared_mem
    ! value of the input variable CudaAwareMPI
    logical                :: cuda_mpi
    ! warp size queried from the device for CUDA; fixed to 1 for OpenCL
    integer                :: warp_size
  end type accel_t

  !> Descriptor of a buffer living on the accelerator device.
  type accel_mem_t
    ! Components are public by default
    ! backend handle of the device allocation
#ifdef HAVE_OPENCL
    type(cl_mem)           :: mem
#else
    type(c_ptr)            :: mem
#endif
    ! number of elements requested at creation; the byte size is size times the
    ! size of one element of the stored type (see accel_create_buffer_8)
    integer(SIZEOF_SIZE_T) :: size
    ! element type of the buffer
    type(type_t)           :: type
    ! access flags given at creation (one of the ACCEL_MEM_* constants, presumably)
    integer                :: flags
    ! set to .true. by accel_create_buffer_8 and to .false. by accel_mem_nullify
    logical                :: allocated
  end type accel_mem_t

  !> Descriptor of a device kernel. The routines that build and manage kernels are
  !! outside the visible part of this file, so the notes below are partly hedged.
  type accel_kernel_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    ! OpenCL kernel object
    type(cl_kernel)               :: kernel
#endif
#ifdef HAVE_CUDA
    ! opaque CUDA handles: kernel function, the module it belongs to, and its argument list
    ! NOTE(review): roles inferred from the component names - confirm against the CUDA wrapper
    type(c_ptr)                   :: cuda_kernel
    type(c_ptr)                   :: cuda_module
    type(c_ptr)                   :: arguments
#endif
    ! NOTE(review): presumably the amount of shared memory requested for a CUDA launch - confirm
    integer(8)                    :: cuda_shared_mem
    ! default-initialized to .false., so a fresh descriptor is marked as not yet set up
    logical                       :: initialized = .false.
    ! link to another kernel descriptor; presumably chains the kernels reachable from the
    ! module pointer head - confirm. Note that it has no default initialization.
    type(accel_kernel_t), pointer :: next
    ! NOTE(review): presumably the number of arguments set for the kernel - confirm
    integer                       :: arg_count
  end type accel_kernel_t

  type(accel_t), public :: accel

  ! the kernels
  type(accel_kernel_t), public, target, save :: kernel_vpsi
  type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
  type(accel_kernel_t), public, target, save :: kernel_daxpy
  type(accel_kernel_t), public, target, save :: kernel_zaxpy
  type(accel_kernel_t), public, target, save :: kernel_copy
  type(accel_kernel_t), public, target, save :: dpack
  type(accel_kernel_t), public, target, save :: zpack
  type(accel_kernel_t), public, target, save :: dunpack
  type(accel_kernel_t), public, target, save :: zunpack
  type(accel_kernel_t), public, target, save :: kernel_subarray_gather
  type(accel_kernel_t), public, target, save :: kernel_density_real
  type(accel_kernel_t), public, target, save :: kernel_density_complex
  type(accel_kernel_t), public, target, save :: kernel_phase
  type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
  type(accel_kernel_t), public, target, save :: dzmul
  type(accel_kernel_t), public, target, save :: zzmul
  type(accel_kernel_t), public, target, save :: kernel_mod_sqr_real
  type(accel_kernel_t), public, target, save :: kernel_mod_sqr_complex
  type(accel_kernel_t), public, target, save :: set_one

  ! kernels used locally
  type(accel_kernel_t), save :: set_zero

  interface accel_create_buffer
    module procedure accel_create_buffer_4, accel_create_buffer_8
  end interface accel_create_buffer

  interface accel_write_buffer
    module procedure iaccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
    module procedure iaccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
    module procedure iaccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
    module procedure iaccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
    module procedure saccel_write_buffer_1, caccel_write_buffer_1
    module procedure saccel_write_buffer_2, caccel_write_buffer_2
    module procedure saccel_write_buffer_3, caccel_write_buffer_3
  end interface accel_write_buffer

  interface accel_read_buffer
    module procedure iaccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
    module procedure iaccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
    module procedure iaccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
    module procedure saccel_read_buffer_1, caccel_read_buffer_1
    module procedure saccel_read_buffer_2, caccel_read_buffer_2
    module procedure saccel_read_buffer_3, caccel_read_buffer_3
  end interface accel_read_buffer

  interface accel_set_kernel_arg
    module procedure                       &
      accel_set_kernel_arg_buffer,  &
      iaccel_set_kernel_arg_data,   &
      daccel_set_kernel_arg_data,   &
      zaccel_set_kernel_arg_data,   &
      accel_set_kernel_arg_local
  end interface accel_set_kernel_arg

  interface accel_get_device_pointer
    module procedure iaccel_get_device_pointer_1
    module procedure iaccel_get_device_pointer_2
    module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
    module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
    module procedure saccel_get_device_pointer_1, caccel_get_device_pointer_1
    module procedure saccel_get_device_pointer_2, caccel_get_device_pointer_2
  end interface accel_get_device_pointer

  type(profile_t), save :: prof_read, prof_write

  ! Symbolic (negative) values accepted by the input variable AccelDevice; they match
  ! the option values documented in accel_init. Non-negative values are device numbers.
  integer, parameter  ::      &
    OPENCL_GPU         = -1,  &
    OPENCL_CPU         = -2,  &
    OPENCL_ACCELERATOR = -3,  &
    OPENCL_DEFAULT     = -4


  ! Vendor codes returned by get_platform_id. The values from -2 to -5 match the
  ! options of the input variable AccelPlatform; CL_PLAT_INVALID means no vendor matched.
  integer, parameter  ::      &
    CL_PLAT_INVALID   = -1,   &
    CL_PLAT_AMD       = -2,   &
    CL_PLAT_NVIDIA    = -3,   &
    CL_PLAT_ATI       = -4,   &
    CL_PLAT_INTEL     = -5

  ! a "convenience" public variable
  ! (accel_init declares a local variable of the same name in OpenCL builds, which hides this one there)
  integer, public :: cl_status

  ! number of live device buffers: reset in accel_init, must be zero again in accel_end
  integer :: buffer_alloc_count
  ! amount of device memory still allocated; reported by accel_end when buffers were leaked
  integer(8) :: allocated_mem
  ! NOTE(review): presumably the head of the linked list of kernels (see accel_kernel_t%next) - confirm
  type(accel_kernel_t), pointer :: head
  ! cache of device allocations: set up in accel_init, drained and closed in accel_end
  type(alloc_cache_t) :: memcache
  
contains

  !> Tell whether an accelerator device is in use.
  !! Builds without OpenCL or CUDA support (HAVE_ACCEL undefined) always answer .false.;
  !! otherwise the answer is the enabled flag of the module variable accel.
  pure logical function accel_is_enabled() result(enabled)

#ifndef HAVE_ACCEL
    enabled = .false.
#else
    enabled = accel%enabled
#endif

  end function accel_is_enabled

  ! ------------------------------------------

  !> Read the accelerator input variables and, unless acceleration is disabled,
  !! select and initialize the device, the allocation cache and the standard kernels.
  !!
  !! base_grp  : MPI group; its rank and size are used to assign a device to each
  !!             process and only its root prints the device summary.
  !! namespace : namespace handed to the input parser for every variable read here.
  !!
  !! The routine returns right after setting accel%enabled when acceleration is off.
  !! Asking for acceleration in a build without OpenCL/CUDA support is a fatal error.
  subroutine accel_init(base_grp, namespace)
    type(mpi_grp_t),     intent(inout) :: base_grp
    type(namespace_t),   intent(in)    :: namespace
    
    logical  :: disable, default, run_benchmark
    integer  :: idevice, iplatform
#ifdef HAVE_OPENCL
    integer  :: device_type
    ! note: this local cl_status hides the public module variable of the same name
    integer :: cl_status, idev
    integer  :: ndevices, ret_devices, nplatforms, iplat
    character(len=256) :: device_name
    type(cl_platform_id) :: platform_id
    type(cl_program) :: prog
    type(cl_platform_id), allocatable :: allplatforms(:)
    type(cl_device_id), allocatable :: alldevices(:)
    type(profile_t), save :: prof_init
#endif

    PUSH_SUB(accel_init)

    ! no buffers exist yet; accel_end checks that this counter is back to zero
    buffer_alloc_count = 0

    !%Variable DisableAccel
    !%Type logical
    !%Default yes
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with OpenCL or CUDA support, it will
    !% try to initialize and use an accelerator device. By setting this
    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
    !%End
    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
    ! acceleration is on by default only in builds with OpenCL or CUDA support
#ifdef HAVE_ACCEL
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'DisableAccel', default, disable)
    accel%enabled = .not. disable
    
    ! explicitly requesting acceleration in a build without it cannot work
#ifndef HAVE_ACCEL
    if(accel%enabled) then
      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
      call messages_fatal(1)
    end if
#endif

    ! nothing else to set up when acceleration is off
    if(.not. accel_is_enabled()) then
      POP_SUB(accel_init)
      return
    end if

    !%Variable AccelPlatform
    !%Type integer
    !%Default 0
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL platform that Octopus will
    !% use. You can give an explicit platform number or use one of
    !% the options that select a particular vendor
    !% implementation. Platform 0 is used by default.
    !%
    !% This variable has no effect for CUDA.
    !%Option amd -2
    !% Use the AMD OpenCL platform.
    !%Option nvidia -3
    !% Use the Nvidia OpenCL platform.
    !%Option ati -4
    !% Use the ATI (old AMD) OpenCL platform.
    !%Option intel -5
    !% Use the Intel OpenCL platform.
    !%End
    call parse_variable(namespace, 'AccelPlatform', 0, iplatform)

    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
    
    !%Variable AccelDevice
    !%Type integer
    !%Default gpu
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL or CUDA accelerator device
    !% that Octopus will use. You can specify one of the options below
    !% or a numerical id to select a specific device.
    !%
    !% Values >= 0 select the device to be used. In case of MPI enabled runs
    !% devices are distributed in a round robin fashion, starting at this value.
    !%Option gpu -1
    !% If available, Octopus will use a GPU.
    !%Option cpu -2
    !% If available, Octopus will use a CPU (only for OpenCL).
    !%Option accelerator -3
    !% If available, Octopus will use an accelerator (only for OpenCL).
    !%Option accel_default -4
    !% Octopus will use the default device specified by the
    !% implementation.
    !%End
    call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)

    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
    
    ! OPENCL_DEFAULT (-4) is the lowest symbolic option; anything below it is invalid
    if(idevice < OPENCL_DEFAULT) then
      call messages_write('Invalid AccelDevice')
      call messages_fatal()
    end if

    call messages_print_stress(stdout, "GPU acceleration")

#ifdef HAVE_CUDA
    ! symbolic (negative) device options are replaced by device 0 before calling cuda_init
    if(idevice<0) idevice = 0
    ! NOTE(review): idevice is reported below as the device in use, so cuda_init
    ! presumably updates it according to the rank - confirm in the CUDA wrapper
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)
#ifdef HAVE_MPI
    write(message(1), '(A, I5.5, A, I5.5)') "Rank ", base_grp%rank, " uses device number ", idevice
    call messages_info(1, all_nodes = .true.)
#endif

    ! no shared mem support in our cuda interface (for the moment)
    ! NOTE(review): the comment above says unsupported, yet the flag is set to .true. - confirm intent
    accel%shared_mem = .true.

    call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif
    
#ifdef HAVE_OPENCL
    call profiling_in(prof_init, 'CL_INIT')

    ! the first call returns the number of platforms, the second one fills the list
    call clGetPlatformIDs(nplatforms, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    SAFE_ALLOCATE(allplatforms(1:nplatforms))

    call clGetPlatformIDs(allplatforms, iplat, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    call messages_write('Info: Available CL platforms: ')
    call messages_write(nplatforms)
    call messages_info()

    ! list the platforms (numbered from zero) and mark the selected one with an asterisk
    do iplat = 1, nplatforms

      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)

      ! a negative iplatform is a vendor option: replace it by the zero-based index of
      ! the first platform whose name matches that vendor
      if(iplatform < 0) then
        if(iplatform == get_platform_id(device_name)) iplatform = iplat - 1
      end if

      if(iplatform == iplat - 1) then
        call messages_write('    * Platform ')
      else
        call messages_write('      Platform ')
      end if

      call messages_write(iplat - 1)
      call messages_write(' : '//device_name)
      ! device_name is reused here to hold the platform version string
      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
      call messages_write(' ('//trim(device_name)//')')
      call messages_info()
    end do

    call messages_info()

    ! iplatform still negative at this point means that no platform matched the vendor option
    if(iplatform >= nplatforms .or. iplatform < 0) then
      call messages_write('Requested CL platform does not exist')
      if(iplatform > 0) then 
        call messages_write('(platform = ')
        call messages_write(iplatform)
        call messages_write(').')
      end if
      call messages_fatal()
    end if

    ! iplatform is zero-based, the array is one-based
    platform_id = allplatforms(iplatform + 1)

    SAFE_DEALLOCATE_A(allplatforms)

    ! count the devices of any type so that alldevices can hold every possible selection
    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)

    call messages_write('Info: Available CL devices: ')
    call messages_write(ndevices)
    call messages_info()

    SAFE_ALLOCATE(alldevices(1:ndevices))

    ! list all devices

    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)

    do idev = 1, ndevices
      call messages_write('      Device ')
      call messages_write(idev - 1)
      call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
      call messages_write(' : '//device_name)
      call messages_info()
    end do

    ! translate the symbolic option into an OpenCL device type; explicit device
    ! numbers (>= 0) are looked up among the devices of all types
    select case(idevice)
    case(OPENCL_GPU)
      device_type = CL_DEVICE_TYPE_GPU
    case(OPENCL_CPU)
      device_type = CL_DEVICE_TYPE_CPU
    case(OPENCL_ACCELERATOR)
      device_type = CL_DEVICE_TYPE_ACCELERATOR
    case(OPENCL_DEFAULT)
      device_type = CL_DEVICE_TYPE_DEFAULT
    case default
      device_type = CL_DEVICE_TYPE_ALL
    end select

    ! now get a list of the selected type
    call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)

    if(ret_devices < 1) then
      ! we didnt find a device of the selected type, we ask for the default device
      call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)

      if(ret_devices < 1) then
        ! if this does not work, we ask for all devices
        call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
      end if

      if(ret_devices < 1) then
        call messages_write('Cannot find an OpenCL device')
        call messages_fatal()
      end if
    end if

    ! the number of devices can be smaller
    ndevices = ret_devices

    ! a symbolic option still has to be turned into a concrete zero-based device index
    if(idevice < 0) then
      if(base_grp%size > 1) then
        ! with MPI we have to select the device so multiple GPUs in one
        ! node are correctly distributed
        call select_device(idevice)
      else
        idevice = 0
      end if
    end if

    if(idevice >= ndevices) then
      call messages_write('Requested CL device does not exist (device = ')
      call messages_write(idevice)
      call messages_write(', platform = ')
      call messages_write(iplatform)
      call messages_write(').')
      call messages_fatal()
    end if

    ! idevice is zero-based, the array is one-based
    accel%device%cl_device = alldevices(idevice + 1)

    ! create the context
    accel%context%cl_context = clCreateContext(platform_id, accel%device%cl_device, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateContext")

    SAFE_DEALLOCATE_A(alldevices)

    ! the command queue is created with profiling enabled
    accel%command_queue = clCreateCommandQueue(accel%context%cl_context, accel%device%cl_device, &
      CL_QUEUE_PROFILING_ENABLE, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateCommandQueue")

    ! query the actual type of the chosen device (device_type is overwritten)
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, device_type, cl_status)

    ! only GPU devices get the shared_mem flag; every other device type does not
    select case(device_type)
    case(CL_DEVICE_TYPE_GPU)
      accel%shared_mem = .true.
    case(CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ACCELERATOR)
      accel%shared_mem = .false.
    case default
      accel%shared_mem = .false.
    end select

#ifdef HAVE_CLBLAS
    call clblasSetup(cl_status)
    if(cl_status /= clblasSuccess) call clblas_print_error(cl_status, 'clblasSetup')
#endif

#ifdef HAVE_CLFFT
    call clfftSetup(cl_status)
    if(cl_status /= CLFFT_SUCCESS) call clfft_print_error(cl_status, 'clfftSetup')
#endif

    call profiling_out(prof_init)
#endif

    ! Get some device information that we will need later
    
    ! total memory
#ifdef HAVE_OPENCL
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_SIZE, accel%global_memory_size, cl_status)
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_LOCAL_MEM_SIZE, accel%local_memory_size, cl_status)
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, accel%max_workgroup_size, cl_status)
    ! no warp size is queried for OpenCL; it is fixed to 1
    accel%warp_size = 1
#endif
#ifdef HAVE_CUDA
    call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
    call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
    call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
    call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
#endif
      
    ! only the root process of base_grp prints the device summary
    if(mpi_grp_is_root(base_grp)) call device_info()

    ! initialize the cache used to speed up allocations
    ! (the second argument is one quarter of the device global memory)
    call alloc_cache_init(memcache, nint(CNST(0.25)*accel%global_memory_size, 8))
    
    ! now initialize the kernels
    call accel_kernel_global_init()

    ! each call gives the kernel descriptor, the source file and the kernel function name
    call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
    call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
    call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
    call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
    call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
    call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
    call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
    call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
    call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
    call accel_kernel_start_call(kernel_subarray_gather, 'subarray.cl', "subarray_gather")
    call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
    call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
    call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
    call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
    call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(kernel_mod_sqr_real, 'mod_sqr.cl', "mod_sqr_real")
    call accel_kernel_start_call(kernel_mod_sqr_complex, 'mod_sqr.cl', "mod_sqr_complex")

    !%Variable AccelBenchmark
    !%Type logical
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% If this variable is set to yes, Octopus will run some
    !% routines to benchmark the performance of the accelerator device.
    !%End
    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)

    call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
    
    if(run_benchmark) then
      call opencl_check_bandwidth()
    end if

    !%Variable CudaAwareMPI
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with CUDA support and MPI support and if the MPI
    !% implementation is CUDA-aware (i.e., it supports communication using device pointers),
    !% this switch can be set to true to use the CUDA-aware MPI features. The advantage
    !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
    !% going through the host memory.
    !% The default is false, except when the configure switch --enable-cudampi is set, in which
    !% case this variable is set to true.
    !%End
    ! the default follows the build configuration (HAVE_CUDA_MPI)
#ifdef HAVE_CUDA_MPI
    default = .true.
#else
    default = .false.
#endif
    call parse_variable(namespace, 'CudaAwareMPI', default, accel%cuda_mpi)
    if(accel%cuda_mpi) then
      call messages_write("Using CUDA-aware MPI.")
      call messages_new_line()
      call messages_write("If you use more than one rank per GPU, make sure to run the NVIDIA MPS daemon.")
      call messages_new_line()
      call messages_write("Otherwise, you may get incorrect results.")
      call messages_info()
    end if

    call messages_print_stress(stdout)

    POP_SUB(accel_init)

  contains

    !> Assign an OpenCL device to this MPI process and report the distribution.
    !!
    !! idevice : on exit (MPI + OpenCL builds) the zero-based device index, computed
    !!           as the rank of this process in base_grp modulo the number of
    !!           candidate devices (host variable ndevices). In any other build
    !!           the argument is left unchanged.
    !!
    !! The debug call-stack entry is now pushed outside the preprocessor guard, so
    !! that it always pairs with the unconditional pop at the end of the routine.
    subroutine select_device(idevice)
      integer, intent(inout) :: idevice
#if defined(HAVE_MPI) && defined(HAVE_OPENCL)
      integer :: irank
      character(len=256) :: device_name
#endif

      PUSH_SUB(accel_init.select_device)

#if defined(HAVE_MPI) && defined(HAVE_OPENCL)
      ! round-robin distribution of the processes over the available devices
      idevice = mod(base_grp%rank, ndevices)

      call MPI_Barrier(base_grp%comm, mpi_err)
      call messages_write('Info: CL device distribution:')
      call messages_info()
      ! every process reports in turn; the barrier inside the loop keeps the output ordered by rank
      do irank = 0, base_grp%size - 1
        if(irank == base_grp%rank) then
          call clGetDeviceInfo(alldevices(idevice + 1), CL_DEVICE_NAME, device_name, cl_status)
          call messages_write('      MPI node ')
          call messages_write(base_grp%rank)
          call messages_write(' -> CL device ')
          call messages_write(idevice)
          call messages_write(' : '//device_name)
          call messages_info(all_nodes = .true.)
        end if
        call MPI_Barrier(base_grp%comm, mpi_err)
      end do
#endif

      POP_SUB(accel_init.select_device)
    end subroutine select_device

    !> Print a summary of the selected accelerator device through the messages module.
    !!
    !! The routine takes no arguments; it reads the module variable accel and queries
    !! the device through the OpenCL or CUDA wrappers, depending on the build.
    !!
    !! Fixes with respect to the previous version:
    !!  - the CUDA capability line is only printed in CUDA builds; before, OpenCL
    !!    builds printed the uninitialized variables major and minor;
    !!  - the clock frequency is labelled in MHz, the unit in which OpenCL reports
    !!    CL_DEVICE_MAX_CLOCK_FREQUENCY (the old label said GHz).
    subroutine device_info()
#ifdef HAVE_OPENCL
      integer(8) :: val
#endif
#ifdef HAVE_CUDA
      integer :: version
      integer :: major, minor
#endif
      character(len=256) :: val_str
      
      PUSH_SUB(accel_init.device_info)

      call messages_new_line()
      call messages_write('Selected device:')
      call messages_new_line()

#ifdef HAVE_OPENCL
      call messages_write('      Framework              : OpenCL')
#endif
#ifdef HAVE_CUDA
      call messages_write('      Framework              : CUDA')
#endif
      call messages_info()

      ! device type and vendor: fixed strings for CUDA, queried from the device for OpenCL
#ifdef HAVE_CUDA
      call messages_write('      Device type            : GPU', new_line = .true.)
      call messages_write('      Device vendor          : NVIDIA Corporation', new_line = .true.)
#endif

#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, val, cl_status)
      call messages_write('      Device type            :')
      ! device types other than the three below are printed as an empty field
      select case(int(val, 4))
      case(CL_DEVICE_TYPE_GPU)
        call messages_write(' GPU')
      case(CL_DEVICE_TYPE_CPU)
        call messages_write(' CPU')
      case(CL_DEVICE_TYPE_ACCELERATOR)
        call messages_write(' accelerator')
      end select
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_VENDOR, val_str, cl_status)
      call messages_write('      Device vendor          : '//trim(val_str))
      call messages_new_line()
#endif
      
#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_NAME, val_str, cl_status)
#endif
#ifdef HAVE_CUDA
      call cuda_device_name(accel%device%cuda_device, val_str)
#endif
      call messages_write('      Device name            : '//trim(val_str))
      call messages_new_line()
      
      ! the compute capability only exists for CUDA devices
#ifdef HAVE_CUDA
      call cuda_device_capability(accel%device%cuda_device, major, minor)
      call messages_write('      Cuda capabilities      :')
      call messages_write(major, fmt = '(i2)')
      call messages_write('.')
      call messages_write(minor, fmt = '(i1)')
      call messages_new_line()
#endif

      ! VERSION
#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DRIVER_VERSION, val_str, cl_status)
      call messages_write('      Driver version         : '//trim(val_str))
#endif
#ifdef HAVE_CUDA
      call cuda_driver_version(version)
      call messages_write('      Driver version         : ')
      call messages_write(version)
#endif
      call messages_new_line()

      
#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_COMPUTE_UNITS, val, cl_status)
      call messages_write('      Compute units          :')
      call messages_write(val)
      call messages_new_line()

      ! OpenCL reports the maximum clock frequency in MHz
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, val, cl_status)
      call messages_write('      Clock frequency        :')
      call messages_write(val)
      call messages_write(' MHz')
      call messages_new_line()
#endif

      ! the memory sizes below were stored in accel by the host routine before this call
      call messages_write('      Device memory          :')
      call messages_write(accel%global_memory_size, units = unit_megabytes)
      call messages_new_line()

      call messages_write('      Local/shared memory    :')
      call messages_write(accel%local_memory_size, units = unit_kilobytes)
      call messages_new_line()
      
    
#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, val, cl_status)
      call messages_write('      Max alloc size         :')
      call messages_write(val, units = unit_megabytes)
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, val, cl_status)
      call messages_write('      Device cache           :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, val, cl_status)
      call messages_write('      Constant memory        :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()
#endif

      call messages_write('      Max. group/block size  :')
      call messages_write(accel%max_workgroup_size)
      call messages_new_line()
      

      ! double-precision support is advertised through either of these two extensions
#ifdef HAVE_OPENCL
      call messages_write('      Extension cl_khr_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
      call messages_new_line()

      call messages_write('      Extension cl_amd_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
      call messages_new_line()
#endif
      
      call messages_info()


      POP_SUB(accel_init.device_info)
    end subroutine device_info

  end subroutine accel_init

  ! ------------------------------------------

  !> Translate an OpenCL platform name into one of the CL_PLAT_* vendor codes.
  !!
  !! platform_name : name string reported by the platform.
  !!
  !! The vendor tags are searched as case-sensitive substrings. When the name contains
  !! more than one tag the precedence is Intel, NVIDIA, ATI, AMD. CL_PLAT_INVALID is
  !! returned when no tag is found.
  integer function get_platform_id(platform_name) result(platform_id)
    character(len=*), intent(in) :: platform_name

    if(index(platform_name, 'Intel') > 0) then
      platform_id = CL_PLAT_INTEL
    else if(index(platform_name, 'NVIDIA') > 0) then
      platform_id = CL_PLAT_NVIDIA
    else if(index(platform_name, 'ATI') > 0) then
      platform_id = CL_PLAT_ATI
    else if(index(platform_name, 'AMD') > 0) then
      platform_id = CL_PLAT_AMD
    else
      platform_id = CL_PLAT_INVALID
    end if

  end function get_platform_id

  ! ------------------------------------------

  !> Shut down the accelerator layer.
  !!
  !! When acceleration is enabled the routine releases every buffer still held by the
  !! allocation cache, prints the cache statistics, releases the backend handles and
  !! finally aborts with a fatal message if device buffers were left allocated
  !! (buffer_alloc_count is not zero). The kernel registry and the clBLAS / clFFT
  !! libraries are torn down regardless of the enabled flag, as before.
  !!
  !! Fixes with respect to the previous version:
  !!  - the hit ratios are reported as zero when the cache served no allocations,
  !!    instead of evaluating 0/0;
  !!  - the status returned by clReleaseContext is checked, like the one of
  !!    clReleaseCommandQueue.
  subroutine accel_end()
#ifdef HAVE_OPENCL
    integer :: ierr
#endif
    integer(8) :: hits, misses
    real(8) :: volume_hits, volume_misses
    real(8) :: hit_ratio, volume_hit_ratio
    logical :: found
    type(accel_mem_t) :: tmp

    PUSH_SUB(accel_end)

    if(accel_is_enabled()) then

      ! empty the cache: fetch cached allocations of any size until none is left
      do 
        call alloc_cache_get(memcache, ALLOC_CACHE_ANY_SIZE, found, tmp%mem)
        if(.not. found) exit

#ifdef HAVE_OPENCL
        call clReleaseMemObject(tmp%mem, ierr)
        if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
#ifdef HAVE_CUDA
        call cuda_mem_free(tmp%mem)
#endif
      end do

      call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

      ! percentages of allocations (by count and by volume) served from the cache;
      ! both stay at zero when nothing was allocated at all
      hit_ratio = 0.0_8
      if(hits + misses > 0) hit_ratio = hits/dble(hits + misses)*100
      volume_hit_ratio = 0.0_8
      if(volume_hits + volume_misses > 0.0_8) then
        volume_hit_ratio = volume_hits/(volume_hits + volume_misses)*100
      end if

      call messages_print_stress(stdout, "Acceleration-device allocation cache")

      call messages_new_line()
      call messages_write('    Number of allocations    =')
      call messages_write(hits + misses, new_line = .true.)
      call messages_write('    Volume of allocations    =')
      call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
        new_line = .true.)
      call messages_write('    Hit ratio                =')
      call messages_write(hit_ratio, fmt='(f6.1)', align_left = .true.)
      call messages_write('%', new_line = .true.)
      call messages_write('    Volume hit ratio         =')
      call messages_write(volume_hit_ratio, fmt='(f6.1)', align_left = .true.)
      call messages_write('%')
      call messages_new_line()
      call messages_info()

      call messages_print_stress(stdout)
    end if
    
    call accel_kernel_global_end()

#ifdef HAVE_CLBLAS
    call clblasTearDown()
#endif

#ifdef HAVE_CLFFT
    call clfftTearDown()
#endif

    if(accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cublas_end(accel%cublas_handle)
      call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
#endif

#ifdef HAVE_OPENCL
      call clReleaseCommandQueue(accel%command_queue, ierr)

      if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "ReleaseCommandQueue")
      ! the status is still stored in the public module variable cl_status, as before
      call clReleaseContext(accel%context%cl_context, cl_status)
      if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "ReleaseContext")
#endif
      
      ! buffers that were created but never released are treated as a fatal error
      if(buffer_alloc_count /= 0) then
        call messages_write('Accel:')
        call messages_write(real(allocated_mem, REAL_PRECISION), fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
        call messages_write(' in ')
        call messages_write(buffer_alloc_count)
        call messages_write(' buffers were not deallocated.')
        call messages_fatal()
      end if

    end if

    POP_SUB(accel_end)
  end subroutine accel_end

  ! ------------------------------------------

  !> Mark a buffer descriptor (or every element of an array of them) as not allocated.
  !!
  !! this : descriptor to reset. Only the bookkeeping components are assigned; the
  !!        device handle and the element type are not set by this routine.
  elemental subroutine accel_mem_nullify(this)
    type(accel_mem_t), intent(out) :: this

    !> To be implemented.
    this%allocated = .false.
    this%flags = 0
    this%size = 0

  end subroutine accel_mem_nullify

  ! ------------------------------------------

  !> Round a size up to a multiple of the maximum work-group size.
  !!
  !! nn : size to pad.
  !!
  !! Returns nn unchanged when acceleration is disabled or when nn already is a
  !! multiple of the value returned by accel_max_workgroup_size.
  integer function accel_padded_size(nn) result(psize)
    integer,        intent(in) :: nn

    integer :: block_size, remainder

    psize = nn

    ! no padding is needed without an accelerator
    if(.not. accel_is_enabled()) return

    block_size = accel_max_workgroup_size()
    remainder = mod(nn, block_size)

    ! already aligned
    if(remainder == 0) return

    psize = nn + block_size - remainder

  end function accel_padded_size

  ! ------------------------------------------

  !> Variant of the generic accel_create_buffer for a default-integer element count.
  !! The count is widened to an 8-byte integer and the work is delegated to
  !! accel_create_buffer_8; the other arguments are passed through unchanged.
  subroutine accel_create_buffer_4(this, flags, type, size)
    type(accel_mem_t),  intent(inout) :: this
    integer,            intent(in)    :: flags
    type(type_t),       intent(in)    :: type
    integer,            intent(in)    :: size

    integer(8) :: size8

    size8 = int(size, 8)
    call accel_create_buffer_8(this, flags, type, size8)

  end subroutine accel_create_buffer_4

  ! ------------------------------------------

  subroutine accel_create_buffer_8(this, flags, type, size)
    type(accel_mem_t),  intent(inout) :: this
    integer,            intent(in)    :: flags
    type(type_t),       intent(in)    :: type
    integer(8),         intent(in)    :: size

    integer(8) :: fsize
    logical    :: found
#ifdef HAVE_OPENCL
    integer :: ierr
#endif

    PUSH_SUB(accel_create_buffer_8)

    this%type = type
    this%size = size
    this%flags = flags
    fsize = int(size, 8)*types_get_size(type)
    this%allocated = .true.
    
    if(fsize > 0) then

      call alloc_cache_get(memcache, fsize, found, this%mem)

      if(.not. found) then
#ifdef HAVE_OPENCL
        this%mem = clCreateBuffer(accel%context%cl_context, flags, fsize, ierr)
        if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateBuffer")
#endif
#ifdef HAVE_CUDA
        call cuda_mem_alloc(this%mem, fsize)