accel.F90

!! Copyright (C) 2010-2016 X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"
#endif

#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
#define HAVE_ACCEL 1
#endif

module accel_oct_m
  use alloc_cache_oct_m
#ifdef HAVE_OPENCL
  use cl
#endif
#ifdef HAVE_CLBLAS
  use clblas
#endif
  use cuda_oct_m
#ifdef HAVE_CLFFT
  use clfft
#endif
  use debug_oct_m
  use global_oct_m
  use iso_c_binding
  use loct_oct_m
  use messages_oct_m
  use mpi_oct_m
  use namespace_oct_m
  use types_oct_m
  use parser_oct_m
  use profiling_oct_m
  use unit_system_oct_m

  implicit none

  private

  public ::                       &
    accel_context_t,              &
    accel_device_t,               &
    accel_mem_t,                  &
    accel_kernel_t,               &
    accel_t,                      &
    accel_is_enabled,             &
    accel_allow_CPU_only,         &
    accel_init,                   &
    accel_end,                    &
    accel_padded_size,            &
    accel_kernel_start_call,      &
    accel_kernel_build,           &
    accel_create_buffer,          &
    accel_write_buffer,           &
    accel_read_buffer,            &
    accel_release_buffer,         &
    accel_buffer_is_allocated,    &
    accel_finish,                 &
    accel_set_kernel_arg,         &
    accel_max_workgroup_size,     &
    accel_kernel_workgroup_size,  &
    accel_kernel_run,             &
    accel_set_buffer_to_zero,     &
    accel_use_shared_mem,         &
    clblas_print_error,           &
    clfft_print_error,            &
    accel_local_memory_size,      &
    accel_global_memory_size,     &
    accel_max_size_per_dim,       &
    accel_get_device_pointer,     &
    daccel_get_pointer_with_offset,&
    zaccel_get_pointer_with_offset,&
    accel_clean_pointer,          &
    accel_set_stream,             &
    accel_synchronize_all_streams

#ifdef HAVE_OPENCL
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = CL_MEM_READ_ONLY,    &
    ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE,   &
    ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
#else
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = 0,                   &
    ACCEL_MEM_READ_WRITE = 1,                   &
    ACCEL_MEM_WRITE_ONLY = 2
#endif

  type accel_context_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
    type(c_ptr)      :: cuda_context
#else
    integer          :: dummy
#endif
  end type accel_context_t

  type accel_device_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
    type(c_ptr)      :: cuda_device
#else
    integer         :: dummy
#endif
  end type accel_device_t

  type accel_t
    ! Components are public by default
    type(accel_context_t)  :: context
    type(accel_device_t)   :: device
#ifdef HAVE_OPENCL
    type(cl_command_queue) :: command_queue
#endif
    type(c_ptr)            :: cublas_handle
    type(c_ptr)            :: cuda_stream
    type(c_ptr)            :: module_map
    integer                :: max_workgroup_size
    integer(i8)            :: local_memory_size
    integer(i8)            :: global_memory_size
    logical                :: enabled
    logical                :: allow_CPU_only
    logical                :: shared_mem
    logical                :: cuda_mpi
    integer                :: warp_size
  end type accel_t

  type accel_mem_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_mem)           :: mem
#else
    type(c_ptr)            :: mem
#endif
    integer(SIZEOF_SIZE_T) :: size = 0
    type(type_t)           :: type
    integer                :: flags = 0
    logical                :: allocated = .false.
  end type accel_mem_t

  type accel_kernel_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_kernel)               :: kernel
#endif
#ifdef HAVE_CUDA
    type(c_ptr)                   :: cuda_kernel
    type(c_ptr)                   :: cuda_module
    type(c_ptr)                   :: arguments
#endif
    integer(i8)                   :: cuda_shared_mem
    logical                       :: initialized = .false.
    type(accel_kernel_t), pointer :: next
    integer                       :: arg_count
  end type accel_kernel_t

  type(accel_t), public :: accel

  ! the kernels
  type(accel_kernel_t), public, target, save :: kernel_vpsi
  type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
  type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
  type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
  type(accel_kernel_t), public, target, save :: kernel_daxpy
  type(accel_kernel_t), public, target, save :: kernel_zaxpy
  type(accel_kernel_t), public, target, save :: kernel_copy
  type(accel_kernel_t), public, target, save :: dpack
  type(accel_kernel_t), public, target, save :: zpack
  type(accel_kernel_t), public, target, save :: dunpack
  type(accel_kernel_t), public, target, save :: zunpack
  type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
  type(accel_kernel_t), public, target, save :: kernel_density_real
  type(accel_kernel_t), public, target, save :: kernel_density_complex
  type(accel_kernel_t), public, target, save :: kernel_density_spinors
  type(accel_kernel_t), public, target, save :: kernel_phase
  type(accel_kernel_t), public, target, save :: kernel_phase_spiral
  type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
  type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
  type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
  type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
  type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
  type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
  type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
  type(accel_kernel_t), public, target, save :: dzmul
  type(accel_kernel_t), public, target, save :: zzmul
  type(accel_kernel_t), public, target, save :: set_one

  ! kernels used locally
  type(accel_kernel_t), save :: set_zero

  interface accel_padded_size
    module procedure accel_padded_size_i8, accel_padded_size_i4
  end interface accel_padded_size

  interface accel_create_buffer
    module procedure accel_create_buffer_4, accel_create_buffer_8
  end interface accel_create_buffer

  interface accel_kernel_run
    module procedure accel_kernel_run_4, accel_kernel_run_8
  end interface accel_kernel_run

  interface accel_set_buffer_to_zero
    module procedure accel_set_buffer_to_zero_i8, accel_set_buffer_to_zero_i4
  end interface accel_set_buffer_to_zero

  interface accel_write_buffer
    module procedure iaccel_write_buffer_single, laccel_write_buffer_single, daccel_write_buffer_single, zaccel_write_buffer_single
    module procedure iaccel_write_buffer_0, laccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
    module procedure iaccel_write_buffer_1, laccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
    module procedure iaccel_write_buffer_2, laccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
    module procedure iaccel_write_buffer_3, laccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
    module procedure iaccel_write_buffer_0_i4, laccel_write_buffer_0_i4, daccel_write_buffer_0_i4, zaccel_write_buffer_0_i4
    module procedure iaccel_write_buffer_1_i4, laccel_write_buffer_1_i4, daccel_write_buffer_1_i4, zaccel_write_buffer_1_i4
    module procedure iaccel_write_buffer_2_i4, laccel_write_buffer_2_i4, daccel_write_buffer_2_i4, zaccel_write_buffer_2_i4
    module procedure iaccel_write_buffer_3_i4, laccel_write_buffer_3_i4, daccel_write_buffer_3_i4, zaccel_write_buffer_3_i4
  end interface accel_write_buffer

  interface accel_read_buffer
    module procedure iaccel_read_buffer_0, laccel_read_buffer_0, daccel_read_buffer_0, zaccel_read_buffer_0
    module procedure iaccel_read_buffer_1, laccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
    module procedure iaccel_read_buffer_2, laccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
    module procedure iaccel_read_buffer_3, laccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
    module procedure iaccel_read_buffer_0_i4, laccel_read_buffer_0_i4, daccel_read_buffer_0_i4, zaccel_read_buffer_0_i4
    module procedure iaccel_read_buffer_1_i4, laccel_read_buffer_1_i4, daccel_read_buffer_1_i4, zaccel_read_buffer_1_i4
    module procedure iaccel_read_buffer_2_i4, laccel_read_buffer_2_i4, daccel_read_buffer_2_i4, zaccel_read_buffer_2_i4
    module procedure iaccel_read_buffer_3_i4, laccel_read_buffer_3_i4, daccel_read_buffer_3_i4, zaccel_read_buffer_3_i4
  end interface accel_read_buffer

  interface accel_set_kernel_arg
    module procedure                       &
      accel_set_kernel_arg_buffer,  &
      iaccel_set_kernel_arg_data,   &
      laccel_set_kernel_arg_data,   &
      daccel_set_kernel_arg_data,   &
      zaccel_set_kernel_arg_data,   &
      accel_set_kernel_arg_local
  end interface accel_set_kernel_arg

  interface accel_get_device_pointer
    module procedure iaccel_get_device_pointer_1, laccel_get_device_pointer_1
    module procedure iaccel_get_device_pointer_2, laccel_get_device_pointer_2
    module procedure iaccel_get_device_pointer_3, laccel_get_device_pointer_3
    module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
    module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
    module procedure daccel_get_device_pointer_3, zaccel_get_device_pointer_3
    module procedure iaccel_get_device_pointer_1l, laccel_get_device_pointer_1l
    module procedure iaccel_get_device_pointer_2l, laccel_get_device_pointer_2l
    module procedure iaccel_get_device_pointer_3l, laccel_get_device_pointer_3l
    module procedure daccel_get_device_pointer_1l, zaccel_get_device_pointer_1l
    module procedure daccel_get_device_pointer_2l, zaccel_get_device_pointer_2l
    module procedure daccel_get_device_pointer_3l, zaccel_get_device_pointer_3l
  end interface accel_get_device_pointer

  type(profile_t), save :: prof_read, prof_write

  integer, parameter  ::      &
    OPENCL_GPU         = -1,  &
    OPENCL_CPU         = -2,  &
    OPENCL_ACCELERATOR = -3,  &
    OPENCL_DEFAULT     = -4


  integer, parameter  ::      &
    CL_PLAT_INVALID   = -1,   &
    CL_PLAT_AMD       = -2,   &
    CL_PLAT_NVIDIA    = -3,   &
    CL_PLAT_ATI       = -4,   &
    CL_PLAT_INTEL     = -5

  ! a "convenience" public variable
  integer, public :: cl_status

  integer :: buffer_alloc_count
  integer(i8) :: allocated_mem
  type(accel_kernel_t), pointer :: head
  type(alloc_cache_t) :: memcache

contains

  pure logical function accel_is_enabled() result(enabled)
#ifdef HAVE_ACCEL
    enabled = accel%enabled
#else
    enabled = .false.
#endif
  end function accel_is_enabled

  ! ------------------------------------------

  pure logical function accel_allow_CPU_only() result(allow)
#ifdef HAVE_ACCEL
    allow = accel%allow_CPU_only
#else
    allow = .true.
#endif
  end function accel_allow_CPU_only

  ! ------------------------------------------

  subroutine accel_init(base_grp, namespace)
    type(mpi_grp_t),     intent(inout) :: base_grp
    type(namespace_t),   intent(in)    :: namespace

    logical  :: disable, default, run_benchmark
    integer  :: idevice, iplatform
#ifdef HAVE_OPENCL
    integer  :: device_type
    integer :: cl_status, idev
    integer  :: ndevices, ret_devices, nplatforms, iplat
    character(len=256) :: device_name
    type(cl_platform_id) :: platform_id
    type(cl_program) :: prog
    type(cl_platform_id), allocatable :: allplatforms(:)
    type(cl_device_id), allocatable :: alldevices(:)
    type(profile_t), save :: prof_init
#endif
#ifdef HAVE_CUDA
#ifdef HAVE_MPI
    character(len=256) :: sys_name
#endif
#endif

    PUSH_SUB(accel_init)

    buffer_alloc_count = 0

    !%Variable DisableAccel
    !%Type logical
    !%Default yes
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with OpenCL or CUDA support, it will
    !% try to initialize and use an accelerator device. By setting this
    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
    !%End
    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
#ifdef HAVE_ACCEL
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'DisableAccel', default, disable)
    accel%enabled = .not. disable

#ifndef HAVE_ACCEL
    if (accel%enabled) then
      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
      call messages_fatal(1)
    end if
#endif

    if (.not. accel_is_enabled()) then
      POP_SUB(accel_init)
      return
    end if

    !%Variable AccelPlatform
    !%Type integer
    !%Default 0
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL platform that Octopus will
    !% use. You can give an explicit platform number or use one of
    !% the options that select a particular vendor
    !% implementation. Platform 0 is used by default.
    !%
    !% This variable has no effect for CUDA.
    !%Option amd -2
    !% Use the AMD OpenCL platform.
    !%Option nvidia -3
    !% Use the Nvidia OpenCL platform.
    !%Option ati -4
    !% Use the ATI (old AMD) OpenCL platform.
    !%Option intel -5
    !% Use the Intel OpenCL platform.
    !%End
    call parse_variable(namespace, 'AccelPlatform', 0, iplatform)

    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')

    !%Variable AccelDevice
    !%Type integer
    !%Default gpu
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL or CUDA accelerator device
    !% that Octopus will use. You can specify one of the options below
    !% or a numerical id to select a specific device.
    !%
    !% Values >= 0 select the device to be used. In case of MPI enabled runs
    !% devices are distributed in a round robin fashion, starting at this value.
    !%Option gpu -1
    !% If available, Octopus will use a GPU.
    !%Option cpu -2
    !% If available, Octopus will use a CPU (only for OpenCL).
    !%Option accelerator -3
    !% If available, Octopus will use an accelerator (only for OpenCL).
    !%Option accel_default -4
    !% Octopus will use the default device specified by the implementation.
    !% implementation.
    !%End
    call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)

    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')

    if (idevice < OPENCL_DEFAULT) then
      call messages_write('Invalid AccelDevice')
      call messages_fatal()
    end if

    call messages_print_stress(msg="GPU acceleration", namespace=namespace)

#ifdef HAVE_CUDA
    if (idevice<0) idevice = 0
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)
#ifdef HAVE_MPI
    call loct_sysname(sys_name)
    write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
      " on ", trim(sys_name)
    call messages_info(1, all_nodes = .true.)
#endif

    ! no shared mem support in our cuda interface (for the moment)
    accel%shared_mem = .true.

    call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif

#ifdef HAVE_OPENCL
    call profiling_in(prof_init, 'CL_INIT')

    call clGetPlatformIDs(nplatforms, cl_status)
    if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    SAFE_ALLOCATE(allplatforms(1:nplatforms))

    call clGetPlatformIDs(allplatforms, iplat, cl_status)
    if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    call messages_write('Info: Available CL platforms: ')
    call messages_write(nplatforms)
    call messages_info()

    do iplat = 1, nplatforms

      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)

      if (iplatform < 0) then
        if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
      end if

      if (iplatform == iplat - 1) then
        call messages_write('    * Platform ')
      else
        call messages_write('      Platform ')
      end if

      call messages_write(iplat - 1)
      call messages_write(' : '//device_name)
      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
      call messages_write(' ('//trim(device_name)//')')
      call messages_info()
    end do

    call messages_info()

    if (iplatform >= nplatforms .or. iplatform < 0) then
      call messages_write('Requested CL platform does not exist')
      if (iplatform > 0) then
        call messages_write('(platform = ')
        call messages_write(iplatform)
        call messages_write(').')
      end if
      call messages_fatal()
    end if

    platform_id = allplatforms(iplatform + 1)

    SAFE_DEALLOCATE_A(allplatforms)

    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)

    call messages_write('Info: Available CL devices: ')
    call messages_write(ndevices)
    call messages_info()

    SAFE_ALLOCATE(alldevices(1:ndevices))

    ! list all devices

    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)

    do idev = 1, ndevices
      call messages_write('      Device ')
      call messages_write(idev - 1)
      call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
      call messages_write(' : '//device_name)
      call messages_info()
    end do

    select case (idevice)
    case (OPENCL_GPU)
      device_type = CL_DEVICE_TYPE_GPU
    case (OPENCL_CPU)
      device_type = CL_DEVICE_TYPE_CPU
    case (OPENCL_ACCELERATOR)
      device_type = CL_DEVICE_TYPE_ACCELERATOR
    case (OPENCL_DEFAULT)
      device_type = CL_DEVICE_TYPE_DEFAULT
    case default
      device_type = CL_DEVICE_TYPE_ALL
    end select

    ! now get a list of the selected type
    call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)

    if (ret_devices < 1) then
      ! we didnt find a device of the selected type, we ask for the default device
      call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)

      if (ret_devices < 1) then
        ! if this does not work, we ask for all devices
        call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
      end if

      if (ret_devices < 1) then
        call messages_write('Cannot find an OpenCL device')
        call messages_fatal()
      end if
    end if

    ! the number of devices can be smaller
    ndevices = ret_devices

    if (idevice < 0) then
      if (base_grp%size > 1) then
        ! with MPI we have to select the device so multiple GPUs in one
        ! node are correctly distributed
        call select_device(idevice)
      else
        idevice = 0
      end if
    end if

    if (idevice >= ndevices) then
      call messages_write('Requested CL device does not exist (device = ')
      call messages_write(idevice)
      call messages_write(', platform = ')
      call messages_write(iplatform)
      call messages_write(').')
      call messages_fatal()
    end if

    accel%device%cl_device = alldevices(idevice + 1)

    ! create the context
    accel%context%cl_context = clCreateContext(platform_id, accel%device%cl_device, cl_status)
    if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateContext")

    SAFE_DEALLOCATE_A(alldevices)

    accel%command_queue = clCreateCommandQueue(accel%context%cl_context, accel%device%cl_device, &
      CL_QUEUE_PROFILING_ENABLE, cl_status)
    if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateCommandQueue")

    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, device_type, cl_status)

    select case (device_type)
    case (CL_DEVICE_TYPE_GPU)
      accel%shared_mem = .true.
    case (CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ACCELERATOR)
      accel%shared_mem = .false.
    case default
      accel%shared_mem = .false.
    end select

#ifdef HAVE_CLBLAS
    call clblasSetup(cl_status)
    if (cl_status /= clblasSuccess) call clblas_print_error(cl_status, 'clblasSetup')
#endif

#ifdef HAVE_CLFFT
    call clfftSetup(cl_status)
    if (cl_status /= CLFFT_SUCCESS) call clfft_print_error(cl_status, 'clfftSetup')
#endif

    call profiling_out(prof_init)
#endif

    ! Get some device information that we will need later

    ! total memory
#ifdef HAVE_OPENCL
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_SIZE, accel%global_memory_size, cl_status)
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_LOCAL_MEM_SIZE, accel%local_memory_size, cl_status)
    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, accel%max_workgroup_size, cl_status)
    accel%warp_size = 1
#endif
#ifdef HAVE_CUDA
    call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
    call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
    call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
    call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
#endif

    if (mpi_grp_is_root(base_grp)) call device_info()

    ! initialize the cache used to speed up allocations
    call alloc_cache_init(memcache, nint(CNST(0.25)*accel%global_memory_size, 8))

    ! now initialize the kernels
    call accel_kernel_global_init()

    call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
    call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
    call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
    call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
    call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
    call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
    call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", flags = '-lineinfo -DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", flags = '-lineinfo -DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", flags = '-lineinfo -DRTYPE_DOUBLE')
    call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", flags = '-lineinfo -DRTYPE_COMPLEX')
    call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp", flags = '-lineinfo')
    call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp", flags = '-lineinfo')
    call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
    call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
    call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
    call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
    call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
    call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
    call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
    call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
    call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
    call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
    call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
    call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")


    call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
    call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')

    !%Variable AccelBenchmark
    !%Type logical
    !%Default no
    !%Section Execution::Accel
    !%Description
    !% If this variable is set to yes, Octopus will run some
    !% routines to benchmark the performance of the accelerator device.
    !%End
    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)

    call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')

    if (run_benchmark) then
      call opencl_check_bandwidth()
    end if

    !%Variable CudaAwareMPI
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with CUDA support and MPI support and if the MPI
    !% implementation is CUDA-aware (i.e., it supports communication using device pointers),
    !% this switch can be set to true to use the CUDA-aware MPI features. The advantage
    !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
    !% going through the host memmory.
    !% The default is false, except when the configure switch --enable-cudampi is set, in which
    !% case this variable is set to true.
    !%End
#ifdef HAVE_CUDA_MPI
    default = .true.
#else
    default = .false.
#endif
    call parse_variable(namespace, 'CudaAwareMPI', default, accel%cuda_mpi)
    if (accel%cuda_mpi) then
      call messages_write("Using CUDA-aware MPI.")
      call messages_info()
    end if


    !%Variable AllowCPUonly
    !%Type logical
    !%Section Execution::Accel
    !%Description
    !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
    !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
    !% code execution also in these cases.
    !%End
#if defined (HAVE_ACCEL)
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)


    call messages_print_stress(namespace=namespace)

    POP_SUB(accel_init)

  contains

#if defined(HAVE_OPENCL)
    subroutine select_device(idevice)
      integer, intent(inout) :: idevice
      integer :: irank
      character(len=256) :: device_name

      PUSH_SUB(accel_init.select_device)

      idevice = mod(base_grp%rank, ndevices)

      call base_grp%barrier()
      call messages_write('Info: CL device distribution:')
      call messages_info()
      do irank = 0, base_grp%size - 1
        if (irank == base_grp%rank) then
          call clGetDeviceInfo(alldevices(idevice + 1), CL_DEVICE_NAME, device_name, cl_status)
          call messages_write('      MPI node ')
          call messages_write(base_grp%rank)
          call messages_write(' -> CL device ')
          call messages_write(idevice)
          call messages_write(' : '//device_name)
          call messages_info(all_nodes = .true.)
        end if
        call base_grp%barrier()
      end do

      POP_SUB(accel_init.select_device)
    end subroutine select_device
#endif

    subroutine device_info()
#ifdef HAVE_OPENCL
      integer(i8) :: val
#endif
#ifdef HAVE_CUDA
      integer :: version
#endif
      integer :: major, minor
      character(len=256) :: val_str

      PUSH_SUB(accel_init.device_info)

      call messages_new_line()
      call messages_write('Selected device:')
      call messages_new_line()

#ifdef HAVE_OPENCL
      call messages_write('      Framework              : OpenCL')
#endif
#ifdef HAVE_CUDA
      call messages_write('      Framework              : CUDA')
#endif
      call messages_info()

#ifdef HAVE_CUDA
      call messages_write('      Device type            : GPU', new_line = .true.)
      call messages_write('      Device vendor          : NVIDIA Corporation', new_line = .true.)
#endif

#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, val, cl_status)
      call messages_write('      Device type            :')
      select case (int(val, 4))
      case (CL_DEVICE_TYPE_GPU)
        call messages_write(' GPU')
      case (CL_DEVICE_TYPE_CPU)
        call messages_write(' CPU')
      case (CL_DEVICE_TYPE_ACCELERATOR)
        call messages_write(' accelerator')
      end select
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_VENDOR, val_str, cl_status)
      call messages_write('      Device vendor          : '//trim(val_str))
      call messages_new_line()
#endif

#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_NAME, val_str, cl_status)
#endif
#ifdef HAVE_CUDA
      call cuda_device_name(accel%device%cuda_device, val_str)
#endif
      call messages_write('      Device name            : '//trim(val_str))
      call messages_new_line()

#ifdef HAVE_CUDA
      call cuda_device_capability(accel%device%cuda_device, major, minor)
#endif
      call messages_write('      Cuda capabilities      :')
      call messages_write(major, fmt = '(i2)')
      call messages_write('.')
      call messages_write(minor, fmt = '(i1)')
      call messages_new_line()

      ! VERSION
#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DRIVER_VERSION, val_str, cl_status)
      call messages_write('      Driver version         : '//trim(val_str))
#endif
#ifdef HAVE_CUDA
      call cuda_driver_version(version)
      call messages_write('      Driver version         : ')
      call messages_write(version)
#endif
      call messages_new_line()


#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_COMPUTE_UNITS, val, cl_status)
      call messages_write('      Compute units          :')
      call messages_write(val)
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, val, cl_status)
      call messages_write('      Clock frequency        :')
      call messages_write(val)
      call messages_write(' GHz')
      call messages_new_line()
#endif

      call messages_write('      Device memory          :')
      call messages_write(accel%global_memory_size, units=unit_megabytes)
      call messages_new_line()

      call messages_write('      Local/shared memory    :')
      call messages_write(accel%local_memory_size, units=unit_kilobytes)
      call messages_new_line()


#ifdef HAVE_OPENCL
      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, val, cl_status)
      call messages_write('      Max alloc size         :')
      call messages_write(val, units = unit_megabytes)
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, val, cl_status)
      call messages_write('      Device cache           :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()

      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, val, cl_status)
      call messages_write('      Constant memory        :')
      call messages_write(val, units = unit_kilobytes)
      call messages_new_line()
#endif

      call messages_write('      Max. group/block size  :')
      call messages_write(accel%max_workgroup_size)
      call messages_new_line()


#ifdef HAVE_OPENCL
      call messages_write('      Extension cl_khr_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
      call messages_new_line()

      call messages_write('      Extension cl_amd_fp64  :')
      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
      call messages_new_line()
#endif

      call messages_info()


      POP_SUB(accel_init.device_info)
    end subroutine device_info

  end subroutine accel_init

  ! ------------------------------------------
#ifdef HAVE_OPENCL
  integer function get_platform_id(platform_name) result(platform_id)
    character(len=*), intent(in) :: platform_name

    platform_id = CL_PLAT_INVALID
    if (index(platform_name, 'AMD') > 0)    platform_id = CL_PLAT_AMD
    if (index(platform_name, 'ATI') > 0)    platform_id = CL_PLAT_ATI
    if (index(platform_name, 'NVIDIA') > 0) platform_id = CL_PLAT_NVIDIA
    if (index(platform_name, 'Intel') > 0)  platform_id = CL_PLAT_INTEL
  end function get_platform_id
#endif
  ! ------------------------------------------

  subroutine accel_end(namespace)
    type(namespace_t), intent(in) :: namespace

#ifdef HAVE_OPENCL
    integer :: ierr
#endif
    integer(i8) :: hits, misses
    real(r8) :: volume_hits, volume_misses
    logical :: found
    type(accel_mem_t) :: tmp

    PUSH_SUB(accel_end)

    if (accel_is_enabled()) then

      do
        call alloc_cache_get(memcache, ALLOC_CACHE_ANY_SIZE, found, tmp%mem)
        if (.not. found) exit

#ifdef HAVE_OPENCL
        call clReleaseMemObject(tmp%mem, ierr)
        if (ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
#ifdef HAVE_CUDA
        call cuda_mem_free(tmp%mem)
#endif
      end do

      call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

      call messages_print_stress(msg="Acceleration-device allocation cache", namespace=namespace)

      call messages_new_line()
      call messages_write('    Number of allocations    =')
      call messages_write(hits + misses, new_line = .true.)
      call messages_write('    Volume of allocations    =')
      call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
        new_line = .true.)
      call messages_write('    Hit ratio                =')
      call messages_write(hits/TOFLOAT(hits + misses)*100, fmt='(f6.1)', align_left = .true.)
      call messages_write('%', new_line = .true.)
      call messages_write('    Volume hit ratio         =')
      call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
      call messages_write('%')
      call messages_new_line()
      call messages_info()

      call messages_print_stress(namespace=namespace)
    end if

    call accel_kernel_global_end()

#ifdef HAVE_CLBLAS
    call clblasTearDown()
#endif

#ifdef HAVE_CLFFT
    call clfftTearDown()
#endif

    if (accel_is_enabled()) then
#ifdef HAVE_CUDA
      call cublas_end(accel%cublas_handle)
      if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
        call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
      end if
#endif

#ifdef HAVE_OPENCL
      call clReleaseCommandQueue(accel%command_queue, ierr)

      if (ierr /= CL_SUCCESS) call opencl_print_error(ierr, "ReleaseCommandQueue")
      call clReleaseContext(accel%context%cl_context, cl_status)
#endif

      if (buffer_alloc_count /= 0) then
        call messages_write('Accel:')
        call messages_write(TOFLOAT(allocated_mem), fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
        call messages_write(' in ')
        call messages_write(buffer_alloc_count)
        call messages_write(' buffers were not deallocated.')
        call messages_fatal()
      end if