accel.F90

!! Copyright (C) 2010-2016 X. Andrade
!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!

#include "global.h"

#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"
#endif

#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
#define HAVE_ACCEL 1
#endif

module accel_oct_m
  use alloc_cache_oct_m
#ifdef HAVE_OPENCL
  use cl
#endif
#ifdef HAVE_CLBLAS
  use clblas
#endif
  use cuda_oct_m
#ifdef HAVE_CLFFT
  use clfft
#endif
  use global_oct_m
  use iso_c_binding
  use loct_oct_m
  use messages_oct_m
  use mpi_oct_m
  use namespace_oct_m
  use types_oct_m
  use parser_oct_m
  use profiling_oct_m
  use unit_system_oct_m

  implicit none 

  private
  
  public ::                       &
    accel_context_t,              &
    accel_device_t,               &
    accel_mem_t,                  &
    accel_kernel_t,               &
    accel_t,                      &
    accel_is_enabled,             &
    accel_init,                   &
    accel_end,                    &
    accel_padded_size,            &
    accel_mem_nullify,            &
    accel_kernel_start_call,      &
    accel_kernel_build,           &
    accel_create_buffer,          &
    accel_write_buffer,           &
    accel_read_buffer,            &
    accel_release_buffer,         &
    accel_buffer_is_allocated,    &
    accel_finish,                 &
    accel_set_kernel_arg,         &
    accel_max_workgroup_size,     &
    accel_kernel_workgroup_size,  &
    accel_kernel_run,             &
    accel_set_buffer_to_zero,     &
    accel_use_shared_mem,         &
    clblas_print_error,           &
    clfft_print_error,            &
    accel_local_memory_size,      &
    accel_global_memory_size,     &
    accel_max_size_per_dim,       &
    accel_get_device_pointer,     &
    accel_set_stream,             &
    accel_synchronize_all_streams
  
#ifdef HAVE_OPENCL
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = CL_MEM_READ_ONLY,    &
    ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE,   &
    ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
#else
  integer, public, parameter ::                 &
    ACCEL_MEM_READ_ONLY  = 0,                   &
    ACCEL_MEM_READ_WRITE = 1,                   &
    ACCEL_MEM_WRITE_ONLY = 2
#endif

  type accel_context_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
    type(c_ptr)      :: cuda_context
#else
    integer          :: dummy
#endif
  end type accel_context_t

  type accel_device_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
    type(c_ptr)      :: cuda_device
#else
    integer         :: dummy
#endif
  end type accel_device_t

  type accel_t
    ! Components are public by default
    type(accel_context_t)  :: context
    type(accel_device_t)   :: device
#ifdef HAVE_OPENCL
    type(cl_command_queue) :: command_queue
#endif
    type(c_ptr)            :: cublas_handle
    type(c_ptr)            :: cuda_stream
    type(c_ptr)            :: module_map
    integer                :: max_workgroup_size
    integer(8)             :: local_memory_size
    integer(8)             :: global_memory_size
    logical                :: enabled
    logical                :: shared_mem
    logical                :: cuda_mpi
    integer                :: warp_size
  end type accel_t

  type accel_mem_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_mem)           :: mem
#else
    type(c_ptr)            :: mem
#endif
    integer(SIZEOF_SIZE_T) :: size
    type(type_t)           :: type
    integer                :: flags
    logical                :: allocated
  end type accel_mem_t

  type accel_kernel_t
    ! Components are public by default
#ifdef HAVE_OPENCL
    type(cl_kernel)               :: kernel
#endif
#ifdef HAVE_CUDA
    type(c_ptr)                   :: cuda_kernel
    type(c_ptr)                   :: cuda_module
    type(c_ptr)                   :: arguments
#endif
    integer(8)                    :: cuda_shared_mem
    logical                       :: initialized = .false.
    type(accel_kernel_t), pointer :: next
    integer                       :: arg_count
  end type accel_kernel_t

  type(accel_t), public :: accel

  ! the kernels
  type(accel_kernel_t), public, target, save :: kernel_vpsi
  type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
  type(accel_kernel_t), public, target, save :: kernel_daxpy
  type(accel_kernel_t), public, target, save :: kernel_zaxpy
  type(accel_kernel_t), public, target, save :: kernel_copy
  type(accel_kernel_t), public, target, save :: kernel_complex_conj
  type(accel_kernel_t), public, target, save :: kernel_complex_conj_combine
  type(accel_kernel_t), public, target, save :: dpack
  type(accel_kernel_t), public, target, save :: zpack
  type(accel_kernel_t), public, target, save :: dunpack
  type(accel_kernel_t), public, target, save :: zunpack
  type(accel_kernel_t), public, target, save :: kernel_subarray_gather
  type(accel_kernel_t), public, target, save :: kernel_density_real
  type(accel_kernel_t), public, target, save :: kernel_density_complex
  type(accel_kernel_t), public, target, save :: kernel_density_spinors
  type(accel_kernel_t), public, target, save :: kernel_phase
  type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
  type(accel_kernel_t), public, target, save :: dzmul
  type(accel_kernel_t), public, target, save :: zzmul
  type(accel_kernel_t), public, target, save :: set_one

  ! kernels used locally
  type(accel_kernel_t), save :: set_zero

  interface accel_create_buffer
    module procedure accel_create_buffer_4, accel_create_buffer_8
  end interface accel_create_buffer

  interface accel_write_buffer
    module procedure iaccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
    module procedure iaccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
    module procedure iaccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
    module procedure iaccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
  end interface accel_write_buffer

  interface accel_read_buffer
    module procedure iaccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
    module procedure iaccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
    module procedure iaccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
  end interface accel_read_buffer

  interface accel_set_kernel_arg
    module procedure                       &
      accel_set_kernel_arg_buffer,  &
      iaccel_set_kernel_arg_data,   &
      daccel_set_kernel_arg_data,   &
      zaccel_set_kernel_arg_data,   &
      accel_set_kernel_arg_local
  end interface accel_set_kernel_arg

  interface accel_get_device_pointer
    module procedure iaccel_get_device_pointer_1
    module procedure iaccel_get_device_pointer_2
    module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
    module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
  end interface accel_get_device_pointer

  type(profile_t), save :: prof_read, prof_write

  integer, parameter  ::      &
    OPENCL_GPU         = -1,  &
    OPENCL_CPU         = -2,  &
    OPENCL_ACCELERATOR = -3,  &
    OPENCL_DEFAULT     = -4


  integer, parameter  ::      &
    CL_PLAT_INVALID   = -1,   &
    CL_PLAT_AMD       = -2,   &
    CL_PLAT_NVIDIA    = -3,   &
    CL_PLAT_ATI       = -4,   &
    CL_PLAT_INTEL     = -5

  ! a "convenience" public variable
  integer, public :: cl_status

  integer :: buffer_alloc_count
  integer(8) :: allocated_mem
  type(accel_kernel_t), pointer :: head
  type(alloc_cache_t) :: memcache
  
contains

  pure logical function accel_is_enabled() result(enabled)
#ifdef HAVE_ACCEL
    enabled = accel%enabled
#else
    enabled = .false.
#endif
  end function accel_is_enabled

  ! ------------------------------------------

  subroutine accel_init(base_grp, namespace)
    type(mpi_grp_t),     intent(inout) :: base_grp
    type(namespace_t),   intent(in)    :: namespace
    
    logical  :: disable, default, run_benchmark
    integer  :: idevice, iplatform
#ifdef HAVE_OPENCL
    integer  :: device_type
    integer :: cl_status, idev
    integer  :: ndevices, ret_devices, nplatforms, iplat
    character(len=256) :: device_name
    type(cl_platform_id) :: platform_id
    type(cl_program) :: prog
    type(cl_platform_id), allocatable :: allplatforms(:)
    type(cl_device_id), allocatable :: alldevices(:)
    type(profile_t), save :: prof_init
#endif

    PUSH_SUB(accel_init)

    buffer_alloc_count = 0

    !%Variable DisableAccel    
    !%Type logical
    !%Default yes
    !%Section Execution::Accel
    !%Description
    !% If Octopus was compiled with OpenCL or CUDA support, it will
    !% try to initialize and use an accelerator device. By setting this
    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
    !%End
    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
#ifdef HAVE_ACCEL
    default = .false.
#else
    default = .true.
#endif
    call parse_variable(namespace, 'DisableAccel', default, disable)
    accel%enabled = .not. disable
    
#ifndef HAVE_ACCEL
    if(accel%enabled) then
      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
      call messages_fatal(1)
    end if
#endif

    if(.not. accel_is_enabled()) then
      POP_SUB(accel_init)
      return
    end if

    !%Variable AccelPlatform
    !%Type integer
    !%Default 0
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL platform that Octopus will
    !% use. You can give an explicit platform number or use one of
    !% the options that select a particular vendor
    !% implementation. Platform 0 is used by default.
    !%
    !% This variable has no effect for CUDA.
    !%Option amd -2
    !% Use the AMD OpenCL platform.
    !%Option nvidia -3
    !% Use the Nvidia OpenCL platform.
    !%Option ati -4
    !% Use the ATI (old AMD) OpenCL platform.
    !%Option intel -5
    !% Use the Intel OpenCL platform.
    !%End
    call parse_variable(namespace, 'AccelPlatform', 0, iplatform)

    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
    
    !%Variable AccelDevice
    !%Type integer
    !%Default gpu
    !%Section Execution::Accel
    !%Description
    !% This variable selects the OpenCL or CUDA accelerator device
    !% that Octopus will use. You can specify one of the options below
    !% or a numerical id to select a specific device.
    !%
    !% Values >= 0 select the device to be used. In case of MPI enabled runs
    !% devices are distributed in a round robin fashion, starting at this value.
    !%Option gpu -1
    !% If available, Octopus will use a GPU.
    !%Option cpu -2
    !% If available, Octopus will use a CPU (only for OpenCL).
    !%Option accelerator -3
    !% If available, Octopus will use an accelerator (only for OpenCL).
    !%Option accel_default -4
    !% Octopus will use the default device specified by the implementation.
    !% implementation.
    !%End
    call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)

    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
    
    if(idevice < OPENCL_DEFAULT) then
      call messages_write('Invalid AccelDevice')
      call messages_fatal()
    end if

    call messages_print_stress(stdout, "GPU acceleration")

#ifdef HAVE_CUDA
    if(idevice<0) idevice = 0
    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
      idevice, base_grp%rank)
#ifdef HAVE_MPI
    write(message(1), '(A, I5.5, A, I5.5)') "Rank ", base_grp%rank, " uses device number ", idevice
    call messages_info(1, all_nodes = .true.)
#endif

    ! no shared mem support in our cuda interface (for the moment)
    accel%shared_mem = .true.

    call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif
    
#ifdef HAVE_OPENCL
    call profiling_in(prof_init, 'CL_INIT')

    call clGetPlatformIDs(nplatforms, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    SAFE_ALLOCATE(allplatforms(1:nplatforms))

    call clGetPlatformIDs(allplatforms, iplat, cl_status)
    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")

    call messages_write('Info: Available CL platforms: ')
    call messages_write(nplatforms)
    call messages_info()

    do iplat = 1, nplatforms

      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)

      if(iplatform < 0) then
        if(iplatform == get_platform_id(device_name)) iplatform = iplat - 1
      end if

      if(iplatform == iplat - 1) then
        call messages_write('    * Platform ')
      else
        call messages_write('      Platform ')
      end if

      call messages_write(iplat - 1)
      call messages_write(' : '//device_name)
      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
      call messages_write(' ('//trim(device_name)//')')
      call messages_info()
    end do

    call messages_info()

    if(iplatform >= nplatforms .or. iplatform < 0) then
      call messages_write('Requested CL platform does not exist')
      if(iplatform > 0) then 
        call messages_write('(platform = ')
        call messages_write(iplatform)
        call messages_write(').')
      end if
      call messages_fatal()
    end if

    platform_id = allplatforms(iplatform + 1)

    SAFE_DEALLOCATE_A(allplatforms)

    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)

    call messages_write('Info: Available CL devices: ')
    call messages_write(ndevices)
    call messages_info()

    SAFE_ALLOCATE(alldevices(1:ndevices))

    ! list all devices

    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)

    do idev = 1, ndevices
      call messages_write('      Device ')
      call messages_write(idev - 1)
      call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
      call messages_write(' : '//device_name)
      call messages_info()
    end do

    select case(idevice)
    case(OPENCL_GPU)
      device_type = CL_DEVICE_TYPE_GPU
    case(OPENCL_CPU)
      device_type = CL_DEVICE_TYPE_CPU
    case(OPENCL_ACCELERATOR)
      device_type = CL_DEVICE_TYPE_ACCELERATOR
    case(OPENCL_DEFAULT)
      device_type = CL_DEVICE_TYPE_DEFAULT
    case default
      device_type = CL_DEVICE_TYPE_ALL
    end select

    ! now get a list of the selected type
    call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)

    if(ret_devices < 1) then
      ! we didnt find a device of the selected type, we ask for the default device
      call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)

      if(ret_devices < 1) then
        ! if this does not work, we ask for all devices
        call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
      end if

      if(ret_devices < 1) then
        call messages_write('Cannot find an OpenCL device')
        call messages_fatal()
      end if
    end if

    ! the number of devices can be smaller
    ndevices = ret_devices

    if(idevice < 0) then
      if(base_grp%size > 1) then
        ! with MPI we have to select the device so multiple GPUs in one
        ! node are correctly distributed
        call select_device(idevice)
      else
        idevice = 0
      end if
    end if

    if(idevice >= ndevices) then
      call messages_write('Requested CL device does not exist (device = ')
      call messages_write(idevice)
      call messages_write(', platform = ')
      call