!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!
#include "global.h"
#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"
#endif
#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
#define HAVE_ACCEL 1
#endif
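! HAVE_ACCEL is defined when exactly one accelerator backend (OpenCL or CUDA) is enabled,
! so the rest of the module can test a single macro instead of both backends.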
#ifdef HAVE_OPENCL
use clblas
#ifdef HAVE_CLFFT
use clfft
#endif
#endif
use loct_oct_m
use messages_oct_m
use mpi_oct_m
use types_oct_m
use parser_oct_m
use profiling_oct_m
use unit_system_oct_m
private
public :: &
accel_context_t, &
accel_device_t, &
accel_mem_t, &
accel_kernel_t, &
accel_t, &
accel_allow_CPU_only, &
accel_init, &
accel_end, &
accel_padded_size, &
accel_kernel_build, &
accel_create_buffer, &
accel_write_buffer, &
accel_read_buffer, &
accel_release_buffer, &
accel_buffer_is_allocated, &
accel_finish, &
accel_set_kernel_arg, &
accel_max_workgroup_size, &
accel_kernel_workgroup_size, &
accel_kernel_run, &
accel_set_buffer_to_zero, &
accel_use_shared_mem, &
clblas_print_error, &
clfft_print_error, &
accel_local_memory_size, &
accel_global_memory_size, &
accel_get_device_pointer, &
daccel_get_pointer_with_offset,&
zaccel_get_pointer_with_offset,&
accel_clean_pointer, &
accel_set_stream, &
accel_synchronize_all_streams
integer, public, parameter :: &
ACCEL_MEM_READ_ONLY = CL_MEM_READ_ONLY, &
ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE, &
ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
#else
integer, public, parameter :: &
ACCEL_MEM_READ_ONLY = 0, &
ACCEL_MEM_READ_WRITE = 1, &
ACCEL_MEM_WRITE_ONLY = 2
#endif
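! These flags select the access mode of a device buffer at creation time. Under OpenCL they
! alias the CL_MEM_* constants; otherwise they are plain integer tags, so code that passes
! them compiles in every configuration.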
type accel_context_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
type(c_ptr) :: cuda_context
#else
integer :: dummy
#endif
end type accel_context_t
type accel_device_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
type(c_ptr) :: cuda_device
#else
integer :: dummy
#endif
end type accel_device_t
type accel_t
! Components are public by default
type(accel_context_t) :: context
type(accel_device_t) :: device
#ifdef HAVE_OPENCL
type(cl_command_queue) :: command_queue
#endif
type(c_ptr) :: module_map
integer(i8) :: local_memory_size
integer(i8) :: global_memory_size
logical :: allow_CPU_only
end type accel_t
type accel_mem_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_mem) :: mem
#else
type(c_ptr) :: mem
#endif
integer(SIZEOF_SIZE_T) :: size = 0

integer :: flags = 0
logical :: allocated = .false.
end type accel_mem_t
type accel_kernel_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_kernel) :: kernel
#endif
#ifdef HAVE_CUDA
type(c_ptr) :: cuda_kernel
type(c_ptr) :: cuda_module
type(c_ptr) :: arguments
integer(i8) :: cuda_shared_mem
#endif
logical :: initialized = .false.
type(accel_kernel_t), pointer :: next
integer :: arg_count
end type accel_kernel_t
type(accel_t), public :: accel
! the kernels
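! Naming convention: a leading "d" denotes the real (double-precision) variant of a kernel
! and a leading "z" the complex variant, e.g. dpack/zpack or daxpy/zaxpy.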
type(accel_kernel_t), public, target, save :: kernel_vpsi
type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
type(accel_kernel_t), public, target, save :: kernel_daxpy
type(accel_kernel_t), public, target, save :: kernel_zaxpy
type(accel_kernel_t), public, target, save :: kernel_copy
type(accel_kernel_t), public, target, save :: dpack
type(accel_kernel_t), public, target, save :: zpack
type(accel_kernel_t), public, target, save :: dunpack
type(accel_kernel_t), public, target, save :: zunpack
type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
type(accel_kernel_t), public, target, save :: kernel_density_real
type(accel_kernel_t), public, target, save :: kernel_density_complex
type(accel_kernel_t), public, target, save :: kernel_density_spinors
type(accel_kernel_t), public, target, save :: kernel_phase
type(accel_kernel_t), public, target, save :: kernel_phase_spiral
type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
type(accel_kernel_t), public, target, save :: dzmul
type(accel_kernel_t), public, target, save :: zzmul

type(accel_kernel_t), public, target, save :: set_one
type(accel_kernel_t), save :: set_zero
interface accel_padded_size
module procedure accel_padded_size_i8, accel_padded_size_i4
end interface accel_padded_size
interface accel_create_buffer
module procedure accel_create_buffer_4, accel_create_buffer_8
end interface accel_create_buffer
interface accel_kernel_run
module procedure accel_kernel_run_4, accel_kernel_run_8
end interface accel_kernel_run
interface accel_set_buffer_to_zero
module procedure accel_set_buffer_to_zero_i8, accel_set_buffer_to_zero_i4
end interface accel_set_buffer_to_zero
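! Naming convention for the specific procedures below: the first letter gives the host data
! type (i: default integer, l: 8-byte integer, d: double-precision real, z: double-precision
! complex), the trailing digit the array rank (0 for a scalar), and the "_i4" suffix marks
! variants that take a 4-byte integer size argument.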
interface accel_write_buffer
module procedure iaccel_write_buffer_single, laccel_write_buffer_single, daccel_write_buffer_single, zaccel_write_buffer_single
module procedure iaccel_write_buffer_0, laccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
module procedure iaccel_write_buffer_1, laccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
module procedure iaccel_write_buffer_2, laccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
module procedure iaccel_write_buffer_3, laccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
module procedure iaccel_write_buffer_0_i4, laccel_write_buffer_0_i4, daccel_write_buffer_0_i4, zaccel_write_buffer_0_i4
module procedure iaccel_write_buffer_1_i4, laccel_write_buffer_1_i4, daccel_write_buffer_1_i4, zaccel_write_buffer_1_i4
module procedure iaccel_write_buffer_2_i4, laccel_write_buffer_2_i4, daccel_write_buffer_2_i4, zaccel_write_buffer_2_i4
module procedure iaccel_write_buffer_3_i4, laccel_write_buffer_3_i4, daccel_write_buffer_3_i4, zaccel_write_buffer_3_i4
end interface accel_write_buffer
interface accel_read_buffer
module procedure iaccel_read_buffer_0, laccel_read_buffer_0, daccel_read_buffer_0, zaccel_read_buffer_0
module procedure iaccel_read_buffer_1, laccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
module procedure iaccel_read_buffer_2, laccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
module procedure iaccel_read_buffer_3, laccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
module procedure iaccel_read_buffer_0_i4, laccel_read_buffer_0_i4, daccel_read_buffer_0_i4, zaccel_read_buffer_0_i4
module procedure iaccel_read_buffer_1_i4, laccel_read_buffer_1_i4, daccel_read_buffer_1_i4, zaccel_read_buffer_1_i4
module procedure iaccel_read_buffer_2_i4, laccel_read_buffer_2_i4, daccel_read_buffer_2_i4, zaccel_read_buffer_2_i4
module procedure iaccel_read_buffer_3_i4, laccel_read_buffer_3_i4, daccel_read_buffer_3_i4, zaccel_read_buffer_3_i4
end interface accel_read_buffer
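! Illustrative buffer round trip (a sketch, not code from this module; "np", "psi" and
! "buf" are placeholder names, and TYPE_FLOAT comes from types_oct_m):
!   call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, np)
!   call accel_write_buffer(buf, np, psi)
!   ! ... launch kernels operating on buf ...
!   call accel_read_buffer(buf, np, psi)
!   call accel_release_buffer(buf)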
interface accel_set_kernel_arg
module procedure &
accel_set_kernel_arg_buffer, &
iaccel_set_kernel_arg_data, &
daccel_set_kernel_arg_data, &
zaccel_set_kernel_arg_data, &
accel_set_kernel_arg_local
end interface accel_set_kernel_arg
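! Illustrative launch sequence (a sketch only; the argument order of the copy kernel and the
! size variables are placeholders):
!   wgsize = accel_kernel_workgroup_size(kernel_copy)
!   call accel_set_kernel_arg(kernel_copy, 0, np)
!   call accel_set_kernel_arg(kernel_copy, 1, src_buf)
!   call accel_set_kernel_arg(kernel_copy, 2, dst_buf)
!   ! global size rounded up to a multiple of the workgroup size
!   call accel_kernel_run(kernel_copy, (/wgsize*((np + wgsize - 1)/wgsize)/), (/wgsize/))
!   call accel_finish()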
interface accel_get_device_pointer
module procedure iaccel_get_device_pointer_1, laccel_get_device_pointer_1
module procedure iaccel_get_device_pointer_2, laccel_get_device_pointer_2
module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
module procedure iaccel_get_device_pointer_1l, laccel_get_device_pointer_1l
module procedure iaccel_get_device_pointer_2l, laccel_get_device_pointer_2l
module procedure iaccel_get_device_pointer_3l, laccel_get_device_pointer_3l
module procedure daccel_get_device_pointer_1l, zaccel_get_device_pointer_1l
module procedure daccel_get_device_pointer_2l, zaccel_get_device_pointer_2l
module procedure daccel_get_device_pointer_3l, zaccel_get_device_pointer_3l
end interface accel_get_device_pointer
type(profile_t), save :: prof_read, prof_write
integer, parameter :: &
OPENCL_GPU = -1, &
OPENCL_CPU = -2, &
OPENCL_ACCELERATOR = -3, &
OPENCL_DEFAULT = -4
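! These negative sentinels correspond to the gpu/cpu/accelerator/accel_default options of
! the AccelDevice input variable documented below.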
integer, parameter :: &
CL_PLAT_INVALID = -1, &
CL_PLAT_AMD = -2, &
CL_PLAT_NVIDIA = -3, &
CL_PLAT_ATI = -4, &
CL_PLAT_INTEL = -5
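! The CL_PLAT_* values tag known OpenCL platform vendors; all but CL_PLAT_INVALID correspond
! to the vendor options of the AccelPlatform input variable.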
! a "convenience" public variable
integer, public :: cl_status

integer :: buffer_alloc_count
type(accel_kernel_t), pointer :: head
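! head of the linked list formed by the "next" pointers of accel_kernel_t, tracking every
! kernel that has been initialized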
contains
pure logical function accel_is_enabled() result(enabled)
#ifdef HAVE_ACCEL
enabled = accel%enabled
#else
enabled = .false.
#endif
end function accel_is_enabled
! ------------------------------------------
pure logical function accel_allow_CPU_only() result(allow)
#ifdef HAVE_ACCEL
allow = accel%allow_CPU_only
#else
allow = .true.
#endif
end function accel_allow_CPU_only
! ------------------------------------------
subroutine accel_init(base_grp, namespace)
type(mpi_grp_t), intent(inout) :: base_grp
type(namespace_t), intent(in) :: namespace
logical :: disable, default, run_benchmark
integer :: idevice, iplatform
#ifdef HAVE_OPENCL
integer :: ndevices, ret_devices, nplatforms, iplat
type(cl_platform_id) :: platform_id
type(cl_program) :: prog
type(cl_platform_id), allocatable :: allplatforms(:)
type(cl_device_id), allocatable :: alldevices(:)
type(profile_t), save :: prof_init
#endif
#ifdef HAVE_CUDA
#ifdef HAVE_MPI
character(len=256) :: sys_name
#endif

#endif
buffer_alloc_count = 0

!%Variable DisableAccel
!%Type logical
!%Default yes
!%Section Execution::Accel
!%Description
!% If Octopus was compiled with OpenCL or CUDA support, it will
!% try to initialize and use an accelerator device. By setting this
!% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
!%End
call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
default = .false.
call parse_variable(namespace, 'DisableAccel', default, disable)

accel%enabled = .not. disable
#ifndef HAVE_ACCEL
if (accel%enabled) then
message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
call messages_fatal(1)
end if
#endif

if (.not. accel_is_enabled()) then
return
end if
!%Variable AccelPlatform
!%Type integer
!%Default 0
!%Section Execution::Accel
!%Description
!% This variable selects the OpenCL platform that Octopus will
!% use. You can give an explicit platform number or use one of
!% the options that select a particular vendor
!% implementation. Platform 0 is used by default.
!%
!% This variable has no effect for CUDA.
!%Option amd -2
!% Use the AMD OpenCL platform.
!%Option nvidia -3
!% Use the Nvidia OpenCL platform.
!%Option ati -4
!% Use the ATI (old AMD) OpenCL platform.
!%Option intel -5
!% Use the Intel OpenCL platform.
!%End
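!
! An illustrative input line (example value only): AccelPlatform = nvidia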
call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
!%Variable AccelDevice
!%Type integer
!%Section Execution::Accel
!%Description
!% This variable selects the OpenCL or CUDA accelerator device
!% that Octopus will use. You can specify one of the options below
!% or a numerical id to select a specific device.
!%
!% Values >= 0 select the device to be used. For MPI-enabled runs,
!% devices are distributed in a round-robin fashion, starting at this value.
!%Option gpu -1
!% If available, Octopus will use a GPU.
!%Option cpu -2
!% If available, Octopus will use a CPU (only for OpenCL).
!%Option accelerator -3
!% If available, Octopus will use an accelerator (only for OpenCL).
!%Option accel_default -4
!% Octopus will use the default device specified by the implementation.
!%End
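!
! Illustrative input lines (example values only): "AccelDevice = gpu" selects a GPU, while
! "AccelDevice = 1" starts the round-robin distribution over MPI ranks at device 1.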
call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)
call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')

if (idevice < OPENCL_DEFAULT) then
call messages_write('Invalid AccelDevice')
call messages_fatal()
end if
call messages_print_stress(msg="GPU acceleration", namespace=namespace)

#ifdef HAVE_CUDA
if (idevice < 0) idevice = 0
call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
idevice, base_grp%rank)
#ifdef HAVE_MPI
call loct_sysname(sys_name)
write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
" on ", trim(sys_name)
call messages_info(1, all_nodes = .true.)
#endif
! no shared mem support in our cuda interface (for the moment)
call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif
#ifdef HAVE_OPENCL
call profiling_in(prof_init, 'CL_INIT')
call clGetPlatformIDs(nplatforms, cl_status)

if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
SAFE_ALLOCATE(allplatforms(1:nplatforms))
call clGetPlatformIDs(allplatforms, iplat, cl_status)

if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
call messages_write('Info: Available CL platforms: ')
call messages_write(nplatforms)
call messages_info()
do iplat = 1, nplatforms
call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)

if (iplatform < 0) then
if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
end if

if (iplatform == iplat - 1) then
call messages_write(' * Platform ')
else
call messages_write(' Platform ')
end if
call messages_write(iplat - 1)
call messages_write(' : '//device_name)
call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
call messages_write(' ('//trim(device_name)//')')
call messages_info()
end do
call messages_info()

if (iplatform >= nplatforms .or. iplatform < 0) then
call messages_write('Requested CL platform does not exist')

if (iplatform > 0) then
call messages_write('(platform = ')
call messages_write(iplatform)
call messages_write(').')
end if
call messages_fatal()
end if
platform_id = allplatforms(iplatform + 1)
SAFE_DEALLOCATE_A(allplatforms)
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)
call messages_write('Info: Available CL devices: ')
call messages_write(ndevices)
call messages_info()
SAFE_ALLOCATE(alldevices(1:ndevices))
! list all devices
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
do idev = 1, ndevices
call messages_write(' Device ')
call messages_write(idev - 1)
call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
call messages_write(' : '//device_name)
call messages_info()
end do

select case (idevice)
case (OPENCL_GPU)
device_type = CL_DEVICE_TYPE_GPU

case (OPENCL_CPU)
device_type = CL_DEVICE_TYPE_CPU

case (OPENCL_ACCELERATOR)
device_type = CL_DEVICE_TYPE_ACCELERATOR

case (OPENCL_DEFAULT)
device_type = CL_DEVICE_TYPE_DEFAULT
case default
device_type = CL_DEVICE_TYPE_ALL
end select
! now get a list of the selected type
call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)

if (ret_devices < 1) then
! we didn't find a device of the selected type, so we ask for the default device
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)

if (ret_devices < 1) then
! if this does not work, we ask for all devices
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
end if

if (ret_devices < 1) then
call messages_write('Cannot find an OpenCL device')
call messages_fatal()
end if
end if
! the number of devices can be smaller
ndevices = ret_devices

if (idevice < 0) then
if (base_grp%size > 1) then
! with MPI we have to select the device so multiple GPUs in one
! node are correctly distributed
call select_device(idevice)
else
idevice = 0
end if
end if

if (idevice >= ndevices) then
call messages_write('Requested CL device does not exist (device = ')
call messages_write(idevice)
call messages_write(', platform = ')
call messages_write(iplatform)
call messages_write(').')
call messages_fatal()
end if
accel%device%cl_device = alldevices(idevice + 1)
! create the context
accel%context%cl_context = clCreateContext(platform_id, accel%device%cl_device, cl_status)

if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateContext")
SAFE_DEALLOCATE_A(alldevices)
accel%command_queue = clCreateCommandQueue(accel%context%cl_context, accel%device%cl_device, &
CL_QUEUE_PROFILING_ENABLE, cl_status)

if (cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateCommandQueue")
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, device_type, cl_status)

select case (device_type)
case (CL_DEVICE_TYPE_GPU)

case (CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ACCELERATOR)
case default
end select
#ifdef HAVE_CLBLAS
call clblasSetup(cl_status)

if (cl_status /= clblasSuccess) call clblas_print_error(cl_status, 'clblasSetup')
#endif
#ifdef HAVE_CLFFT
call clfftSetup(cl_status)

if (cl_status /= CLFFT_SUCCESS) call clfft_print_error(cl_status, 'clfftSetup')
#endif
call profiling_out(prof_init)
#endif
! Get some device information that we will need later
! total memory
#ifdef HAVE_OPENCL
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_SIZE, accel%global_memory_size, cl_status)
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_LOCAL_MEM_SIZE, accel%local_memory_size, cl_status)
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, accel%max_workgroup_size, cl_status)
#endif
#ifdef HAVE_CUDA
call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)

#endif
if (mpi_grp_is_root(base_grp)) call device_info()
! initialize the cache used to speed up allocations
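! the cache is limited to one quarter of the device's global memory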
call alloc_cache_init(memcache, nint(CNST(0.25)*accel%global_memory_size, 8))
! now initialize the kernels
call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")

call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", flags = '-lineinfo -DRTYPE_DOUBLE')
call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", flags = '-lineinfo -DRTYPE_COMPLEX')
call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", flags = '-lineinfo -DRTYPE_DOUBLE')
call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", flags = '-lineinfo -DRTYPE_COMPLEX')
call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp", flags = '-lineinfo')
call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp", flags = '-lineinfo')
call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
!%Variable AccelBenchmark
!%Type logical
!%Default no
!%Section Execution::Accel
!%Description
!% If this variable is set to yes, Octopus will run some
!% routines to benchmark the performance of the accelerator device.
!%End
call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')

if (run_benchmark) then
call opencl_check_bandwidth()
end if
!%Variable CudaAwareMPI
!%Type logical
!%Section Execution::Accel
!%Description
!% If Octopus was compiled with CUDA support and MPI support and if the MPI
!% implementation is CUDA-aware (i.e., it supports communication using device pointers),
!% this switch can be set to true to use the CUDA-aware MPI features. The advantage
!% of this approach is that it can do, e.g., peer-to-peer copies between devices without
!% going through the host memory.
!% The default is false, except when the configure switch --enable-cudampi is set, in which
!% case this variable is set to true.
!%End
#ifdef HAVE_CUDA_MPI
default = .true.
#else
default = .false.
#endif
call parse_variable(namespace, 'CudaAwareMPI', default, accel%cuda_mpi)

if (accel%cuda_mpi) then
call messages_write("Using CUDA-aware MPI.")
call messages_info()
end if
!%Variable AllowCPUonly
!%Type logical
!%Section Execution::Accel
!%Description
!% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
!% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows
!% code execution in these cases as well.
!%End
#if defined (HAVE_ACCEL)
default = .false.
#else
default = .true.
#endif
call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)

call messages_print_stress(namespace=namespace)
contains
#ifdef HAVE_OPENCL
subroutine select_device(idevice)
integer, intent(inout) :: idevice
integer :: irank
character(len=256) :: device_name
PUSH_SUB(accel_init.select_device)
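! Round-robin mapping of ranks to devices: with 4 MPI ranks and 2 devices, ranks 0 and 2
! get device 0, while ranks 1 and 3 get device 1.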
idevice = mod(base_grp%rank, ndevices)
call messages_write('Info: CL device distribution:')
call messages_info()
do irank = 0, base_grp%size - 1

if (irank == base_grp%rank) then
call clGetDeviceInfo(alldevices(idevice + 1), CL_DEVICE_NAME, device_name, cl_status)
call messages_write(' MPI node ')
call messages_write(base_grp%rank)
call messages_write(' -> CL device ')
call messages_write(idevice)
call messages_write(' : '//device_name)
call messages_info(all_nodes = .true.)
end if
end do
POP_SUB(accel_init.select_device)
end subroutine select_device

#endif
subroutine device_info()
#ifdef HAVE_OPENCL
integer(i8) :: val
#endif
#ifdef HAVE_CUDA
integer :: version
#endif
integer :: major, minor
character(len=256) :: val_str
PUSH_SUB(accel_init.device_info)
call messages_new_line()
call messages_write('Selected device:')
call messages_new_line()
#ifdef HAVE_OPENCL
call messages_write(' Framework : OpenCL')
#endif
#ifdef HAVE_CUDA
call messages_write(' Framework : CUDA')
#endif
call messages_info()
#ifdef HAVE_CUDA
call messages_write(' Device type : GPU', new_line = .true.)
call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
#endif
#ifdef HAVE_OPENCL
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, val, cl_status)
call messages_write(' Device type :')

select case (int(val, 4))
case (CL_DEVICE_TYPE_GPU)
call messages_write(' GPU')

case (CL_DEVICE_TYPE_CPU)
call messages_write(' CPU')

case (CL_DEVICE_TYPE_ACCELERATOR)
call messages_write(' accelerator')
end select
call messages_new_line()
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_VENDOR, val_str, cl_status)
call messages_write(' Device vendor : '//trim(val_str))
call messages_new_line()
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_NAME, val_str, cl_status)
#endif
#ifdef HAVE_CUDA
call cuda_device_name(accel%device%cuda_device, val_str)
#endif
call messages_write(' Device name : '//trim(val_str))
call messages_new_line()
#ifdef HAVE_CUDA
call cuda_device_capability(accel%device%cuda_device, major, minor)
call messages_write(' Cuda capabilities :')
call messages_write(major, fmt = '(i2)')
call messages_write('.')
call messages_write(minor, fmt = '(i1)')
call messages_new_line()
#endif
! VERSION
#ifdef HAVE_OPENCL
call clGetDeviceInfo(accel%device%cl_device, CL_DRIVER_VERSION, val_str, cl_status)
call messages_write(' Driver version : '//trim(val_str))
#endif
#ifdef HAVE_OPENCL
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_COMPUTE_UNITS, val, cl_status)
call messages_write(' Compute units :')
call messages_write(val)
call messages_new_line()
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, val, cl_status)
call messages_write(' Clock frequency :')
call messages_write(val)
call messages_write(' GHz')
call messages_new_line()
#endif
call messages_write(' Device memory :')
call messages_write(accel%global_memory_size, units=unit_megabytes)
call messages_new_line()
call messages_write(' Local/shared memory :')
call messages_write(accel%local_memory_size, units=unit_kilobytes)
call messages_new_line()
#ifdef HAVE_OPENCL
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, val, cl_status)
call messages_write(' Max alloc size :')
call messages_write(val, units = unit_megabytes)
call messages_new_line()
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, val, cl_status)
call messages_write(' Device cache :')
call messages_write(val, units = unit_kilobytes)
call messages_new_line()
call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, val, cl_status)
call messages_write(' Constant memory :')
call messages_write(val, units = unit_kilobytes)
call messages_new_line()
#endif
call messages_write(' Max. group/block size :')
call messages_write(accel%max_workgroup_size)
call messages_new_line()
#ifdef HAVE_OPENCL
call messages_write(' Extension cl_khr_fp64 :')
call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
call messages_new_line()
call messages_write(' Extension cl_amd_fp64 :')
call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
call messages_new_line()
#endif
call messages_info()
POP_SUB(accel_init.device_info)
end subroutine device_info
end subroutine accel_init
! ------------------------------------------

#ifdef HAVE_OPENCL
integer function get_platform_id(platform_name) result(platform_id)
character(len=*), intent(in) :: platform_name
platform_id = CL_PLAT_INVALID

if (index(platform_name, 'AMD') > 0) platform_id = CL_PLAT_AMD
if (index(platform_name, 'ATI') > 0) platform_id = CL_PLAT_ATI
if (index(platform_name, 'NVIDIA') > 0) platform_id = CL_PLAT_NVIDIA
if (index(platform_name, 'Intel') > 0) platform_id = CL_PLAT_INTEL
end function get_platform_id

#endif
! ------------------------------------------

subroutine accel_end(namespace)
type(namespace_t), intent(in) :: namespace
integer :: ierr
real(r8) :: volume_hits, volume_misses
integer(i8) :: hits, misses
logical :: found
type(accel_mem_t) :: tmp

if (accel_is_enabled()) then
do
call alloc_cache_get(memcache, ALLOC_CACHE_ANY_SIZE, found, tmp%mem)

if (.not. found) exit
#ifdef HAVE_OPENCL
call clReleaseMemObject(tmp%mem, ierr)

if (ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
#ifdef HAVE_CUDA
call cuda_mem_free(tmp%mem)
#endif
end do
call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

call messages_print_stress(msg="Acceleration-device allocation cache", namespace=namespace)
call messages_new_line()
call messages_write(' Number of allocations =')
call messages_write(hits + misses, new_line = .true.)
call messages_write(' Volume of allocations =')
call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
new_line = .true.)
call messages_write(' Hit ratio =')
call messages_write(hits/TOFLOAT(hits + misses)*100, fmt='(f6.1)', align_left = .true.)
call messages_write('%', new_line = .true.)
call messages_write(' Volume hit ratio =')
call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
call messages_write('%', new_line = .true.)

call messages_print_stress(namespace=namespace)
end if
#ifdef HAVE_CLBLAS
call clblasTearDown()
#endif
#ifdef HAVE_CLFFT

#endif
if (accel_is_enabled()) then
if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
end if
#ifdef HAVE_OPENCL
call clReleaseCommandQueue(accel%command_queue, ierr)

if (ierr /= CL_SUCCESS) call opencl_print_error(ierr, "ReleaseCommandQueue")
call clReleaseContext(accel%context%cl_context, cl_status)

#endif
if (buffer_alloc_count /= 0) then
call messages_write(TOFLOAT(allocated_mem), fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
call messages_write(' in ')
call messages_write(buffer_alloc_count)
call messages_write(' buffers were not deallocated.')