!!
!! This program is free software; you can redistribute it and/or modify
!! it under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 2, or (at your option)
!! any later version.
!!
!! This program is distributed in the hope that it will be useful,
!! but WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!! GNU General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with this program; if not, write to the Free Software
!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
!! 02110-1301, USA.
!!
#include "global.h"
#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
#error "Cannot compile with OpenCL and Cuda support at the same time"
#endif
#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
#define HAVE_ACCEL 1
#endif
module accel_oct_m
#ifdef HAVE_OPENCL
use cl
#ifdef HAVE_CLBLAS
use clblas
#endif
#ifdef HAVE_CLFFT
use clfft
#endif
#endif
#ifdef HAVE_CUDA
use cuda_oct_m
#endif
use iso_c_binding
use loct_oct_m
use messages_oct_m
use mpi_oct_m
use namespace_oct_m
use types_oct_m
use parser_oct_m
use profiling_oct_m
use unit_system_oct_m
implicit none
private
public :: &
accel_context_t, &
accel_device_t, &
accel_mem_t, &
accel_kernel_t, &
accel_t, &
accel_is_enabled, &
accel_init, &
accel_end, &
accel_padded_size, &
accel_mem_nullify, &
accel_kernel_build, &
accel_create_buffer, &
accel_write_buffer, &
accel_read_buffer, &
accel_release_buffer, &
accel_buffer_is_allocated, &
accel_finish, &
accel_set_kernel_arg, &
accel_max_workgroup_size, &
accel_kernel_workgroup_size, &
accel_kernel_run, &
accel_set_buffer_to_zero, &
accel_use_shared_mem, &
clblas_print_error, &
clfft_print_error, &
accel_local_memory_size, &
accel_global_memory_size, &
accel_get_device_pointer, &
accel_set_stream, &
accel_synchronize_all_streams
integer, public, parameter :: &
ACCEL_MEM_READ_ONLY = CL_MEM_READ_ONLY, &
ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE, &
ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
#else
integer, public, parameter :: &
ACCEL_MEM_READ_ONLY = 0, &
ACCEL_MEM_READ_WRITE = 1, &
ACCEL_MEM_WRITE_ONLY = 2
#endif
type accel_context_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_context) :: cl_context
#elif defined(HAVE_CUDA)
type(c_ptr) :: cuda_context
#else
integer :: dummy
#endif
end type accel_context_t
type accel_device_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_device_id) :: cl_device
#elif defined(HAVE_CUDA)
type(c_ptr) :: cuda_device
#else
integer :: dummy
#endif
end type accel_device_t
type accel_t
! Components are public by default
type(accel_context_t) :: context
type(accel_device_t) :: device
#ifdef HAVE_OPENCL
type(cl_command_queue) :: command_queue
#endif
#ifdef HAVE_CUDA
! CUDA stream and cuBLAS handle; both are referenced below in accel_init
type(c_ptr) :: cuda_stream
type(c_ptr) :: cublas_handle
#endif
type(c_ptr) :: module_map
integer :: max_workgroup_size
integer(8) :: local_memory_size
integer(8) :: global_memory_size
logical :: enabled
logical :: shared_mem
end type accel_t
type accel_mem_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_mem) :: mem
#else
! assumed fallback for the CUDA and stub builds: a raw device pointer
type(c_ptr) :: mem
#endif
integer(SIZEOF_SIZE_T) :: size
type(type_t) :: type
integer :: flags
logical :: allocated
end type accel_mem_t
type accel_kernel_t
! Components are public by default
#ifdef HAVE_OPENCL
type(cl_kernel) :: kernel
#endif
#ifdef HAVE_CUDA
type(c_ptr) :: cuda_kernel
type(c_ptr) :: cuda_module
type(c_ptr) :: arguments
#endif
logical :: initialized = .false.
type(accel_kernel_t), pointer :: next
integer :: arg_count
end type accel_kernel_t
type(accel_t), public :: accel
! the kernels
type(accel_kernel_t), public, target, save :: kernel_vpsi
type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
type(accel_kernel_t), public, target, save :: kernel_daxpy
type(accel_kernel_t), public, target, save :: kernel_zaxpy
type(accel_kernel_t), public, target, save :: kernel_copy
type(accel_kernel_t), public, target, save :: kernel_complex_conj
type(accel_kernel_t), public, target, save :: kernel_complex_conj_combine
type(accel_kernel_t), public, target, save :: dpack
type(accel_kernel_t), public, target, save :: zpack
type(accel_kernel_t), public, target, save :: dunpack
type(accel_kernel_t), public, target, save :: zunpack
type(accel_kernel_t), public, target, save :: kernel_subarray_gather
type(accel_kernel_t), public, target, save :: kernel_density_real
type(accel_kernel_t), public, target, save :: kernel_density_complex
type(accel_kernel_t), public, target, save :: kernel_density_spinors
type(accel_kernel_t), public, target, save :: kernel_phase
type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
type(accel_kernel_t), public, target, save :: dzmul
type(accel_kernel_t), public, target, save :: zzmul
type(accel_kernel_t), public, target, save :: set_one
type(accel_kernel_t), save :: set_zero
interface accel_create_buffer
module procedure accel_create_buffer_4, accel_create_buffer_8
end interface accel_create_buffer
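! The two specific procedures above differ only in the integer kind of the
! size argument (default integer vs. integer(8)), so large buffers can be
! created without overflowing the count. A minimal usage sketch
! (illustrative only; 'buf' and 'np' are hypothetical caller variables):
!
!   type(accel_mem_t) :: buf
!   call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, np)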
interface accel_write_buffer
module procedure iaccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
module procedure iaccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
module procedure iaccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
module procedure iaccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
end interface accel_write_buffer
interface accel_read_buffer
module procedure iaccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
module procedure iaccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
module procedure iaccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
end interface accel_read_buffer
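! Host/device transfer sketch for the two generic interfaces above: the
! i/d/z prefixes of the specific procedures select integer, real and
! complex data, and the _1/_2/_3 suffixes the array rank (the _0 write
! variants cover scalars). Illustrative only; 'buf', 'np' and 'ff' are
! hypothetical:
!
!   call accel_write_buffer(buf, np, ff)  ! copy host array to the device
!   call accel_read_buffer(buf, np, ff)   ! copy device data back to host
!   call accel_release_buffer(buf)        ! free the device allocation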
interface accel_set_kernel_arg
module procedure &
accel_set_kernel_arg_buffer, &
iaccel_set_kernel_arg_data, &
daccel_set_kernel_arg_data, &
zaccel_set_kernel_arg_data, &
accel_set_kernel_arg_local
end interface accel_set_kernel_arg
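! Typical kernel-launch sequence built from the public entry points of this
! module (a sketch, not taken verbatim from a caller; 'src_buf', 'dst_buf',
! 'globalsize' and 'localsize' are hypothetical, kernel_copy is declared above):
!
!   call accel_set_kernel_arg(kernel_copy, 0, src_buf)
!   call accel_set_kernel_arg(kernel_copy, 1, dst_buf)
!   call accel_kernel_run(kernel_copy, (/globalsize/), (/localsize/))
!   call accel_finish()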
interface accel_get_device_pointer
module procedure iaccel_get_device_pointer_1
module procedure iaccel_get_device_pointer_2
module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
end interface accel_get_device_pointer
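! Sketch of mapping a device buffer to a Fortran array pointer via the
! generic interface above (the exact argument order is an assumption here;
! 'ptr', 'buf' and 'np' are hypothetical, the rank-1 real variant is shown):
!
!   FLOAT, pointer :: ptr(:)
!   call accel_get_device_pointer(ptr, buf, (/np/))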
type(profile_t), save :: prof_read, prof_write
integer, parameter :: &
OPENCL_GPU = -1, &
OPENCL_CPU = -2, &
OPENCL_ACCELERATOR = -3, &
OPENCL_DEFAULT = -4
integer, parameter :: &
CL_PLAT_INVALID = -1, &
CL_PLAT_AMD = -2, &
CL_PLAT_NVIDIA = -3, &
CL_PLAT_ATI = -4, &
CL_PLAT_INTEL = -5
! a "convenience" public variable
integer, public :: cl_status
integer :: buffer_alloc_count
integer(8) :: allocated_mem
type(accel_kernel_t), pointer :: head
contains
pure logical function accel_is_enabled() result(enabled)
#ifdef HAVE_ACCEL
enabled = accel%enabled
#else
enabled = .false.
#endif
end function accel_is_enabled
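! Callers use accel_is_enabled() as a cheap runtime guard to choose between
! the device and the host code path, e.g. (sketch; 'buf', 'np' and 'ff' are
! hypothetical):
!
!   if (accel_is_enabled()) then
!     call accel_write_buffer(buf, np, ff)
!   end if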
! ------------------------------------------
subroutine accel_init(base_grp, namespace)
type(mpi_grp_t), intent(inout) :: base_grp
type(namespace_t), intent(in) :: namespace
logical :: disable, default, run_benchmark
integer :: idevice, iplatform
#ifdef HAVE_OPENCL
integer :: ndevices, ret_devices, nplatforms, iplat, idev
integer(8) :: device_type   ! assumed 64-bit, matching the OpenCL cl_device_type bitfield
character(len=256) :: device_name
type(cl_platform_id) :: platform_id
type(cl_program) :: prog
type(cl_platform_id), allocatable :: allplatforms(:)
type(cl_device_id), allocatable :: alldevices(:)
type(profile_t), save :: prof_init
#endif

PUSH_SUB(accel_init)

buffer_alloc_count = 0

!%Variable DisableAccel
!%Type logical
!%Default yes
!%Section Execution::Accel
!%Description
!% If Octopus was compiled with OpenCL or CUDA support, it will
!% try to initialize and use an accelerator device. By setting this
!% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if one is available.
!%End
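! Example input-file line forcing a CPU-only run (illustrative user input):
!   DisableAccel = yes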
call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
#ifdef HAVE_ACCEL
default = .false.
#else
default = .true.
#endif
call parse_variable(namespace, 'DisableAccel', default, disable)
accel%enabled = .not. disable

#ifndef HAVE_ACCEL
if (accel%enabled) then
  message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
  call messages_fatal(1)
end if
#endif

if (.not. accel_is_enabled()) then
  POP_SUB(accel_init)
  return
end if
!%Variable AccelPlatform
!%Type integer
!%Default 0
!%Section Execution::Accel
!%Description
!% This variable selects the OpenCL platform that Octopus will
!% use. You can give an explicit platform number or use one of
!% the options that select a particular vendor
!% implementation. Platform 0 is used by default.
!%
!% This variable has no effect for CUDA.
!%Option amd -2
!% Use the AMD OpenCL platform.
!%Option nvidia -3
!% Use the Nvidia OpenCL platform.
!%Option ati -4
!% Use the ATI (old AMD) OpenCL platform.
!%Option intel -5
!% Use the Intel OpenCL platform.
!%End
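! Example input-file lines (illustrative); either a vendor option or an
! explicit platform index can be given:
!   AccelPlatform = nvidia
!   AccelPlatform = 0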
call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
!%Variable AccelDevice
!%Type integer
!%Default gpu
!%Section Execution::Accel
!%Description
!% This variable selects the OpenCL or CUDA accelerator device
!% that Octopus will use. You can specify one of the options below
!% or a numerical id to select a specific device.
!%
!% Values >= 0 select the device to be used. In case of MPI enabled runs
!% devices are distributed in a round robin fashion, starting at this value.
!%Option gpu -1
!% If available, Octopus will use a GPU.
!%Option cpu -2
!% If available, Octopus will use a CPU (only for OpenCL).
!%Option accelerator -3
!% If available, Octopus will use an accelerator (only for OpenCL).
!%Option accel_default -4
!% Octopus will use the default device specified by the implementation.
!%End
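! Example input-file line (illustrative); in an MPI run the devices are then
! assigned to ranks round-robin starting from this index, as described above:
!   AccelDevice = 0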
call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)
call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
if (idevice < OPENCL_DEFAULT) then
  call messages_write('Invalid AccelDevice')
  call messages_fatal()
end if

call messages_print_stress(stdout, "GPU acceleration")

#ifdef HAVE_CUDA
if (idevice < 0) idevice = 0
call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
idevice, base_grp%rank)
#ifdef HAVE_MPI
write(message(1), '(A, I5.5, A, I5.5)') "Rank ", base_grp%rank, " uses device number ", idevice
call messages_info(1, all_nodes = .true.)
#endif
! no shared mem support in our cuda interface (for the moment)
call cublas_init(accel%cublas_handle, accel%cuda_stream)
#endif
#ifdef HAVE_OPENCL
call profiling_in(prof_init, 'CL_INIT')
call clGetPlatformIDs(nplatforms, cl_status)
if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
SAFE_ALLOCATE(allplatforms(1:nplatforms))
call clGetPlatformIDs(allplatforms, iplat, cl_status)
if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
call messages_write('Info: Available CL platforms: ')
call messages_write(nplatforms)
call messages_info()
do iplat = 1, nplatforms
call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)
if(iplatform < 0) then
if(iplatform == get_platform_id(device_name)) iplatform = iplat - 1
end if
if(iplatform == iplat - 1) then
call messages_write(' * Platform ')
else
call messages_write(' Platform ')
end if
call messages_write(iplat - 1)
call messages_write(' : '//device_name)
call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
call messages_write(' ('//trim(device_name)//')')
call messages_info()
end do
call messages_info()
if(iplatform >= nplatforms .or. iplatform < 0) then
call messages_write('Requested CL platform does not exist')
if(iplatform > 0) then
call messages_write('(platform = ')
call messages_write(iplatform)
call messages_write(').')
end if
call messages_fatal()
end if
platform_id = allplatforms(iplatform + 1)
SAFE_DEALLOCATE_A(allplatforms)
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)
call messages_write('Info: Available CL devices: ')
call messages_write(ndevices)
call messages_info()
SAFE_ALLOCATE(alldevices(1:ndevices))
! list all devices
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
do idev = 1, ndevices
call messages_write(' Device ')
call messages_write(idev - 1)
call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
call messages_write(' : '//device_name)
call messages_info()
end do
select case(idevice)
case(OPENCL_GPU)
device_type = CL_DEVICE_TYPE_GPU
case(OPENCL_CPU)
device_type = CL_DEVICE_TYPE_CPU
case(OPENCL_ACCELERATOR)
device_type = CL_DEVICE_TYPE_ACCELERATOR
case(OPENCL_DEFAULT)
device_type = CL_DEVICE_TYPE_DEFAULT
case default
device_type = CL_DEVICE_TYPE_ALL
end select
! now get a list of the selected type
call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)
if(ret_devices < 1) then
! we didn't find a device of the selected type, so we ask for the default device
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)
if(ret_devices < 1) then
! if this does not work, we ask for all devices
call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
end if
if(ret_devices < 1) then
call messages_write('Cannot find an OpenCL device')
call messages_fatal()
end if
end if
! the number of devices actually returned can be smaller than the number requested
ndevices = ret_devices
if(idevice < 0) then
if(base_grp%size > 1) then
! with MPI we have to select the device so multiple GPUs in one
! node are correctly distributed
call select_device(idevice)
else
idevice = 0
end if
end if
if(idevice >= ndevices) then
call messages_write('Requested CL device does not exist (device = ')
call messages_write(idevice)
call messages_write(', platform = ')
call messages_write(iplatform)
call messages_write(').')
call messages_fatal()
end if