Newer
Older
integer :: ierr
logical :: put
integer(8) :: fsize
PUSH_SUB(accel_release_buffer)
if(this%size > 0) then
fsize = int(this%size, 8)*types_get_size(this%type)
call alloc_cache_put(memcache, fsize, this%mem, put)
if(.not. put) then
#ifdef HAVE_OPENCL
call clReleaseMemObject(this%mem, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
INCR(buffer_alloc_count, -1)
end if
this%size = 0
POP_SUB(accel_release_buffer)
end subroutine accel_release_buffer
! ------------------------------------------
logical pure function accel_buffer_is_allocated(this) result(allocated)
type(accel_mem_t), intent(in) :: this
end function accel_buffer_is_allocated
! ------------------------------------------
integer(SIZEOF_SIZE_T) pure function opencl_get_buffer_size(this) result(size)
type(accel_mem_t), intent(in) :: this
size = this%size
end function opencl_get_buffer_size
! -----------------------------------------
type(type_t) pure function opencl_get_buffer_type(this) result(type)
type(accel_mem_t), intent(in) :: this
type = this%type
end function opencl_get_buffer_type
! -----------------------------------------
integer :: ierr
! no push_sub, called too frequently
#ifdef HAVE_OPENCL
call clFinish(accel%command_queue, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, 'clFinish')
#endif
#ifdef HAVE_CUDA
call cuda_context_synchronize()
#endif
! ------------------------------------------
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
type(accel_kernel_t), intent(inout) :: kernel
integer, intent(in) :: narg
type(accel_mem_t), intent(in) :: buffer
ASSERT(accel_buffer_is_allocated(buffer))
! no push_sub, called too frequently
#ifdef HAVE_OPENCL
call clSetKernelArg(kernel%kernel, narg, buffer%mem, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clSetKernelArg_buf")
#endif
call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
end subroutine accel_set_kernel_arg_buffer
! ------------------------------------------
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
type(accel_kernel_t), intent(inout) :: kernel
integer, intent(in) :: narg
type(type_t), intent(in) :: type
integer, intent(in) :: size
integer(8) :: size_in_bytes
PUSH_SUB(accel_set_kernel_arg_local)
size_in_bytes = int(size, 8)*types_get_size(type)
if(size_in_bytes > accel%local_memory_size) then
write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", dble(size_in_bytes)/1024.0, " Kb"
write(message(2), '(a,f12.6,a)') " available local memory: ", dble(accel%local_memory_size)/1024.0, " Kb"
call messages_fatal(2)
else if(size_in_bytes <= 0) then
write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
call messages_fatal(1)
end if
#ifdef HAVE_CUDA
kernel%cuda_shared_mem = size_in_bytes
#endif
#ifdef HAVE_OPENCL
call clSetKernelArgLocal(kernel%kernel, narg, size_in_bytes, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "set_kernel_arg_local")
#endif
POP_SUB(accel_set_kernel_arg_local)
end subroutine accel_set_kernel_arg_local
! ------------------------------------------
subroutine accel_kernel_run(kernel, globalsizes, localsizes)
type(accel_kernel_t), intent(inout) :: kernel
integer, intent(in) :: globalsizes(:)
integer, intent(in) :: localsizes(:)
integer :: dim
#ifdef HAVE_OPENCL
integer :: ierr
#endif
integer(8) :: gsizes(1:3)
integer(8) :: lsizes(1:3)
! no push_sub, called too frequently
! cuda needs all dimensions
gsizes = 1
lsizes = 1
dim = ubound(globalsizes, dim = 1)
ASSERT(dim == ubound(localsizes, dim = 1))
! if one size is zero, there is nothing to do
if(any(globalsizes == 0)) return
ASSERT(all(localsizes <= accel_max_workgroup_size()))
ASSERT(all(mod(globalsizes, localsizes) == 0))
gsizes(1:dim) = int(globalsizes(1:dim), 8)
lsizes(1:dim) = int(localsizes(1:dim), 8)
#ifdef HAVE_OPENCL
call clEnqueueNDRangeKernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#endif
gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
ASSERT(gsizes(1) < 2_8**31 - 1_8)
ASSERT(all(gsizes(2:3) <= 65535_8))
call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
kernel%cuda_shared_mem = 0
end subroutine accel_kernel_run
! -----------------------------------------------
integer pure function accel_max_workgroup_size() result(max_workgroup_size)
max_workgroup_size = accel%max_workgroup_size
end function accel_max_workgroup_size
! -----------------------------------------------
integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
type(accel_kernel_t), intent(inout) :: kernel
integer(8) :: workgroup_size8
#ifdef HAVE_OPENCL
integer :: ierr
#endif
#ifdef HAVE_OPENCL
call clGetKernelWorkGroupInfo(kernel%kernel, accel%device%cl_device, CL_KERNEL_WORK_GROUP_SIZE, workgroup_size8, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#endif
workgroup_size = workgroup_size8
#ifdef HAVE_CUDA
workgroup_size = accel%max_workgroup_size
#endif
end function accel_kernel_workgroup_size
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_build_program(prog, filename, flags)
type(cl_program), intent(inout) :: prog
character(len=*), intent(in) :: filename
character(len=*), optional, intent(in) :: flags
character(len = 1000) :: string
character(len = 256) :: share_string
integer :: ierr, ierrlog, iunit, irec, newlen
PUSH_SUB(opencl_build_program)
string = '#include "'//trim(filename)//'"'
if(debug%info) then
call messages_write("Building CL program '"//trim(filename)//"'.")
call messages_info()
end if
prog = clCreateProgramWithSource(accel%context%cl_context, trim(string), ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateProgramWithSource")
! build the compilation flags
string='-w'
! full optimization
string=trim(string)//' -cl-denorms-are-zero'
! The following flag gives an error with the Xeon Phi
! string=trim(string)//' -cl-strict-aliasing'
string=trim(string)//' -cl-mad-enable'
string=trim(string)//' -cl-unsafe-math-optimizations'
string=trim(string)//' -cl-finite-math-only'
string=trim(string)//' -cl-fast-relaxed-math'
share_string='-I'//trim(conf%share)//'/opencl/'
if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
string = trim(string)//' -DEXT_KHR_FP64'
else if(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
string = trim(string)//' -DEXT_AMD_FP64'
else
call messages_write('Octopus requires an OpenCL device with double-precision support.')
call messages_fatal()
end if
if(accel_use_shared_mem()) then
string = trim(string)//' -DSHARED_MEM'
end if
if(present(flags)) then
string = trim(string)//' '//trim(flags)
end if
if(debug%info) then
call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
call messages_write(' '//trim(share_string)//"'.")
call messages_info()
end if
string = trim(string)//' '//trim(share_string)
call clBuildProgram(prog, trim(string), ierr)
call clGetProgramBuildInfo(prog, accel%device%cl_device, CL_PROGRAM_BUILD_LOG, string, ierrlog)
if(ierrlog /= CL_SUCCESS) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
newlen = scan(string, achar(010), back = .true.) - 1
if(newlen >= 0) string = string(1:newlen)
if(len(trim(string)) > 0) write(stderr, '(a)') trim(string)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clBuildProgram")
POP_SUB(opencl_build_program)
end subroutine opencl_build_program
#endif
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_release_program(prog)
type(cl_program), intent(inout) :: prog
integer :: ierr
PUSH_SUB(opencl_release_program)
call clReleaseProgram(prog, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseProgram")
POP_SUB(opencl_release_program)
end subroutine opencl_release_program
#endif
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_release_kernel(prog)
type(cl_kernel), intent(inout) :: prog
integer :: ierr
PUSH_SUB(opencl_release_kernel)
#ifdef HAVE_OPENCL
call clReleaseKernel(prog, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseKernel")
#endif
POP_SUB(opencl_release_kernel)
end subroutine opencl_release_kernel
#endif
#ifdef HAVE_OPENCL
! -----------------------------------------------
subroutine opencl_create_kernel(kernel, prog, name)
type(cl_kernel), intent(inout) :: kernel
type(cl_program), intent(inout) :: prog
character(len=*), intent(in) :: name
integer :: ierr
type(profile_t), save :: prof
PUSH_SUB(opencl_create_kernel)
call profiling_in(prof, "CL_BUILD_KERNEL", exclude = .true.)
#ifdef HAVE_OPENCL
kernel = clCreateKernel(prog, name, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateKernel")
#endif
call profiling_out(prof)
POP_SUB(opencl_create_kernel)
end subroutine opencl_create_kernel
#endif
! ------------------------------------------------
subroutine opencl_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(opencl_print_error)
#ifdef HAVE_OPENCL
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
select case(ierr)
case(CL_SUCCESS); errcode = 'CL_SUCCESS '
case(CL_DEVICE_NOT_FOUND); errcode = 'CL_DEVICE_NOT_FOUND '
case(CL_DEVICE_NOT_AVAILABLE); errcode = 'CL_DEVICE_NOT_AVAILABLE '
case(CL_COMPILER_NOT_AVAILABLE); errcode = 'CL_COMPILER_NOT_AVAILABLE '
case(CL_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
case(CL_OUT_OF_RESOURCES); errcode = 'CL_OUT_OF_RESOURCES '
case(CL_OUT_OF_HOST_MEMORY); errcode = 'CL_OUT_OF_HOST_MEMORY '
case(CL_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
case(CL_MEM_COPY_OVERLAP); errcode = 'CL_MEM_COPY_OVERLAP '
case(CL_IMAGE_FORMAT_MISMATCH); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
case(CL_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
case(CL_BUILD_PROGRAM_FAILURE); errcode = 'CL_BUILD_PROGRAM_FAILURE '
case(CL_MAP_FAILURE); errcode = 'CL_MAP_FAILURE '
case(CL_INVALID_VALUE); errcode = 'CL_INVALID_VALUE '
case(CL_INVALID_DEVICE_TYPE); errcode = 'CL_INVALID_DEVICE_TYPE '
case(CL_INVALID_PLATFORM); errcode = 'CL_INVALID_PLATFORM '
case(CL_INVALID_DEVICE); errcode = 'CL_INVALID_DEVICE '
case(CL_INVALID_CONTEXT); errcode = 'CL_INVALID_CONTEXT '
case(CL_INVALID_QUEUE_PROPERTIES); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
case(CL_INVALID_COMMAND_QUEUE); errcode = 'CL_INVALID_COMMAND_QUEUE '
case(CL_INVALID_HOST_PTR); errcode = 'CL_INVALID_HOST_PTR '
case(CL_INVALID_MEM_OBJECT); errcode = 'CL_INVALID_MEM_OBJECT '
case(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
case(CL_INVALID_IMAGE_SIZE); errcode = 'CL_INVALID_IMAGE_SIZE '
case(CL_INVALID_SAMPLER); errcode = 'CL_INVALID_SAMPLER '
case(CL_INVALID_BINARY); errcode = 'CL_INVALID_BINARY '
case(CL_INVALID_BUILD_OPTIONS); errcode = 'CL_INVALID_BUILD_OPTIONS '
case(CL_INVALID_PROGRAM); errcode = 'CL_INVALID_PROGRAM '
case(CL_INVALID_PROGRAM_EXECUTABLE); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
case(CL_INVALID_KERNEL_NAME); errcode = 'CL_INVALID_KERNEL_NAME '
case(CL_INVALID_KERNEL_DEFINITION); errcode = 'CL_INVALID_KERNEL_DEFINITION '
case(CL_INVALID_KERNEL); errcode = 'CL_INVALID_KERNEL '
case(CL_INVALID_ARG_INDEX); errcode = 'CL_INVALID_ARG_INDEX '
case(CL_INVALID_ARG_VALUE); errcode = 'CL_INVALID_ARG_VALUE '
case(CL_INVALID_ARG_SIZE); errcode = 'CL_INVALID_ARG_SIZE '
case(CL_INVALID_KERNEL_ARGS); errcode = 'CL_INVALID_KERNEL_ARGS '
case(CL_INVALID_WORK_DIMENSION); errcode = 'CL_INVALID_WORK_DIMENSION '
case(CL_INVALID_WORK_GROUP_SIZE); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
case(CL_INVALID_WORK_ITEM_SIZE); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
case(CL_INVALID_GLOBAL_OFFSET); errcode = 'CL_INVALID_GLOBAL_OFFSET '
case(CL_INVALID_EVENT_WAIT_LIST); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
case(CL_INVALID_EVENT); errcode = 'CL_INVALID_EVENT '
case(CL_INVALID_OPERATION); errcode = 'CL_INVALID_OPERATION '
case(CL_INVALID_GL_OBJECT); errcode = 'CL_INVALID_GL_OBJECT '
case(CL_INVALID_BUFFER_SIZE); errcode = 'CL_INVALID_BUFFER_SIZE '
case(CL_INVALID_MIP_LEVEL); errcode = 'CL_INVALID_MIP_LEVEL '
case(CL_INVALID_GLOBAL_WORK_SIZE); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
case(CL_PLATFORM_NOT_FOUND_KHR); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
#endif
message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(opencl_print_error)
end subroutine opencl_print_error
! ----------------------------------------------------
subroutine clblas_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(clblas_print_error)
select case(ierr)
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
case(clblasSuccess); errcode = 'clblasSuccess'
case(clblasInvalidValue); errcode = 'clblasInvalidValue'
case(clblasInvalidCommandQueue); errcode = 'clblasInvalidCommandQueue'
case(clblasInvalidContext); errcode = 'clblasInvalidContext'
case(clblasInvalidMemObject); errcode = 'clblasInvalidMemObject'
case(clblasInvalidDevice); errcode = 'clblasInvalidDevice'
case(clblasInvalidEventWaitList); errcode = 'clblasInvalidEventWaitList'
case(clblasOutOfResources); errcode = 'clblasOutOfResources'
case(clblasOutOfHostMemory); errcode = 'clblasOutOfHostMemory'
case(clblasInvalidOperation); errcode = 'clblasInvalidOperation'
case(clblasCompilerNotAvailable); errcode = 'clblasCompilerNotAvailable'
case(clblasBuildProgramFailure ); errcode = 'clblasBuildProgramFailure'
case(clblasNotImplemented); errcode = 'clblasNotImplemented'
case(clblasNotInitialized); errcode = 'clblasNotInitialized'
case(clblasInvalidMatA); errcode = 'clblasInvalidMatA'
case(clblasInvalidMatB); errcode = 'clblasInvalidMatB'
case(clblasInvalidMatC); errcode = 'clblasInvalidMatC'
case(clblasInvalidVecX); errcode = 'clblasInvalidVecX'
case(clblasInvalidVecY); errcode = 'clblasInvalidVecY'
case(clblasInvalidDim); errcode = 'clblasInvalidDim'
case(clblasInvalidLeadDimA); errcode = 'clblasInvalidLeadDimA'
case(clblasInvalidLeadDimB); errcode = 'clblasInvalidLeadDimB'
case(clblasInvalidLeadDimC); errcode = 'clblasInvalidLeadDimC'
case(clblasInvalidIncX); errcode = 'clblasInvalidIncX'
case(clblasInvalidIncY); errcode = 'clblasInvalidIncY'
case(clblasInsufficientMemMatA); errcode = 'clblasInsufficientMemMatA'
case(clblasInsufficientMemMatB); errcode = 'clblasInsufficientMemMatB'
case(clblasInsufficientMemMatC); errcode = 'clblasInsufficientMemMatC'
case(clblasInsufficientMemVecX); errcode = 'clblasInsufficientMemVecX'
case(clblasInsufficientMemVecY); errcode = 'clblasInsufficientMemVecY'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
message(1) = 'clblas '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(clblas_print_error)
end subroutine clblas_print_error
! ----------------------------------------------------
subroutine clfft_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(clfft_print_error)
#ifdef HAVE_CLFFT
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
select case(ierr)
case(CLFFT_INVALID_GLOBAL_WORK_SIZE); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
case(CLFFT_INVALID_MIP_LEVEL); errcode = 'CLFFT_INVALID_MIP_LEVEL'
case(CLFFT_INVALID_BUFFER_SIZE); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
case(CLFFT_INVALID_GL_OBJECT); errcode = 'CLFFT_INVALID_GL_OBJECT'
case(CLFFT_INVALID_OPERATION); errcode = 'CLFFT_INVALID_OPERATION'
case(CLFFT_INVALID_EVENT); errcode = 'CLFFT_INVALID_EVENT'
case(CLFFT_INVALID_EVENT_WAIT_LIST); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
case(CLFFT_INVALID_GLOBAL_OFFSET); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
case(CLFFT_INVALID_WORK_ITEM_SIZE); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
case(CLFFT_INVALID_WORK_GROUP_SIZE); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
case(CLFFT_INVALID_WORK_DIMENSION); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
case(CLFFT_INVALID_KERNEL_ARGS); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
case(CLFFT_INVALID_ARG_SIZE); errcode = 'CLFFT_INVALID_ARG_SIZE'
case(CLFFT_INVALID_ARG_VALUE); errcode = 'CLFFT_INVALID_ARG_VALUE'
case(CLFFT_INVALID_ARG_INDEX); errcode = 'CLFFT_INVALID_ARG_INDEX'
case(CLFFT_INVALID_KERNEL); errcode = 'CLFFT_INVALID_KERNEL'
case(CLFFT_INVALID_KERNEL_DEFINITION); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
case(CLFFT_INVALID_KERNEL_NAME); errcode = 'CLFFT_INVALID_KERNEL_NAME'
case(CLFFT_INVALID_PROGRAM_EXECUTABLE); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
case(CLFFT_INVALID_PROGRAM); errcode = 'CLFFT_INVALID_PROGRAM'
case(CLFFT_INVALID_BUILD_OPTIONS); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
case(CLFFT_INVALID_BINARY); errcode = 'CLFFT_INVALID_BINARY'
case(CLFFT_INVALID_SAMPLER); errcode = 'CLFFT_INVALID_SAMPLER'
case(CLFFT_INVALID_IMAGE_SIZE); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
case(CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
case(CLFFT_INVALID_MEM_OBJECT); errcode = 'CLFFT_INVALID_MEM_OBJECT'
case(CLFFT_INVALID_HOST_PTR); errcode = 'CLFFT_INVALID_HOST_PTR'
case(CLFFT_INVALID_COMMAND_QUEUE); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
case(CLFFT_INVALID_QUEUE_PROPERTIES); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
case(CLFFT_INVALID_CONTEXT); errcode = 'CLFFT_INVALID_CONTEXT'
case(CLFFT_INVALID_DEVICE); errcode = 'CLFFT_INVALID_DEVICE'
case(CLFFT_INVALID_PLATFORM); errcode = 'CLFFT_INVALID_PLATFORM'
case(CLFFT_INVALID_DEVICE_TYPE); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
case(CLFFT_INVALID_VALUE); errcode = 'CLFFT_INVALID_VALUE'
case(CLFFT_MAP_FAILURE); errcode = 'CLFFT_MAP_FAILURE'
case(CLFFT_BUILD_PROGRAM_FAILURE); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
case(CLFFT_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
case(CLFFT_IMAGE_FORMAT_MISMATCH); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
case(CLFFT_MEM_COPY_OVERLAP); errcode = 'CLFFT_MEM_COPY_OVERLAP'
case(CLFFT_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
case(CLFFT_OUT_OF_HOST_MEMORY); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
case(CLFFT_OUT_OF_RESOURCES); errcode = 'CLFFT_OUT_OF_RESOURCES'
case(CLFFT_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
case(CLFFT_COMPILER_NOT_AVAILABLE); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
case(CLFFT_DEVICE_NOT_AVAILABLE); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
case(CLFFT_DEVICE_NOT_FOUND); errcode = 'CLFFT_DEVICE_NOT_FOUND'
case(CLFFT_SUCCESS); errcode = 'CLFFT_SUCCESS'
case(CLFFT_BUGCHECK); errcode = 'CLFFT_BUGCHECK'
case(CLFFT_NOTIMPLEMENTED); errcode = 'CLFFT_NOTIMPLEMENTED'
case(CLFFT_FILE_NOT_FOUND); errcode = 'CLFFT_FILE_NOT_FOUND'
case(CLFFT_FILE_CREATE_FAILURE); errcode = 'CLFFT_FILE_CREATE_FAILURE'
case(CLFFT_VERSION_MISMATCH); errcode = 'CLFFT_VERSION_MISMATCH'
case(CLFFT_INVALID_PLAN); errcode = 'CLFFT_INVALID_PLAN'
case(CLFFT_DEVICE_NO_DOUBLE); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
case(CLFFT_ENDSTATUS); errcode = 'CLFFT_ENDSTATUS'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
message(1) = 'clfft '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(clfft_print_error)
end subroutine clfft_print_error
! ----------------------------------------------------
#ifdef HAVE_OPENCL
logical function f90_cl_device_has_extension(device, extension) result(has)
type(cl_device_id), intent(inout) :: device
character(len=*), intent(in) :: extension
integer :: cl_status
character(len=2048) :: all_extensions
#ifdef HAVE_OPENCL
call clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, all_extensions, cl_status)
#endif
has = index(all_extensions, extension) /= 0
end function f90_cl_device_has_extension
#endif
! ---------------------------------------------------------
integer pure function opencl_pad(size, blk) result(pad)
integer, intent(in) :: size
integer, intent(in) :: blk
integer :: mm
mm = mod(size, blk)
if(mm == 0) then
pad = size
else
pad = size + blk - mm
end if
end function opencl_pad
! ----------------------------------------------------
subroutine accel_set_buffer_to_zero(buffer, type, nval, offset)
type(accel_mem_t), intent(inout) :: buffer
type(type_t), intent(in) :: type
integer, intent(in) :: nval
integer, optional, intent(in) :: offset
integer :: nval_real, bsize, offset_real
PUSH_SUB(accel_set_buffer_to_zero)
ASSERT(type == TYPE_CMPLX .or. type == TYPE_FLOAT)
nval_real = nval*(types_get_size(type)/8)
offset_real = optional_default(offset, 0)*(types_get_size(type)/8)
ASSERT(nval_real > 0)
call accel_set_kernel_arg(set_zero, 0, nval_real)
call accel_set_kernel_arg(set_zero, 1, offset_real)
call accel_set_kernel_arg(set_zero, 2, buffer)
bsize = accel_kernel_workgroup_size(set_zero)
call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ bsize /))
call accel_finish()
POP_SUB(accel_set_buffer_to_zero)
end subroutine accel_set_buffer_to_zero
! ----------------------------------------------------
subroutine opencl_check_bandwidth()
integer :: itime
integer, parameter :: times = 10
integer :: size
real(8) :: time, stime
real(8) :: read_bw, write_bw
FLOAT, allocatable :: data(:)
call messages_new_line()
call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
call messages_new_line()
call messages_info()
call messages_write(' Buffer size Read bw Write bw')
call messages_new_line()
call messages_write(' [MiB] [MiB/s] [MiB/s]')
call messages_info()
size = 15000
do
SAFE_ALLOCATE(data(1:size))
call accel_create_buffer(buff, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size)
stime = loct_clock()
do itime = 1, times
call accel_write_buffer(buff, size, data)
end do
time = (loct_clock() - stime)/dble(times)
write_bw = dble(size)*8.0_8/time
stime = loct_clock()
do itime = 1, times
call accel_read_buffer(buff, size, data)
time = (loct_clock() - stime)/dble(times)
read_bw = dble(size)*8.0_8/time
call messages_write(size*8.0_8/1024.0**2)
call messages_write(write_bw/1024.0**2, fmt = '(f10.1)')
call messages_write(read_bw/1024.0**2, fmt = '(f10.1)')
call messages_info()
call accel_release_buffer(buff)
SAFE_DEALLOCATE_A(data)
size = int(size*2.0)
if(size > 50000000) exit
end do
end subroutine opencl_check_bandwidth
! ----------------------------------------------------
logical pure function accel_use_shared_mem() result(use_shared_mem)
end function accel_use_shared_mem
!------------------------------------------------------------
call cuda_module_map_init(accel%module_map)
POP_SUB(accel_kernel_global_init)
end subroutine accel_kernel_global_init
!------------------------------------------------------------
type(accel_kernel_t), pointer :: next_head
do
if(.not. associated(head)) exit
next_head => head%next
head => next_head
end do
if(accel_is_enabled()) then
call cuda_module_map_end(accel%module_map)
end if
POP_SUB(accel_kernel_global_end)
end subroutine accel_kernel_global_end
!------------------------------------------------------------
subroutine accel_kernel_build(this, file_name, kernel_name, flags)
type(accel_kernel_t), intent(inout) :: this
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags
#ifdef HAVE_OPENCL
type(cl_program) :: prog
#endif
type(c_ptr) :: cuda_module
PUSH_SUB(accel_kernel_build)
call profiling_in(prof, "ACCEL_COMPILE", exclude = .true.)
all_flags = '-I'//trim(conf%share)//'/opencl/'
if(accel_use_shared_mem()) then
all_flags = trim(all_flags)//' -DSHARED_MEM'
end if
all_flags = trim(all_flags)//' '//trim(flags)

Xavier Andrade
已提交
call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
call cuda_alloc_arg_array(this%arguments)
call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
call opencl_release_program(prog)
POP_SUB(accel_kernel_build)
end subroutine accel_kernel_build
!------------------------------------------------------------
type(accel_kernel_t), intent(inout) :: this
#ifdef HAVE_OPENCL
integer :: ierr
#endif
call cuda_free_arg_array(this%arguments)
call cuda_release_kernel(this%cuda_kernel)
! modules are not released here, since they are not associated to a kernel
#ifdef HAVE_OPENCL
call clReleaseKernel(this%kernel, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "release_kernel")
#endif
this%initialized = .false.
POP_SUB(accel_kernel_end)
end subroutine accel_kernel_end
!------------------------------------------------------------
subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
type(accel_kernel_t), target, intent(inout) :: this
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags
if(.not. this%initialized) then
call accel_kernel_build(this, file_name, kernel_name, flags)
this%next => head
head => this
end if
POP_SUB(accel_kernel_start_call)
end subroutine accel_kernel_start_call
!--------------------------------------------------------------
integer(8) pure function accel_global_memory_size() result(size)
size = accel%global_memory_size
end function accel_global_memory_size
!--------------------------------------------------------------
integer(8) pure function accel_local_memory_size() result(size)
size = accel%local_memory_size
end function accel_local_memory_size
!--------------------------------------------------------------
integer pure function accel_max_size_per_dim(dim) result(size)
integer, intent(in) :: dim
#ifdef HAVE_OPENCL
#endif
#ifdef HAVE_CUDA
#endif
end function accel_max_size_per_dim
! ------------------------------------------------------
#include "undef.F90"
#include "real.F90"
#include "undef.F90"
#include "complex.F90"
#include "undef.F90"
#include "real_single.F90"
#include "undef.F90"
#include "complex_single.F90"
#include "undef.F90"
#include "integer.F90"
!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End: