Newer
Older
write(message(2), '(a,f12.6,a)') " available local memory: ", dble(accel%local_memory_size)/1024.0, " Kb"
call messages_fatal(2)
else if(size_in_bytes <= 0) then
write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
call messages_fatal(1)
end if
#ifdef HAVE_CUDA
kernel%cuda_shared_mem = size_in_bytes
#endif
#ifdef HAVE_OPENCL
call clSetKernelArgLocal(kernel%kernel, narg, size_in_bytes, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "set_kernel_arg_local")
#endif
POP_SUB(accel_set_kernel_arg_local)
end subroutine accel_set_kernel_arg_local
! ------------------------------------------
subroutine accel_kernel_run(kernel, globalsizes, localsizes)
type(accel_kernel_t), intent(inout) :: kernel
integer, intent(in) :: globalsizes(:)
integer, intent(in) :: localsizes(:)
integer :: dim, ierr
integer(8) :: gsizes(1:3)
integer(8) :: lsizes(1:3)
! no push_sub, called too frequently
! cuda needs all dimensions
gsizes = 1
lsizes = 1
dim = ubound(globalsizes, dim = 1)
ASSERT(dim == ubound(localsizes, dim = 1))
! if one size is zero, there is nothing to do
if(any(globalsizes == 0)) return
ASSERT(all(localsizes <= accel_max_workgroup_size()))
ASSERT(all(mod(globalsizes, localsizes) == 0))
gsizes(1:dim) = int(globalsizes(1:dim), 8)
lsizes(1:dim) = int(localsizes(1:dim), 8)
#ifdef HAVE_OPENCL
call clEnqueueNDRangeKernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#endif
gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
ASSERT(gsizes(1) < 2_8**31 - 1_8)
ASSERT(all(gsizes(2:3) <= 65535_8))
call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
kernel%cuda_shared_mem = 0
end subroutine accel_kernel_run
! -----------------------------------------------
integer pure function accel_max_workgroup_size() result(max_workgroup_size)
max_workgroup_size = accel%max_workgroup_size
end function accel_max_workgroup_size
! -----------------------------------------------
integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
type(accel_kernel_t), intent(inout) :: kernel
integer(8) :: workgroup_size8
integer :: ierr
#ifdef HAVE_OPENCL
call clGetKernelWorkGroupInfo(kernel%kernel, accel%device%cl_device, CL_KERNEL_WORK_GROUP_SIZE, workgroup_size8, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#endif
workgroup_size = workgroup_size8
#ifdef HAVE_CUDA
workgroup_size = accel%max_workgroup_size
#endif
end function accel_kernel_workgroup_size
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_build_program(prog, filename, flags)
type(cl_program), intent(inout) :: prog
character(len=*), intent(in) :: filename
character(len=*), optional, intent(in) :: flags
character(len = 1000) :: string
character(len = 256) :: share_string
integer :: ierr, ierrlog, iunit, irec, newlen
PUSH_SUB(opencl_build_program)
string = '#include "'//trim(filename)//'"'
if(debug%info) then
call messages_write("Building CL program '"//trim(filename)//"'.")
call messages_info()
end if
prog = clCreateProgramWithSource(accel%context%cl_context, trim(string), ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateProgramWithSource")
! build the compilation flags
string='-w'
! full optimization
string=trim(string)//' -cl-denorms-are-zero'
! The following flag gives an error with the Xeon Phi
! string=trim(string)//' -cl-strict-aliasing'
string=trim(string)//' -cl-mad-enable'
string=trim(string)//' -cl-unsafe-math-optimizations'
string=trim(string)//' -cl-finite-math-only'
string=trim(string)//' -cl-fast-relaxed-math'
share_string='-I'//trim(conf%share)//'/opencl/'
if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
string = trim(string)//' -DEXT_KHR_FP64'
else if(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
string = trim(string)//' -DEXT_AMD_FP64'
else
call messages_write('Octopus requires an OpenCL device with double-precision support.')
call messages_fatal()
end if
if(accel_use_shared_mem()) then
string = trim(string)//' -DSHARED_MEM'
end if
if(present(flags)) then
string = trim(string)//' '//trim(flags)
end if
if(debug%info) then
call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
call messages_write(' '//trim(share_string)//"'.")
call messages_info()
end if
string = trim(string)//' '//trim(share_string)
call clBuildProgram(prog, trim(string), ierr)
call clGetProgramBuildInfo(prog, accel%device%cl_device, CL_PROGRAM_BUILD_LOG, string, ierrlog)
if(ierrlog /= CL_SUCCESS) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
newlen = scan(string, achar(010), back = .true.) - 1
if(newlen >= 0) string = string(1:newlen)
if(len(trim(string)) > 0) write(stderr, '(a)') trim(string)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clBuildProgram")
POP_SUB(opencl_build_program)
end subroutine opencl_build_program
#endif
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_release_program(prog)
type(cl_program), intent(inout) :: prog
integer :: ierr
PUSH_SUB(opencl_release_program)
call clReleaseProgram(prog, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseProgram")
POP_SUB(opencl_release_program)
end subroutine opencl_release_program
#endif
! -----------------------------------------------
#ifdef HAVE_OPENCL
subroutine opencl_release_kernel(prog)
type(cl_kernel), intent(inout) :: prog
integer :: ierr
PUSH_SUB(opencl_release_kernel)
#ifdef HAVE_OPENCL
call clReleaseKernel(prog, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseKernel")
#endif
POP_SUB(opencl_release_kernel)
end subroutine opencl_release_kernel
#endif
#ifdef HAVE_OPENCL
! -----------------------------------------------
subroutine opencl_create_kernel(kernel, prog, name)
type(cl_kernel), intent(inout) :: kernel
type(cl_program), intent(inout) :: prog
character(len=*), intent(in) :: name
integer :: ierr
type(profile_t), save :: prof
PUSH_SUB(opencl_create_kernel)
call profiling_in(prof, "CL_BUILD_KERNEL", exclude = .true.)
#ifdef HAVE_OPENCL
kernel = clCreateKernel(prog, name, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateKernel")
#endif
call profiling_out(prof)
POP_SUB(opencl_create_kernel)
end subroutine opencl_create_kernel
#endif
! ------------------------------------------------
subroutine opencl_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(opencl_print_error)
#ifdef HAVE_OPENCL
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
select case(ierr)
case(CL_SUCCESS); errcode = 'CL_SUCCESS '
case(CL_DEVICE_NOT_FOUND); errcode = 'CL_DEVICE_NOT_FOUND '
case(CL_DEVICE_NOT_AVAILABLE); errcode = 'CL_DEVICE_NOT_AVAILABLE '
case(CL_COMPILER_NOT_AVAILABLE); errcode = 'CL_COMPILER_NOT_AVAILABLE '
case(CL_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
case(CL_OUT_OF_RESOURCES); errcode = 'CL_OUT_OF_RESOURCES '
case(CL_OUT_OF_HOST_MEMORY); errcode = 'CL_OUT_OF_HOST_MEMORY '
case(CL_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
case(CL_MEM_COPY_OVERLAP); errcode = 'CL_MEM_COPY_OVERLAP '
case(CL_IMAGE_FORMAT_MISMATCH); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
case(CL_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
case(CL_BUILD_PROGRAM_FAILURE); errcode = 'CL_BUILD_PROGRAM_FAILURE '
case(CL_MAP_FAILURE); errcode = 'CL_MAP_FAILURE '
case(CL_INVALID_VALUE); errcode = 'CL_INVALID_VALUE '
case(CL_INVALID_DEVICE_TYPE); errcode = 'CL_INVALID_DEVICE_TYPE '
case(CL_INVALID_PLATFORM); errcode = 'CL_INVALID_PLATFORM '
case(CL_INVALID_DEVICE); errcode = 'CL_INVALID_DEVICE '
case(CL_INVALID_CONTEXT); errcode = 'CL_INVALID_CONTEXT '
case(CL_INVALID_QUEUE_PROPERTIES); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
case(CL_INVALID_COMMAND_QUEUE); errcode = 'CL_INVALID_COMMAND_QUEUE '
case(CL_INVALID_HOST_PTR); errcode = 'CL_INVALID_HOST_PTR '
case(CL_INVALID_MEM_OBJECT); errcode = 'CL_INVALID_MEM_OBJECT '
case(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
case(CL_INVALID_IMAGE_SIZE); errcode = 'CL_INVALID_IMAGE_SIZE '
case(CL_INVALID_SAMPLER); errcode = 'CL_INVALID_SAMPLER '
case(CL_INVALID_BINARY); errcode = 'CL_INVALID_BINARY '
case(CL_INVALID_BUILD_OPTIONS); errcode = 'CL_INVALID_BUILD_OPTIONS '
case(CL_INVALID_PROGRAM); errcode = 'CL_INVALID_PROGRAM '
case(CL_INVALID_PROGRAM_EXECUTABLE); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
case(CL_INVALID_KERNEL_NAME); errcode = 'CL_INVALID_KERNEL_NAME '
case(CL_INVALID_KERNEL_DEFINITION); errcode = 'CL_INVALID_KERNEL_DEFINITION '
case(CL_INVALID_KERNEL); errcode = 'CL_INVALID_KERNEL '
case(CL_INVALID_ARG_INDEX); errcode = 'CL_INVALID_ARG_INDEX '
case(CL_INVALID_ARG_VALUE); errcode = 'CL_INVALID_ARG_VALUE '
case(CL_INVALID_ARG_SIZE); errcode = 'CL_INVALID_ARG_SIZE '
case(CL_INVALID_KERNEL_ARGS); errcode = 'CL_INVALID_KERNEL_ARGS '
case(CL_INVALID_WORK_DIMENSION); errcode = 'CL_INVALID_WORK_DIMENSION '
case(CL_INVALID_WORK_GROUP_SIZE); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
case(CL_INVALID_WORK_ITEM_SIZE); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
case(CL_INVALID_GLOBAL_OFFSET); errcode = 'CL_INVALID_GLOBAL_OFFSET '
case(CL_INVALID_EVENT_WAIT_LIST); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
case(CL_INVALID_EVENT); errcode = 'CL_INVALID_EVENT '
case(CL_INVALID_OPERATION); errcode = 'CL_INVALID_OPERATION '
case(CL_INVALID_GL_OBJECT); errcode = 'CL_INVALID_GL_OBJECT '
case(CL_INVALID_BUFFER_SIZE); errcode = 'CL_INVALID_BUFFER_SIZE '
case(CL_INVALID_MIP_LEVEL); errcode = 'CL_INVALID_MIP_LEVEL '
case(CL_INVALID_GLOBAL_WORK_SIZE); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
case(CL_PLATFORM_NOT_FOUND_KHR); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
#endif
message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(opencl_print_error)
end subroutine opencl_print_error
! ----------------------------------------------------
subroutine clblas_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(clblas_print_error)
select case(ierr)
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
case(clblasSuccess); errcode = 'clblasSuccess'
case(clblasInvalidValue); errcode = 'clblasInvalidValue'
case(clblasInvalidCommandQueue); errcode = 'clblasInvalidCommandQueue'
case(clblasInvalidContext); errcode = 'clblasInvalidContext'
case(clblasInvalidMemObject); errcode = 'clblasInvalidMemObject'
case(clblasInvalidDevice); errcode = 'clblasInvalidDevice'
case(clblasInvalidEventWaitList); errcode = 'clblasInvalidEventWaitList'
case(clblasOutOfResources); errcode = 'clblasOutOfResources'
case(clblasOutOfHostMemory); errcode = 'clblasOutOfHostMemory'
case(clblasInvalidOperation); errcode = 'clblasInvalidOperation'
case(clblasCompilerNotAvailable); errcode = 'clblasCompilerNotAvailable'
case(clblasBuildProgramFailure ); errcode = 'clblasBuildProgramFailure'
case(clblasNotImplemented); errcode = 'clblasNotImplemented'
case(clblasNotInitialized); errcode = 'clblasNotInitialized'
case(clblasInvalidMatA); errcode = 'clblasInvalidMatA'
case(clblasInvalidMatB); errcode = 'clblasInvalidMatB'
case(clblasInvalidMatC); errcode = 'clblasInvalidMatC'
case(clblasInvalidVecX); errcode = 'clblasInvalidVecX'
case(clblasInvalidVecY); errcode = 'clblasInvalidVecY'
case(clblasInvalidDim); errcode = 'clblasInvalidDim'
case(clblasInvalidLeadDimA); errcode = 'clblasInvalidLeadDimA'
case(clblasInvalidLeadDimB); errcode = 'clblasInvalidLeadDimB'
case(clblasInvalidLeadDimC); errcode = 'clblasInvalidLeadDimC'
case(clblasInvalidIncX); errcode = 'clblasInvalidIncX'
case(clblasInvalidIncY); errcode = 'clblasInvalidIncY'
case(clblasInsufficientMemMatA); errcode = 'clblasInsufficientMemMatA'
case(clblasInsufficientMemMatB); errcode = 'clblasInsufficientMemMatB'
case(clblasInsufficientMemMatC); errcode = 'clblasInsufficientMemMatC'
case(clblasInsufficientMemVecX); errcode = 'clblasInsufficientMemVecX'
case(clblasInsufficientMemVecY); errcode = 'clblasInsufficientMemVecY'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
message(1) = 'clblas '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(clblas_print_error)
end subroutine clblas_print_error
! ----------------------------------------------------
subroutine clfft_print_error(ierr, name)
integer, intent(in) :: ierr
character(len=*), intent(in) :: name
character(len=40) :: errcode
PUSH_SUB(clfft_print_error)
#ifdef HAVE_CLFFT
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
select case(ierr)
case(CLFFT_INVALID_GLOBAL_WORK_SIZE); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
case(CLFFT_INVALID_MIP_LEVEL); errcode = 'CLFFT_INVALID_MIP_LEVEL'
case(CLFFT_INVALID_BUFFER_SIZE); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
case(CLFFT_INVALID_GL_OBJECT); errcode = 'CLFFT_INVALID_GL_OBJECT'
case(CLFFT_INVALID_OPERATION); errcode = 'CLFFT_INVALID_OPERATION'
case(CLFFT_INVALID_EVENT); errcode = 'CLFFT_INVALID_EVENT'
case(CLFFT_INVALID_EVENT_WAIT_LIST); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
case(CLFFT_INVALID_GLOBAL_OFFSET); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
case(CLFFT_INVALID_WORK_ITEM_SIZE); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
case(CLFFT_INVALID_WORK_GROUP_SIZE); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
case(CLFFT_INVALID_WORK_DIMENSION); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
case(CLFFT_INVALID_KERNEL_ARGS); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
case(CLFFT_INVALID_ARG_SIZE); errcode = 'CLFFT_INVALID_ARG_SIZE'
case(CLFFT_INVALID_ARG_VALUE); errcode = 'CLFFT_INVALID_ARG_VALUE'
case(CLFFT_INVALID_ARG_INDEX); errcode = 'CLFFT_INVALID_ARG_INDEX'
case(CLFFT_INVALID_KERNEL); errcode = 'CLFFT_INVALID_KERNEL'
case(CLFFT_INVALID_KERNEL_DEFINITION); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
case(CLFFT_INVALID_KERNEL_NAME); errcode = 'CLFFT_INVALID_KERNEL_NAME'
case(CLFFT_INVALID_PROGRAM_EXECUTABLE); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
case(CLFFT_INVALID_PROGRAM); errcode = 'CLFFT_INVALID_PROGRAM'
case(CLFFT_INVALID_BUILD_OPTIONS); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
case(CLFFT_INVALID_BINARY); errcode = 'CLFFT_INVALID_BINARY'
case(CLFFT_INVALID_SAMPLER); errcode = 'CLFFT_INVALID_SAMPLER'
case(CLFFT_INVALID_IMAGE_SIZE); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
case(CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
case(CLFFT_INVALID_MEM_OBJECT); errcode = 'CLFFT_INVALID_MEM_OBJECT'
case(CLFFT_INVALID_HOST_PTR); errcode = 'CLFFT_INVALID_HOST_PTR'
case(CLFFT_INVALID_COMMAND_QUEUE); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
case(CLFFT_INVALID_QUEUE_PROPERTIES); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
case(CLFFT_INVALID_CONTEXT); errcode = 'CLFFT_INVALID_CONTEXT'
case(CLFFT_INVALID_DEVICE); errcode = 'CLFFT_INVALID_DEVICE'
case(CLFFT_INVALID_PLATFORM); errcode = 'CLFFT_INVALID_PLATFORM'
case(CLFFT_INVALID_DEVICE_TYPE); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
case(CLFFT_INVALID_VALUE); errcode = 'CLFFT_INVALID_VALUE'
case(CLFFT_MAP_FAILURE); errcode = 'CLFFT_MAP_FAILURE'
case(CLFFT_BUILD_PROGRAM_FAILURE); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
case(CLFFT_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
case(CLFFT_IMAGE_FORMAT_MISMATCH); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
case(CLFFT_MEM_COPY_OVERLAP); errcode = 'CLFFT_MEM_COPY_OVERLAP'
case(CLFFT_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
case(CLFFT_OUT_OF_HOST_MEMORY); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
case(CLFFT_OUT_OF_RESOURCES); errcode = 'CLFFT_OUT_OF_RESOURCES'
case(CLFFT_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
case(CLFFT_COMPILER_NOT_AVAILABLE); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
case(CLFFT_DEVICE_NOT_AVAILABLE); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
case(CLFFT_DEVICE_NOT_FOUND); errcode = 'CLFFT_DEVICE_NOT_FOUND'
case(CLFFT_SUCCESS); errcode = 'CLFFT_SUCCESS'
case(CLFFT_BUGCHECK); errcode = 'CLFFT_BUGCHECK'
case(CLFFT_NOTIMPLEMENTED); errcode = 'CLFFT_NOTIMPLEMENTED'
case(CLFFT_FILE_NOT_FOUND); errcode = 'CLFFT_FILE_NOT_FOUND'
case(CLFFT_FILE_CREATE_FAILURE); errcode = 'CLFFT_FILE_CREATE_FAILURE'
case(CLFFT_VERSION_MISMATCH); errcode = 'CLFFT_VERSION_MISMATCH'
case(CLFFT_INVALID_PLAN); errcode = 'CLFFT_INVALID_PLAN'
case(CLFFT_DEVICE_NO_DOUBLE); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
case(CLFFT_ENDSTATUS); errcode = 'CLFFT_ENDSTATUS'
case default
write(errcode, '(i10)') ierr
errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
end select
message(1) = 'clfft '//trim(name)//' '//trim(errcode)
call messages_fatal(1)
POP_SUB(clfft_print_error)
end subroutine clfft_print_error
! ----------------------------------------------------
#ifdef HAVE_OPENCL
logical function f90_cl_device_has_extension(device, extension) result(has)
type(cl_device_id), intent(inout) :: device
character(len=*), intent(in) :: extension
integer :: cl_status
character(len=2048) :: all_extensions
#ifdef HAVE_OPENCL
call clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, all_extensions, cl_status)
#endif
has = index(all_extensions, extension) /= 0
end function f90_cl_device_has_extension
#endif
! ---------------------------------------------------------
integer pure function opencl_pad(size, blk) result(pad)
integer, intent(in) :: size
integer, intent(in) :: blk
integer :: mm
mm = mod(size, blk)
if(mm == 0) then
pad = size
else
pad = size + blk - mm
end if
end function opencl_pad
! ----------------------------------------------------
subroutine accel_set_buffer_to_zero(buffer, type, nval, offset)
type(accel_mem_t), intent(inout) :: buffer
type(type_t), intent(in) :: type
integer, intent(in) :: nval
integer, optional, intent(in) :: offset
integer :: nval_real, bsize, offset_real
PUSH_SUB(accel_set_buffer_to_zero)
ASSERT(type == TYPE_CMPLX .or. type == TYPE_FLOAT)
if(nval > 0) then
nval_real = nval*types_get_size(type)/8
offset_real = optional_default(offset, 0)*types_get_size(type)/8
call accel_set_kernel_arg(set_zero, 0, nval_real)
call accel_set_kernel_arg(set_zero, 1, offset_real)
call accel_set_kernel_arg(set_zero, 2, buffer)
bsize = accel_kernel_workgroup_size(set_zero)
call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ bsize /))
call accel_finish()
POP_SUB(accel_set_buffer_to_zero)
end subroutine accel_set_buffer_to_zero
! ----------------------------------------------------
subroutine opencl_check_bandwidth()
integer :: itime
integer, parameter :: times = 10
integer :: size
real(8) :: time, stime
real(8) :: read_bw, write_bw
FLOAT, allocatable :: data(:)
call messages_new_line()
call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
call messages_new_line()
call messages_info()
call messages_write(' Buffer size Read bw Write bw')
call messages_new_line()
call messages_write(' [MiB] [MiB/s] [MiB/s]')
call messages_info()
size = 15000
do
SAFE_ALLOCATE(data(1:size))
call accel_create_buffer(buff, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size)
stime = loct_clock()
do itime = 1, times
call accel_write_buffer(buff, size, data)
end do
time = (loct_clock() - stime)/dble(times)
write_bw = dble(size)*8.0_8/time
stime = loct_clock()
do itime = 1, times
call accel_read_buffer(buff, size, data)
time = (loct_clock() - stime)/dble(times)
read_bw = dble(size)*8.0_8/time
call messages_write(size*8.0_8/1024.0**2)
call messages_write(write_bw/1024.0**2, fmt = '(f10.1)')
call messages_write(read_bw/1024.0**2, fmt = '(f10.1)')
call messages_info()
call accel_release_buffer(buff)
SAFE_DEALLOCATE_A(data)
size = int(size*2.0)
if(size > 50000000) exit
end do
end subroutine opencl_check_bandwidth
! ----------------------------------------------------
logical pure function accel_use_shared_mem() result(use_shared_mem)
end function accel_use_shared_mem
!------------------------------------------------------------
call cuda_module_map_init(accel%module_map)
POP_SUB(accel_kernel_global_init)
end subroutine accel_kernel_global_init
!------------------------------------------------------------
type(accel_kernel_t), pointer :: next_head
do
if(.not. associated(head)) exit
next_head => head%next
head => next_head
end do
if(accel_is_enabled()) then
call cuda_module_map_end(accel%module_map)
end if
POP_SUB(accel_kernel_global_end)
end subroutine accel_kernel_global_end
!------------------------------------------------------------
subroutine accel_kernel_build(this, file_name, kernel_name, flags)
type(accel_kernel_t), intent(inout) :: this
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags
#ifdef HAVE_OPENCL
type(cl_program) :: prog
#endif
type(c_ptr) :: cuda_module
PUSH_SUB(accel_kernel_build)
call profiling_in(prof, "ACCEL_COMPILE", exclude = .true.)
all_flags = '-I'//trim(conf%share)//'/opencl/'
if(accel_use_shared_mem()) then
all_flags = trim(all_flags)//' -DSHARED_MEM'
end if
all_flags = trim(all_flags)//' '//trim(flags)

Xavier Andrade
已提交
call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
call cuda_alloc_arg_array(this%arguments)
call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
call opencl_release_program(prog)
POP_SUB(accel_kernel_build)
end subroutine accel_kernel_build
!------------------------------------------------------------
type(accel_kernel_t), intent(inout) :: this
#ifdef HAVE_OPENCL
integer :: ierr
#endif
call cuda_free_arg_array(this%arguments)
call cuda_release_kernel(this%cuda_kernel)
! modules are not released here, since they are not associated to a kernel
#ifdef HAVE_OPENCL
call clReleaseKernel(this%kernel, ierr)
if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "release_kernel")
#endif
this%initialized = .false.
POP_SUB(accel_kernel_end)
end subroutine accel_kernel_end
!------------------------------------------------------------
subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
type(accel_kernel_t), target, intent(inout) :: this
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags
if(.not. this%initialized) then
call accel_kernel_build(this, file_name, kernel_name, flags)
this%next => head
head => this
end if
POP_SUB(accel_kernel_start_call)
end subroutine accel_kernel_start_call
!--------------------------------------------------------------
integer(8) pure function accel_global_memory_size() result(size)
size = accel%global_memory_size
end function accel_global_memory_size
!--------------------------------------------------------------
integer(8) pure function accel_local_memory_size() result(size)
size = accel%local_memory_size
end function accel_local_memory_size
!--------------------------------------------------------------
#include "undef.F90"
#include "real.F90"
#include "undef.F90"
#include "complex.F90"
#include "undef.F90"
#include "real_single.F90"
#include "undef.F90"
#include "complex_single.F90"
#include "undef.F90"
#include "integer.F90"
!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End: