Skip to content
代码片段 群组 项目
accel.F90 58.5 KB
Newer Older
      write(message(2), '(a,f12.6,a)') "          available local memory: ", dble(accel%local_memory_size)/1024.0, " Kb"
      call messages_fatal(2)
    else if(size_in_bytes <= 0) then
      write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
      call messages_fatal(1)
    end if

#ifdef HAVE_CUDA
    kernel%cuda_shared_mem = size_in_bytes
#endif

    call clSetKernelArgLocal(kernel%kernel, narg, size_in_bytes, ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "set_kernel_arg_local")
    POP_SUB(accel_set_kernel_arg_local)
  end subroutine accel_set_kernel_arg_local
  ! ------------------------------------------
  subroutine accel_kernel_run(kernel, globalsizes, localsizes)
    type(accel_kernel_t), intent(inout) :: kernel
    integer,              intent(in)    :: globalsizes(:)
    integer,              intent(in)    :: localsizes(:)

    integer :: dim, ierr
    integer(8) :: gsizes(1:3)
    integer(8) :: lsizes(1:3)

    ! no push_sub, called too frequently

    ! cuda needs all dimensions
    gsizes = 1
    lsizes = 1
    
    dim = ubound(globalsizes, dim = 1)

    ASSERT(dim == ubound(localsizes, dim = 1))

    ! if one size is zero, there is nothing to do
    if(any(globalsizes == 0)) return

    ASSERT(all(localsizes > 0))
    ASSERT(all(localsizes <= accel_max_workgroup_size()))
    ASSERT(all(mod(globalsizes, localsizes) == 0))

    gsizes(1:dim) = int(globalsizes(1:dim), 8)
    lsizes(1:dim) = int(localsizes(1:dim), 8)
    call clEnqueueNDRangeKernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#ifdef HAVE_CUDA
    gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
    
    ASSERT(gsizes(1) < 2_8**31 - 1_8)
    ASSERT(all(gsizes(2:3) <= 65535_8))
    call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)

    kernel%cuda_shared_mem = 0    
  end subroutine accel_kernel_run
  ! -----------------------------------------------
  integer pure function accel_max_workgroup_size() result(max_workgroup_size)
    max_workgroup_size = accel%max_workgroup_size
  end function accel_max_workgroup_size
  ! -----------------------------------------------
  integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
    type(accel_kernel_t), intent(inout) :: kernel

    integer(8) :: workgroup_size8
    integer    :: ierr

    call clGetKernelWorkGroupInfo(kernel%kernel, accel%device%cl_device, CL_KERNEL_WORK_GROUP_SIZE, workgroup_size8, ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
#ifdef HAVE_CUDA
    workgroup_size = accel%max_workgroup_size
#endif

  end function accel_kernel_workgroup_size
  ! -----------------------------------------------
  subroutine opencl_build_program(prog, filename, flags)
    type(cl_program),           intent(inout) :: prog
    character(len=*),           intent(in)    :: filename
    character(len=*), optional, intent(in)    :: flags

    character(len = 256) :: share_string
    integer :: ierr, ierrlog, iunit, irec, newlen
    string = '#include "'//trim(filename)//'"'
    if(debug%info) then
      call messages_write("Building CL program '"//trim(filename)//"'.")
      call messages_info()
    end if
    prog = clCreateProgramWithSource(accel%context%cl_context, trim(string), ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateProgramWithSource")

    ! build the compilation flags
    string='-w'
    ! full optimization
    string=trim(string)//' -cl-denorms-are-zero'
    ! The following flag gives an error with the Xeon Phi
    !    string=trim(string)//' -cl-strict-aliasing'
    string=trim(string)//' -cl-mad-enable'
    string=trim(string)//' -cl-unsafe-math-optimizations'
    string=trim(string)//' -cl-finite-math-only'
    string=trim(string)//' -cl-fast-relaxed-math'
    share_string='-I'//trim(conf%share)//'/opencl/'
    if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
      string = trim(string)//' -DEXT_KHR_FP64'
    else if(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
      string = trim(string)//' -DEXT_AMD_FP64'
    else
      call messages_write('Octopus requires an OpenCL device with double-precision support.')
      call messages_fatal()
    end if
    if(accel_use_shared_mem()) then
    if(present(flags)) then
      string = trim(string)//' '//trim(flags)
    end if

      call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
      call messages_write('  '//trim(share_string)//"'.")
    string = trim(string)//' '//trim(share_string)

    call clBuildProgram(prog, trim(string), ierr)

    call clGetProgramBuildInfo(prog, accel%device%cl_device, CL_PROGRAM_BUILD_LOG, string, ierrlog)
    if(ierrlog /= CL_SUCCESS) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")

    ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
    newlen = scan(string, achar(010), back = .true.) - 1
    if(newlen >= 0) string = string(1:newlen)
    
    if(len(trim(string)) > 0) write(stderr, '(a)') trim(string)

    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clBuildProgram")
    POP_SUB(opencl_build_program)
  end subroutine opencl_build_program

  ! -----------------------------------------------
  subroutine opencl_release_program(prog)
    type(cl_program),    intent(inout) :: prog

    integer :: ierr

    PUSH_SUB(opencl_release_program)

    call clReleaseProgram(prog, ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseProgram")

    POP_SUB(opencl_release_program)
  end subroutine opencl_release_program

  ! -----------------------------------------------

  subroutine opencl_release_kernel(prog)
    type(cl_kernel),      intent(inout) :: prog

    integer :: ierr

    PUSH_SUB(opencl_release_kernel)

    call clReleaseKernel(prog, ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseKernel")

    POP_SUB(opencl_release_kernel)
  end subroutine opencl_release_kernel
  ! -----------------------------------------------
  subroutine opencl_create_kernel(kernel, prog, name)
    type(cl_kernel),  intent(inout) :: kernel
    type(cl_program), intent(inout) :: prog
    character(len=*), intent(in)    :: name

    integer :: ierr
    type(profile_t), save :: prof

    PUSH_SUB(opencl_create_kernel)
    call profiling_in(prof, "CL_BUILD_KERNEL", exclude = .true.)
    kernel = clCreateKernel(prog, name, ierr)
    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateKernel")

    call profiling_out(prof)
    POP_SUB(opencl_create_kernel)
  end subroutine opencl_create_kernel
  ! ------------------------------------------------

  subroutine opencl_print_error(ierr, name)
    integer,          intent(in) :: ierr
    character(len=*), intent(in) :: name

    character(len=40) :: errcode

    PUSH_SUB(opencl_print_error)

    select case(ierr)
    case(CL_SUCCESS); errcode = 'CL_SUCCESS '
    case(CL_DEVICE_NOT_FOUND); errcode = 'CL_DEVICE_NOT_FOUND '
    case(CL_DEVICE_NOT_AVAILABLE); errcode = 'CL_DEVICE_NOT_AVAILABLE '
    case(CL_COMPILER_NOT_AVAILABLE); errcode = 'CL_COMPILER_NOT_AVAILABLE '
    case(CL_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
    case(CL_OUT_OF_RESOURCES); errcode = 'CL_OUT_OF_RESOURCES '
    case(CL_OUT_OF_HOST_MEMORY); errcode = 'CL_OUT_OF_HOST_MEMORY '
    case(CL_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
    case(CL_MEM_COPY_OVERLAP); errcode = 'CL_MEM_COPY_OVERLAP '
    case(CL_IMAGE_FORMAT_MISMATCH); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
    case(CL_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
    case(CL_BUILD_PROGRAM_FAILURE); errcode = 'CL_BUILD_PROGRAM_FAILURE '
    case(CL_MAP_FAILURE); errcode = 'CL_MAP_FAILURE '
    case(CL_INVALID_VALUE); errcode = 'CL_INVALID_VALUE '
    case(CL_INVALID_DEVICE_TYPE); errcode = 'CL_INVALID_DEVICE_TYPE '
    case(CL_INVALID_PLATFORM); errcode = 'CL_INVALID_PLATFORM '
    case(CL_INVALID_DEVICE); errcode = 'CL_INVALID_DEVICE '
    case(CL_INVALID_CONTEXT); errcode = 'CL_INVALID_CONTEXT '
    case(CL_INVALID_QUEUE_PROPERTIES); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
    case(CL_INVALID_COMMAND_QUEUE); errcode = 'CL_INVALID_COMMAND_QUEUE '
    case(CL_INVALID_HOST_PTR); errcode = 'CL_INVALID_HOST_PTR '
    case(CL_INVALID_MEM_OBJECT); errcode = 'CL_INVALID_MEM_OBJECT '
    case(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
    case(CL_INVALID_IMAGE_SIZE); errcode = 'CL_INVALID_IMAGE_SIZE '
    case(CL_INVALID_SAMPLER); errcode = 'CL_INVALID_SAMPLER '
    case(CL_INVALID_BINARY); errcode = 'CL_INVALID_BINARY '
    case(CL_INVALID_BUILD_OPTIONS); errcode = 'CL_INVALID_BUILD_OPTIONS '
    case(CL_INVALID_PROGRAM); errcode = 'CL_INVALID_PROGRAM '
    case(CL_INVALID_PROGRAM_EXECUTABLE); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
    case(CL_INVALID_KERNEL_NAME); errcode = 'CL_INVALID_KERNEL_NAME '
    case(CL_INVALID_KERNEL_DEFINITION); errcode = 'CL_INVALID_KERNEL_DEFINITION '
    case(CL_INVALID_KERNEL); errcode = 'CL_INVALID_KERNEL '
    case(CL_INVALID_ARG_INDEX); errcode = 'CL_INVALID_ARG_INDEX '
    case(CL_INVALID_ARG_VALUE); errcode = 'CL_INVALID_ARG_VALUE '
    case(CL_INVALID_ARG_SIZE); errcode = 'CL_INVALID_ARG_SIZE '
    case(CL_INVALID_KERNEL_ARGS); errcode = 'CL_INVALID_KERNEL_ARGS '
    case(CL_INVALID_WORK_DIMENSION); errcode = 'CL_INVALID_WORK_DIMENSION '
    case(CL_INVALID_WORK_GROUP_SIZE); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
    case(CL_INVALID_WORK_ITEM_SIZE); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
    case(CL_INVALID_GLOBAL_OFFSET); errcode = 'CL_INVALID_GLOBAL_OFFSET '
    case(CL_INVALID_EVENT_WAIT_LIST); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
    case(CL_INVALID_EVENT); errcode = 'CL_INVALID_EVENT '
    case(CL_INVALID_OPERATION); errcode = 'CL_INVALID_OPERATION '
    case(CL_INVALID_GL_OBJECT); errcode = 'CL_INVALID_GL_OBJECT '
    case(CL_INVALID_BUFFER_SIZE); errcode = 'CL_INVALID_BUFFER_SIZE '
    case(CL_INVALID_MIP_LEVEL); errcode = 'CL_INVALID_MIP_LEVEL '
    case(CL_INVALID_GLOBAL_WORK_SIZE); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
    case(CL_PLATFORM_NOT_FOUND_KHR); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
    case default
      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
    end select
    message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
    call messages_fatal(1)

    POP_SUB(opencl_print_error)
  end subroutine opencl_print_error

  ! ----------------------------------------------------

  subroutine clblas_print_error(ierr, name)
    integer,          intent(in) :: ierr
    character(len=*), intent(in) :: name

    character(len=40) :: errcode

    PUSH_SUB(clblas_print_error)
    case(clblasSuccess);                    errcode = 'clblasSuccess'
    case(clblasInvalidValue);               errcode = 'clblasInvalidValue'
    case(clblasInvalidCommandQueue);        errcode = 'clblasInvalidCommandQueue'
    case(clblasInvalidContext);             errcode = 'clblasInvalidContext'
    case(clblasInvalidMemObject);           errcode = 'clblasInvalidMemObject'
    case(clblasInvalidDevice);              errcode = 'clblasInvalidDevice'
    case(clblasInvalidEventWaitList);       errcode = 'clblasInvalidEventWaitList'
    case(clblasOutOfResources);             errcode = 'clblasOutOfResources'
    case(clblasOutOfHostMemory);            errcode = 'clblasOutOfHostMemory'
    case(clblasInvalidOperation);           errcode = 'clblasInvalidOperation'
    case(clblasCompilerNotAvailable);       errcode = 'clblasCompilerNotAvailable'
    case(clblasBuildProgramFailure );       errcode = 'clblasBuildProgramFailure'
    case(clblasNotImplemented);             errcode = 'clblasNotImplemented'
    case(clblasNotInitialized);             errcode = 'clblasNotInitialized'
    case(clblasInvalidMatA);                errcode = 'clblasInvalidMatA'
    case(clblasInvalidMatB);                errcode = 'clblasInvalidMatB'
    case(clblasInvalidMatC);                errcode = 'clblasInvalidMatC'
    case(clblasInvalidVecX);                errcode = 'clblasInvalidVecX'
    case(clblasInvalidVecY);                errcode = 'clblasInvalidVecY'
    case(clblasInvalidDim);                 errcode = 'clblasInvalidDim'
    case(clblasInvalidLeadDimA);            errcode = 'clblasInvalidLeadDimA'
    case(clblasInvalidLeadDimB);            errcode = 'clblasInvalidLeadDimB'
    case(clblasInvalidLeadDimC);            errcode = 'clblasInvalidLeadDimC'
    case(clblasInvalidIncX);                errcode = 'clblasInvalidIncX'
    case(clblasInvalidIncY);                errcode = 'clblasInvalidIncY'
    case(clblasInsufficientMemMatA);        errcode = 'clblasInsufficientMemMatA'
    case(clblasInsufficientMemMatB);        errcode = 'clblasInsufficientMemMatB'
    case(clblasInsufficientMemMatC);        errcode = 'clblasInsufficientMemMatC'
    case(clblasInsufficientMemVecX);        errcode = 'clblasInsufficientMemVecX'
    case(clblasInsufficientMemVecY);        errcode = 'clblasInsufficientMemVecY'
    case default
      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
    end select
    message(1) = 'clblas '//trim(name)//' '//trim(errcode)
    call messages_fatal(1)

    POP_SUB(clblas_print_error)
  end subroutine clblas_print_error
  ! ----------------------------------------------------
  subroutine clfft_print_error(ierr, name)
    integer,          intent(in) :: ierr
    character(len=*), intent(in) :: name
    select case(ierr)
    case(CLFFT_INVALID_GLOBAL_WORK_SIZE);          errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE' 
    case(CLFFT_INVALID_MIP_LEVEL);                 errcode = 'CLFFT_INVALID_MIP_LEVEL' 
    case(CLFFT_INVALID_BUFFER_SIZE);               errcode = 'CLFFT_INVALID_BUFFER_SIZE' 
    case(CLFFT_INVALID_GL_OBJECT);                 errcode = 'CLFFT_INVALID_GL_OBJECT' 
    case(CLFFT_INVALID_OPERATION);                 errcode = 'CLFFT_INVALID_OPERATION' 
    case(CLFFT_INVALID_EVENT);                     errcode = 'CLFFT_INVALID_EVENT' 
    case(CLFFT_INVALID_EVENT_WAIT_LIST);           errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST' 
    case(CLFFT_INVALID_GLOBAL_OFFSET);             errcode = 'CLFFT_INVALID_GLOBAL_OFFSET' 
    case(CLFFT_INVALID_WORK_ITEM_SIZE);            errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE' 
    case(CLFFT_INVALID_WORK_GROUP_SIZE);           errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE' 
    case(CLFFT_INVALID_WORK_DIMENSION);            errcode = 'CLFFT_INVALID_WORK_DIMENSION' 
    case(CLFFT_INVALID_KERNEL_ARGS);               errcode = 'CLFFT_INVALID_KERNEL_ARGS' 
    case(CLFFT_INVALID_ARG_SIZE);                  errcode = 'CLFFT_INVALID_ARG_SIZE' 
    case(CLFFT_INVALID_ARG_VALUE);                 errcode = 'CLFFT_INVALID_ARG_VALUE' 
    case(CLFFT_INVALID_ARG_INDEX);                 errcode = 'CLFFT_INVALID_ARG_INDEX' 
    case(CLFFT_INVALID_KERNEL);                    errcode = 'CLFFT_INVALID_KERNEL' 
    case(CLFFT_INVALID_KERNEL_DEFINITION);         errcode = 'CLFFT_INVALID_KERNEL_DEFINITION' 
    case(CLFFT_INVALID_KERNEL_NAME);               errcode = 'CLFFT_INVALID_KERNEL_NAME' 
    case(CLFFT_INVALID_PROGRAM_EXECUTABLE);        errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE' 
    case(CLFFT_INVALID_PROGRAM);                   errcode = 'CLFFT_INVALID_PROGRAM' 
    case(CLFFT_INVALID_BUILD_OPTIONS);             errcode = 'CLFFT_INVALID_BUILD_OPTIONS' 
    case(CLFFT_INVALID_BINARY);                    errcode = 'CLFFT_INVALID_BINARY' 
    case(CLFFT_INVALID_SAMPLER);                   errcode = 'CLFFT_INVALID_SAMPLER' 
    case(CLFFT_INVALID_IMAGE_SIZE);                errcode = 'CLFFT_INVALID_IMAGE_SIZE' 
    case(CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR);   errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR' 
    case(CLFFT_INVALID_MEM_OBJECT);                errcode = 'CLFFT_INVALID_MEM_OBJECT' 
    case(CLFFT_INVALID_HOST_PTR);                  errcode = 'CLFFT_INVALID_HOST_PTR' 
    case(CLFFT_INVALID_COMMAND_QUEUE);             errcode = 'CLFFT_INVALID_COMMAND_QUEUE' 
    case(CLFFT_INVALID_QUEUE_PROPERTIES);          errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES' 
    case(CLFFT_INVALID_CONTEXT);                   errcode = 'CLFFT_INVALID_CONTEXT' 
    case(CLFFT_INVALID_DEVICE);                    errcode = 'CLFFT_INVALID_DEVICE' 
    case(CLFFT_INVALID_PLATFORM);                  errcode = 'CLFFT_INVALID_PLATFORM' 
    case(CLFFT_INVALID_DEVICE_TYPE);               errcode = 'CLFFT_INVALID_DEVICE_TYPE' 
    case(CLFFT_INVALID_VALUE);                     errcode = 'CLFFT_INVALID_VALUE' 
    case(CLFFT_MAP_FAILURE);                       errcode = 'CLFFT_MAP_FAILURE' 
    case(CLFFT_BUILD_PROGRAM_FAILURE);             errcode = 'CLFFT_BUILD_PROGRAM_FAILURE' 
    case(CLFFT_IMAGE_FORMAT_NOT_SUPPORTED);        errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED' 
    case(CLFFT_IMAGE_FORMAT_MISMATCH);             errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH' 
    case(CLFFT_MEM_COPY_OVERLAP);                  errcode = 'CLFFT_MEM_COPY_OVERLAP' 
    case(CLFFT_PROFILING_INFO_NOT_AVAILABLE);      errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE' 
    case(CLFFT_OUT_OF_HOST_MEMORY);                errcode = 'CLFFT_OUT_OF_HOST_MEMORY' 
    case(CLFFT_OUT_OF_RESOURCES);                  errcode = 'CLFFT_OUT_OF_RESOURCES' 
    case(CLFFT_MEM_OBJECT_ALLOCATION_FAILURE);     errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE' 
    case(CLFFT_COMPILER_NOT_AVAILABLE);            errcode = 'CLFFT_COMPILER_NOT_AVAILABLE' 
    case(CLFFT_DEVICE_NOT_AVAILABLE);              errcode = 'CLFFT_DEVICE_NOT_AVAILABLE' 
    case(CLFFT_DEVICE_NOT_FOUND);                  errcode = 'CLFFT_DEVICE_NOT_FOUND' 
    case(CLFFT_SUCCESS);                           errcode = 'CLFFT_SUCCESS' 
    case(CLFFT_BUGCHECK);                          errcode = 'CLFFT_BUGCHECK' 
    case(CLFFT_NOTIMPLEMENTED);                    errcode = 'CLFFT_NOTIMPLEMENTED' 
    case(CLFFT_FILE_NOT_FOUND);                    errcode = 'CLFFT_FILE_NOT_FOUND' 
    case(CLFFT_FILE_CREATE_FAILURE);               errcode = 'CLFFT_FILE_CREATE_FAILURE' 
    case(CLFFT_VERSION_MISMATCH);                  errcode = 'CLFFT_VERSION_MISMATCH' 
    case(CLFFT_INVALID_PLAN);                      errcode = 'CLFFT_INVALID_PLAN'
    case(CLFFT_DEVICE_NO_DOUBLE);                  errcode = 'CLFFT_DEVICE_NO_DOUBLE' 
    case(CLFFT_ENDSTATUS);                         errcode = 'CLFFT_ENDSTATUS' 
    case default
      write(errcode, '(i10)') ierr
      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
    end select
    message(1) = 'clfft '//trim(name)//' '//trim(errcode)
    POP_SUB(clfft_print_error)
  end subroutine clfft_print_error
  ! ----------------------------------------------------
  logical function f90_cl_device_has_extension(device, extension) result(has)
    type(cl_device_id), intent(inout) :: device
    character(len=*),   intent(in)    :: extension
    integer :: cl_status
    character(len=2048) :: all_extensions
    call clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, all_extensions, cl_status)
    has = index(all_extensions, extension) /= 0
  end function f90_cl_device_has_extension
  ! ---------------------------------------------------------
  integer pure function opencl_pad(size, blk) result(pad)
    integer, intent(in) :: size
    integer, intent(in) :: blk
    mm = mod(size, blk)
    if(mm == 0) then
      pad = size
    else
      pad = size + blk - mm
    end if
  end function opencl_pad

  ! ----------------------------------------------------

  subroutine accel_set_buffer_to_zero(buffer, type, nval, offset)
    type(accel_mem_t),  intent(inout) :: buffer
    type(type_t),       intent(in)    :: type
    integer,            intent(in)    :: nval
    integer, optional,  intent(in)    :: offset

    integer :: nval_real, bsize, offset_real
    PUSH_SUB(accel_set_buffer_to_zero)

    ASSERT(type == TYPE_CMPLX .or. type == TYPE_FLOAT)

    if(nval > 0) then
      
      nval_real = nval*types_get_size(type)/8
      offset_real = optional_default(offset, 0)*types_get_size(type)/8
      
      call accel_set_kernel_arg(set_zero, 0, nval_real)
      call accel_set_kernel_arg(set_zero, 1, offset_real)
      call accel_set_kernel_arg(set_zero, 2, buffer)
      
      bsize = accel_kernel_workgroup_size(set_zero)
      
      call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ bsize /))
      call accel_finish()
    POP_SUB(accel_set_buffer_to_zero)
  end subroutine accel_set_buffer_to_zero
  ! ----------------------------------------------------
  subroutine opencl_check_bandwidth()
    integer :: itime
    integer, parameter :: times = 10
    integer :: size
    real(8) :: time, stime
    real(8) :: read_bw, write_bw
    type(accel_mem_t) :: buff
    FLOAT, allocatable :: data(:)

    call messages_new_line()
    call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
    call messages_new_line()
    call messages_info()

    call messages_write(' Buffer size   Read bw  Write bw')
    call messages_new_line()
    call messages_write('       [MiB]   [MiB/s]   [MiB/s]')
    call messages_info()

    size = 15000
    do 
      SAFE_ALLOCATE(data(1:size))
      call accel_create_buffer(buff, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size)
        call accel_write_buffer(buff, size, data)
      end do
      time = (loct_clock() - stime)/dble(times)

      write_bw = dble(size)*8.0_8/time

      stime = loct_clock()
      do itime = 1, times
        call accel_read_buffer(buff, size, data)

      time = (loct_clock() - stime)/dble(times)
      read_bw = dble(size)*8.0_8/time

      call messages_write(size*8.0_8/1024.0**2)
      call messages_write(write_bw/1024.0**2, fmt = '(f10.1)')
      call messages_write(read_bw/1024.0**2, fmt = '(f10.1)')
      call messages_info()

      call accel_release_buffer(buff)

      SAFE_DEALLOCATE_A(data)

      size = int(size*2.0)

      if(size > 50000000) exit
    end do
  end subroutine opencl_check_bandwidth
  ! ----------------------------------------------------
  logical pure function accel_use_shared_mem() result(use_shared_mem)
    use_shared_mem = accel%shared_mem
  end function accel_use_shared_mem

  !------------------------------------------------------------

  subroutine accel_kernel_global_init()
    PUSH_SUB(accel_kernel_global_init)
    call cuda_module_map_init(accel%module_map)
    
    POP_SUB(accel_kernel_global_init)
  end subroutine accel_kernel_global_init

  !------------------------------------------------------------
  
  subroutine accel_kernel_global_end()
    type(accel_kernel_t), pointer :: next_head

    PUSH_SUB(accel_kernel_global_end)

    do
      if(.not. associated(head)) exit
      next_head => head%next
      call accel_kernel_end(head)
    if(accel_is_enabled()) then
      call cuda_module_map_end(accel%module_map)
    end if
    POP_SUB(accel_kernel_global_end)
  end subroutine accel_kernel_global_end

  !------------------------------------------------------------

  subroutine accel_kernel_build(this, file_name, kernel_name, flags)
    type(accel_kernel_t),        intent(inout) :: this
    character(len=*),            intent(in)    :: file_name
    character(len=*),            intent(in)    :: kernel_name
    character(len=*), optional,  intent(in)    :: flags

    type(profile_t), save :: prof
#ifdef HAVE_OPENCL
    type(cl_program) :: prog
#endif
NicolasTD's avatar
NicolasTD 已提交
    character(len=1000) :: all_flags

    call profiling_in(prof, "ACCEL_COMPILE", exclude = .true.)

    all_flags = '-I'//trim(conf%share)//'/opencl/'

    if(accel_use_shared_mem()) then
      all_flags = trim(all_flags)//' -DSHARED_MEM'
    end if
    
      all_flags = trim(all_flags)//' '//trim(flags)
    call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
    
    call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
    call cuda_alloc_arg_array(this%arguments)

    this%cuda_shared_mem = 0
    call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
    call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
    call opencl_release_program(prog)
    call profiling_out(prof)
    
    POP_SUB(accel_kernel_build)
  end subroutine accel_kernel_build

  !------------------------------------------------------------

  subroutine accel_kernel_end(this)
    type(accel_kernel_t), intent(inout) :: this
#ifdef HAVE_OPENCL
    integer :: ierr
#endif

      PUSH_SUB(accel_kernel_end)
      call cuda_free_arg_array(this%arguments)
      call cuda_release_kernel(this%cuda_kernel)
      ! modules are not released here, since they are not associated to a kernel
#ifdef HAVE_OPENCL
      call clReleaseKernel(this%kernel, ierr)
      if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "release_kernel")
#endif
      this%initialized = .false.

      POP_SUB(accel_kernel_end)
  end subroutine accel_kernel_end

  !------------------------------------------------------------

  subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
    type(accel_kernel_t), target, intent(inout) :: this
    character(len=*),             intent(in)    :: file_name
    character(len=*),             intent(in)    :: kernel_name
    character(len=*), optional,   intent(in)    :: flags

    PUSH_SUB(accel_kernel_start_call)

    if(.not. this%initialized) then
      call accel_kernel_build(this, file_name, kernel_name, flags)
      this%next => head
      head => this
    end if

    POP_SUB(accel_kernel_start_call)
  end subroutine accel_kernel_start_call

  !--------------------------------------------------------------

  integer(8) pure function accel_global_memory_size() result(size)

    size = accel%global_memory_size
    
  end function accel_global_memory_size

  !--------------------------------------------------------------
  
  integer(8) pure function accel_local_memory_size() result(size)

    size = accel%local_memory_size
    
  end function accel_local_memory_size

  !--------------------------------------------------------------
  
#include "undef.F90"
#include "real.F90"
#include "accel_inc.F90"

#include "undef.F90"
#include "complex.F90"
#include "accel_inc.F90"
#include "undef.F90"
#include "real_single.F90"
#include "accel_inc.F90"

#include "undef.F90"
#include "complex_single.F90"
#include "accel_inc.F90"
#include "undef.F90"
#include "integer.F90"
#include "accel_inc.F90"
end module accel_oct_m

!! Local Variables:
!! mode: f90
!! coding: utf-8
!! End: