/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef CUDA_UTILS__
#define CUDA_UTILS__

#include <stdio.h>
#include <cuda.h>
#include <cassert>

#include "cuda_assert.hpp"
#include "cuda_compiler_interop.hpp"
#include "cuda_utils_hd_array.hpp"
#include "cuda_utils_host_array.hpp"
#include "cuda_utils_device_array.hpp"
#include "cuda_utils_device_cuarray.hpp"
#include "cuda_utils_device_elt.hpp"
#include "cuda_utils_typedefs.hpp"

/**
    @namespace Cuda_utils
    @brief This file defines classes to handle both Host and Device arrays.

    The copy constructors of those classes only copy the pointers. This means
    that device arrays can be used as kernel arguments. To do a hard copy
    between arrays you have to explicitly use the copy_from() methods, which
    allow copies between any combination of device/host arrays. Note that the
    operator '=' is forbidden (private).

    These arrays are freed automatically at their destruction

    You can easily copy data from device to host, or in any other direction:

    @code
    // allocate 100 elements on host memory
    Cuda_utils::HA_int h_my_array(100);
    // allocate 100 elements on device memory
    Cuda_utils::DA_int d_my_array(100);

    ...

    // Copy host array in device array
    d_my_array.copy_from(h_my_array);
    // Every transfer direction is allowed:
    h_my_array.copy_from(d_my_array);
    d_my_array.copy_from(d_my_array);
    h_my_array.copy_from(h_my_array);

    // Note that you can also copy from another array type
    HA_float   f3_array(3);
    HA_Vec3_cu vec_array(1);
    ...
    f3_array.copy_from(vec_array);

    // More complex data movements can be done through the
    // Cuda_utils::mem_cpy_xxx() family of functions, but no overflow or
    // underflow check will be done
    mem_cpy_htd(d_my_array.ptr() + offset1,
                h_my_array.ptr() + offset2,
                h_my_array.size()-2
               );

    // Calling a kernel with a device array only copies the pointer
    cuda_kernel<<<1,1>>>(d_my_array);
    @endcode
*/

// =============================================================================
namespace Cuda_utils{
// =============================================================================

/// Pick a device among the available ones.
/// @return the id of the device with the maximum compute capability
/// @note NOTE(review): the function name suggests devices are ranked by an
/// estimated GFLOPS figure rather than by compute capability version --
/// confirm against the definition.
int get_max_gflops_device_id();

/// Print the attributes of a device (max threads/blocks, texture sizes, etc.)
/// @param id handle (CUdevice) of the device whose attributes are printed
/// @note The output destination is not visible from this header
/// (presumably stdout -- TODO confirm).
void print_device_attribs(CUdevice id);

/// Memory usage of the GPU in megabytes
/// @param[out] free  receives the amount of free device memory (megabytes)
/// @param[out] total receives the total amount of device memory (megabytes)
/// @note NOTE(review): which device is queried is not visible here
/// (presumably the current device/context -- confirm against the definition).
void get_device_memory_usage(double& free, double& total);

}// END CUDA_UTILS NAMESPACE ===================================================

#endif // CUDA_UTILS__
