/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef CUDA_UTILS_MEMORY_HANDLING_HPP__
#define CUDA_UTILS_MEMORY_HANDLING_HPP__

#include <cassert>
#include <vector>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include "cuda_compiler_interop.hpp"
#include "cuda_assert.hpp"

/** @file cuda_utils_memory_handling.hpp
    @brief Shortcut functions to work on both device and host memory

    This file is part of the homemade Cuda_utils toolkit; it provides common
    features for device and host arrays.

    How to read acronyms in this file:
    @li h: stands for host
    @li d: stands for device
    @li htd: host to device
    @li dth: device to host
    @li dtd: device to device
    @li hth: host to host

    Why prefer these wrappers over the CUDA API functions? They are much
    shorter to write and perform additional checks, so they are safer to use
    than the raw CUDA functions.

    Checks that are done with these functions:
    @li Automatic type casting
    @li CUDA_SAFE_CALL at each API call
    @li free() always sets pointers to null
    @li malloc() doesn't crash when asked to allocate 0 elements; it sets the
    pointer to null instead.

    @see Cuda_utils
*/

// =============================================================================
namespace Cuda_utils{
// =============================================================================

/// Copy an array from host memory to device memory.
/// @param dst    destination of the transfer (device side)
/// @param src    source of the transfer (host side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpy() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_htd(T* dst, const T* src, int nb_elt){
    // The runtime API works on untyped pointers and a size in bytes
    void*        raw_dst  = static_cast<void*>(dst);
    const void*  raw_src  = static_cast<const void*>(src);
    const size_t nb_bytes = nb_elt * sizeof(T);

    CUDA_SAFE_CALL( cudaMemcpy(raw_dst, raw_src, nb_bytes, cudaMemcpyHostToDevice) );
}

/// Copy an array from device memory back to host memory.
/// @param dst    destination of the transfer (host side)
/// @param src    source of the transfer (device side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpy() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_dth(T* dst, const T* src, int nb_elt){
    // The runtime API works on untyped pointers and a size in bytes
    void*        raw_dst  = static_cast<void*>(dst);
    const void*  raw_src  = static_cast<const void*>(src);
    const size_t nb_bytes = nb_elt * sizeof(T);

    CUDA_SAFE_CALL( cudaMemcpy(raw_dst, raw_src, nb_bytes, cudaMemcpyDeviceToHost) );
}

/// Copy an array between two device memory locations.
/// @param dst    destination of the transfer (device side)
/// @param src    source of the transfer (device side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpy() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_dtd(T* dst, const T* src, int nb_elt){
    // The runtime API works on untyped pointers and a size in bytes
    void*        raw_dst  = static_cast<void*>(dst);
    const void*  raw_src  = static_cast<const void*>(src);
    const size_t nb_bytes = nb_elt * sizeof(T);

    CUDA_SAFE_CALL( cudaMemcpy(raw_dst, raw_src, nb_bytes, cudaMemcpyDeviceToDevice) );
}

/// Copy an array between two host memory locations through the CUDA runtime.
/// @param dst    destination of the transfer (host side)
/// @param src    source of the transfer (host side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpy() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_hth(T* dst, const T* src, int nb_elt){
    // The runtime API works on untyped pointers and a size in bytes
    void*        raw_dst  = static_cast<void*>(dst);
    const void*  raw_src  = static_cast<const void*>(src);
    const size_t nb_bytes = nb_elt * sizeof(T);

    CUDA_SAFE_CALL( cudaMemcpy(raw_dst, raw_src, nb_bytes, cudaMemcpyHostToHost) );
}

/// Copy a host array into a device symbol (e.g. a variable in constant memory).
/// The data is written starting at offset 0 of the symbol.
/// @param symbol device symbol receiving the data
/// @param src    source array (host side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpyToSymbol() is handed to
/// CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_symbol_htd(const void* symbol, const T* src, int nb_elt){
    const void*  raw_src  = static_cast<const void*>(src);
    const size_t nb_bytes = nb_elt * sizeof(T);

    CUDA_SAFE_CALL( cudaMemcpyToSymbol(symbol, raw_src, nb_bytes, 0, cudaMemcpyHostToDevice) );
}

/// Safe memory copy from device to device constant memory
template <typename T>
inline
void mem_cpy_symbol_dtd(const void* symbol, const T* d_src, int nb_elt){
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(symbol,
                                      reinterpret_cast<const void*>(d_src),
                                      nb_elt * sizeof(T),
                                      0,
                                      cudaMemcpyHostToDevice) );
}

/// Safe memory copy host to 1D cudaArray
/// @param dst    destination cudaArray, written from offset (0, 0)
/// @param src    source array (host side)
/// @param nb_elt number of elements of type T to copy
/// @note the status returned by cudaMemcpyToArray() is handed to
/// CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_1D_htd(cudaArray* dst, const T* src, int nb_elt){
    // Keep the byte count in size_t (the type expected by the API): storing
    // it in an int truncates it as soon as the copy exceeds INT_MAX bytes.
    const size_t data_size = sizeof(T) * nb_elt;
    CUDA_SAFE_CALL(cudaMemcpyToArray(dst, 0, 0, src, data_size, cudaMemcpyHostToDevice));
}

/// Safe memory copy host to 2D cudaArray
/// @param dst    destination cudaArray, written from offset (0, 0)
/// @param src    source array (host side); nb_elt.x * nb_elt.y elements are
/// read from it as one contiguous range
/// @param nb_elt number of elements of type T along x and y
/// @note the status returned by cudaMemcpyToArray() is handed to
/// CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_2D_htd(cudaArray* dst, const T* src, int2 nb_elt){
    // Keep the byte count in size_t (the type expected by the API): storing
    // it in an int truncates it as soon as the copy exceeds INT_MAX bytes.
    const size_t data_size = sizeof(T) * nb_elt.x * nb_elt.y;
    CUDA_SAFE_CALL(cudaMemcpyToArray(dst, 0, 0, src, data_size, cudaMemcpyHostToDevice));
}

/// Safe memory copy host to 3D cudaArray
/// @param dst    destination cudaArray
/// @param src    source array (host side), described to the runtime as
/// tightly packed rows of nb_elt.x elements (pitch = nb_elt.x * sizeof(T))
/// @param nb_elt number of elements of type T along x, y and z
/// @note the status returned by cudaMemcpy3D() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void mem_cpy_3D_htd(cudaArray* dst, const T* src, int3 nb_elt){
    cudaExtent volumeSize = make_cudaExtent(nb_elt.x, nb_elt.y, nb_elt.z);

    // cudaPitchedPtr stores a non-const void*, so constness has to be dropped
    // explicitly. reinterpret_cast is not allowed to cast away const (the
    // template would fail to compile once instantiated), hence the const_cast.
    void* h_src = const_cast<void*>(static_cast<const void*>(src));

    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr(h_src,
                                            volumeSize.width*sizeof(T),
                                            volumeSize.width,
                                            volumeSize.height);
    copyParams.dstArray = dst;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );
}

/// Allocate an array of 'nb_elt' elements of type T with cudaMalloc().
/// @param data   receives the allocated pointer; set to null when
/// nb_elt <= 0 (no allocation is attempted in that case)
/// @param nb_elt number of elements to allocate
/// @note the status returned by cudaMalloc() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void malloc_d(T*& data, int nb_elt)
{
    // Nothing to allocate: just hand back a null pointer
    if(nb_elt <= 0)
    {
        data = 0;
        return;
    }

    void** raw_ptr = reinterpret_cast<void**>(&data);
    CUDA_SAFE_CALL( cudaMalloc(raw_ptr, nb_elt * sizeof(T)) );
}

/// Allocate an array of 'nb_elt' elements of type T in host memory.
/// @tparam page_lock when true the memory is obtained with cudaMallocHost(),
/// otherwise with new[]
/// @param data   receives the allocated pointer; set to null when
/// nb_elt <= 0 (no allocation is attempted in that case)
/// @param nb_elt number of elements to allocate
/// @note the status returned by cudaMallocHost() is handed to CUDA_SAFE_CALL
template <typename T, bool page_lock>
inline
void malloc_h(T*& data, int nb_elt)
{
    // Nothing to allocate: just hand back a null pointer
    if(nb_elt <= 0)
    {
        data = 0;
        return;
    }

    if(!page_lock)
    {
        data = new T[nb_elt];
    }
    else
    {
        void** raw_ptr = reinterpret_cast<void**>(&data);
        CUDA_SAFE_CALL( cudaMallocHost(raw_ptr, nb_elt * sizeof(T)) );
    }
}

/// Allocate a cudaArray of 'nb_elt' texels whose channel format is deduced
/// from T. The array is requested with a width of nb_elt and a height of 1.
/// @param d_data receives the allocated cudaArray
/// @param nb_elt width of the array, in elements
/// @note the status returned by cudaMallocArray() is handed to CUDA_SAFE_CALL
template<typename T>
inline
void malloc_1D_array(cudaArray*& d_data, int nb_elt)
{
    const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL( cudaMallocArray(&d_data, &channel_desc, nb_elt, 1) );
}

/// Allocate a cudaArray of nb_elt.x by nb_elt.y texels whose channel format is
/// deduced from T.
/// @param d_data receives the allocated cudaArray
/// @param nb_elt width (x) and height (y) of the array, in elements
/// @note the status returned by cudaMallocArray() is handed to CUDA_SAFE_CALL
template<typename T>
inline
void malloc_2D_array(cudaArray*& d_data, int2 nb_elt)
{
    const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL( cudaMallocArray(&d_data, &channel_desc, nb_elt.x, nb_elt.y) );
}

/// Allocate a cudaArray of nb_elt.x by nb_elt.y by nb_elt.z texels whose
/// channel format is deduced from T.
/// @param d_data receives the allocated cudaArray
/// @param nb_elt width (x), height (y) and depth (z) of the array, in elements
/// @note the status returned by cudaMalloc3DArray() is handed to
/// CUDA_SAFE_CALL
template<typename T>
inline
void malloc_3D_array(cudaArray*& d_data, int3 nb_elt)
{
    const cudaExtent            extent       = make_cudaExtent(nb_elt.x, nb_elt.y, nb_elt.z);
    const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_data, &channel_desc, extent) );
}

/// Release memory with cudaFree() and reset the pointer.
/// @param data pointer to release; it is set to null once cudaFree() has been
/// called
/// @note the status returned by cudaFree() is handed to CUDA_SAFE_CALL
template <typename T>
inline
void free_d(T*& data)
{
    void* raw_ptr = static_cast<void*>(data);
    CUDA_SAFE_CALL( cudaFree(raw_ptr) );

    // Avoid leaving a dangling pointer behind
    data = 0;
}

/// Specialization of free_d() for cudaArray: the array is released with
/// cudaFreeArray() instead of cudaFree().
/// @param data cudaArray to release; it is set to null once cudaFreeArray()
/// has been called
/// @note the status returned by cudaFreeArray() is handed to CUDA_SAFE_CALL
template <>
inline
void free_d<cudaArray>(cudaArray*& data)
{
    cudaArray* const d_array = data;
    CUDA_SAFE_CALL( cudaFreeArray(d_array) );

    // Avoid leaving a dangling pointer behind
    data = 0;
}

/// Release host memory and reset the pointer.
/// @tparam page_lock when true the memory is released with cudaFreeHost(),
/// otherwise with delete[]. It presumably has to match the value used when
/// the memory was allocated with malloc_h() -- to be confirmed by callers.
/// @param data pointer to release; it is set to null afterwards
/// @note the status returned by cudaFreeHost() is handed to CUDA_SAFE_CALL
template <typename T, bool page_lock>
inline
void free_h(T*& data)
{
    if( !page_lock )
    {
        delete[] data;
    }
    else
    {
        CUDA_SAFE_CALL( cudaFreeHost(static_cast<void*>(data)) );
    }

    // Avoid leaving a dangling pointer behind
    data = 0;
}


}// END CUDA_UTILS NAMESPACE ===================================================

#endif // CUDA_UTILS_MEMORY_HANDLING_HPP__
