/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef CUDA_UTILS_DEVICE_CU_ARRAY_HPP__
#define CUDA_UTILS_DEVICE_CU_ARRAY_HPP__

#include "cuda_utils_common.hpp"
#include "cuda_utils_host_array.hpp"

// =============================================================================
namespace Cuda_utils{
// =============================================================================

// =============================================================================
namespace Device{
// =============================================================================

/**
 * @class CuArray
 * @brief Utility to allocate/copy 'cudaArray' type
 *
 * When using CUDA textures one must use "cudaArray*" if linear interpolation
 * is to be used. This class provides utilities for allocating/copying data
 * in the cudaArray format.
 *
 * use case:
 * @code
 *
 * texture<float, 2, cudaReadModeElementType>  data_tex;
 * CuArray<float> d_data;
 *
 * {
 *     Host::Array<float> h_data(width * height, 0.f);
 *
 *     d_data.malloc(width, height);
 *     d_data.copy_from(h_data);
 *
 *     data_tex.normalized = false;
 *     data_tex.addressMode[0] = cudaAddressModeClamp;
 *     data_tex.addressMode[1] = cudaAddressModeClamp;
 *     data_tex.filterMode = cudaFilterModeLinear;
 *     d_data.bind_tex( data_tex );
 *     // Texture is ready to use with lerp
 * }
 *
 * @endcode
 *
 */
template <class T>
struct CuArray : Cuda_utils::Common::Array<T>{

    // -------------------------------------------------------------------------
    /// @name Constructors
    // -------------------------------------------------------------------------

    /// Empty array: no device memory attached, null extent.
    inline CuArray() :
        CCA(),
        data(0),
        state(0),
        array_extent(make_cudaExtent(0,0,0))
    { }

    /// Shallow copy: only the pointer and the dimensions are duplicated.
    /// The new instance is flagged with CCA::IS_COPY, hence it shares the
    /// cudaArray of 'ca' without owning it.
    inline CuArray(const CuArray& ca) :
        CCA(ca.nb_elt),
        data(ca.data),
        state(ca.state | CCA::IS_COPY),
        // Keep the dimensions of the shared cudaArray: a null extent would
        // be inconsistent with nb_elt and make copy_from() transfer nothing.
        array_extent(ca.array_extent)
    { }

    /// Allocate a 1D cudaArray and upload the content of the host array 'h_a'
    template<bool pg_lk>
    explicit inline CuArray(const Host::details::ArrayTemplate<T,pg_lk>& h_a);

    /// Allocate a 1D cudaArray and copy the content of the device array 'd_a'
    explicit inline CuArray(const Cuda_utils::Device::Array<T>& d_a);

    /// @name Allocating constructors (content is left uninitialized)
    /// @{
    explicit inline CuArray(int dimx);
    explicit inline CuArray(int dimx, int dimy);
    explicit inline CuArray(int dimx, int dimy, int dimz);
    /// @}

    /// Destructor
    inline ~CuArray();

    // -------------------------------------------------------------------------
    /// @name Allocation (always erase previous data)
    // -------------------------------------------------------------------------
    /// @{
    inline void malloc(int dimx);
    inline void malloc(int dimx, int dimy);
    inline void malloc(int dimx, int dimy, int dimz);
    /// @}

    /// Free device memory
    inline void erase();

    // -------------------------------------------------------------------------
    /// @name Copy
    // -------------------------------------------------------------------------

    // TODO: copy from std::vector

    /// Copy from another host array
    /// @return 0 if succeeded
    template <class B, bool pg_lk>
    inline int copy_from(const Host::details::ArrayTemplate<B, pg_lk>& h_a);

    /// Upload data from device memory to the CuArray
    /// @param d_a : the device array in the standard global memory space
    /// @param size : number of elements of the array d_a.
    /// @return 0 if succeeded
    template <class B>
    inline int copy_from(B* d_a, int size);

    // -------------------------------------------------------------------------
    /// @name Methods
    // -------------------------------------------------------------------------
    #ifdef __CUDACC__
    /// Bind the texture reference 'texref' to this cudaArray.
    /// Nothing is done when the array holds no element.
    /// @warning don't forget to setup the texture parameters
    /// (clamping, filter mode etc.)
    template <int dim>
    inline void bind_tex(texture<T, dim, cudaReadModeElementType>& texref) const {
        if(CCA::nb_elt > 0) CUDA_SAFE_CALL(cudaBindTextureToArray(texref, data));
    }
    #endif

private:

    /// @warning assignment operator: forbidden (instead use copy_from()).
    /// Kept private so it cannot be called from outside; it performs no
    /// assignment and simply returns the (unchanged) current instance.
    inline CuArray& operator=(const CuArray& a) {
        (void)a; // intentionally unused
        return *this;
    }

    cudaArray* data;          ///< handle of the device cudaArray (0 if none)
    int state;                ///< bit field of CCA::IS_ALLOCATED / CCA::IS_COPY
    cudaExtent array_extent;  ///< extent used to allocate 'data'
    typedef Cuda_utils::Common::Array<T> CCA;
};
// END CUARRAY CLASS ___________________________________________________________

}
// END DEVICE NAMESPACE ========================================================

}
// END CUDA_UTILS NAMESPACE ====================================================


////////////////////////////////////////////////////////////////////////////////
// CuArray methods implementation
////////////////////////////////////////////////////////////////////////////////

/// Allocate a 1D cudaArray of h_a.size() elements and upload the content of
/// the host array 'h_a' into it.
/// @param h_a : host array to copy from
template <class T>
template<bool pg_lk>
inline Cuda_utils::Device::CuArray<T>::

CuArray(const Host::details::ArrayTemplate<T,pg_lk>& h_a):
    CCA(h_a.size()),
    data(0), // never leave the handle indeterminate if the allocation fails
    state(CCA::IS_ALLOCATED),
    array_extent(make_cudaExtent(h_a.size(),0,0))
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &channelDesc, array_extent));

    // The allocation extent uses a null height/depth to request a 1D array,
    // but cudaMemcpy3D() is given the extent of the region to transfer:
    // a null height/depth there would describe an empty region.
    const cudaExtent copy_extent = make_cudaExtent(array_extent.width, 1, 1);

    cudaMemcpy3DParms copyParams = {0};
    // 'h_a' is const: go through 'const void*' before dropping the qualifier
    // (the source is only read by the copy).
    copyParams.srcPtr  = make_cudaPitchedPtr(const_cast<void*>(reinterpret_cast<const void*>(h_a.ptr())),
                                             copy_extent.width*sizeof(T),
                                             copy_extent.width, copy_extent.height);
    copyParams.dstArray= data;
    copyParams.extent  = copy_extent;
    copyParams.kind    = cudaMemcpyHostToDevice;
    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );
}

// -----------------------------------------------------------------------------

/// Allocate a 1D cudaArray of d_a.size() elements and copy the content of
/// the device array 'd_a' into it (device to device transfer).
/// @param d_a : device array to copy from
template <class T>
inline Cuda_utils::Device::CuArray<T>::

CuArray(const Cuda_utils::Device::Array<T>& d_a):
    CCA(d_a.size()),
    data(0), // never leave the handle indeterminate if the allocation fails
    state(CCA::IS_ALLOCATED),
    array_extent(make_cudaExtent(d_a.size(),0,0))
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &channelDesc, array_extent));

    // The allocation extent uses a null height/depth to request a 1D array,
    // but cudaMemcpy3D() is given the extent of the region to transfer:
    // a null height/depth there would describe an empty region.
    const cudaExtent copy_extent = make_cudaExtent(array_extent.width, 1, 1);

    cudaMemcpy3DParms copyParams = {0};
    // Valid whether ptr() yields 'T*' or 'const T*' (the source is only read)
    copyParams.srcPtr  = make_cudaPitchedPtr(const_cast<void*>(reinterpret_cast<const void*>(d_a.ptr())),
                                             copy_extent.width*sizeof(T),
                                             copy_extent.width, copy_extent.height);
    copyParams.dstArray= data;
    copyParams.extent  = copy_extent;
    copyParams.kind    = cudaMemcpyDeviceToDevice;
    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );
}

// -----------------------------------------------------------------------------

/// Allocate a cudaArray through cudaMalloc3DArray() with the extent
/// (dimx, 0, 0). The content is not initialized.
/// @param dimx : number of elements of the array
template <class T>
inline Cuda_utils::Device::CuArray<T>::

CuArray(int dimx):
    CCA(dimx),
    // Start from a null handle: should the allocation fail without aborting,
    // 'data' must not hold an indeterminate value that is freed later on.
    data(0),
    state(CCA::IS_ALLOCATED),
    array_extent(make_cudaExtent(dimx, 0, 0))
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &channelDesc, array_extent));
}

// -----------------------------------------------------------------------------

/// Allocate a cudaArray through cudaMalloc3DArray() with the extent
/// (dimx, dimy, 0). The content is not initialized.
/// @param dimx : size of the array along x
/// @param dimy : size of the array along y
template <class T>
inline Cuda_utils::Device::CuArray<T>::

CuArray(int dimx, int dimy):
    CCA(dimx*dimy),
    // Start from a null handle: should the allocation fail without aborting,
    // 'data' must not hold an indeterminate value that is freed later on.
    data(0),
    state(CCA::IS_ALLOCATED),
    array_extent(make_cudaExtent(dimx, dimy, 0))
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &channelDesc, array_extent));
}

// -----------------------------------------------------------------------------

/// Allocate a cudaArray through cudaMalloc3DArray() with the extent
/// (dimx, dimy, dimz). The content is not initialized.
/// @param dimx : size of the array along x
/// @param dimy : size of the array along y
/// @param dimz : size of the array along z
template <class T>
inline Cuda_utils::Device::CuArray<T>::

CuArray(int dimx, int dimy, int dimz):
    CCA(dimx*dimy*dimz),
    // Start from a null handle: should the allocation fail without aborting,
    // 'data' must not hold an indeterminate value that is freed later on.
    data(0),
    state(CCA::IS_ALLOCATED),
    array_extent(make_cudaExtent(dimx, dimy, dimz))
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<T>();
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &channelDesc, array_extent));
}

// -----------------------------------------------------------------------------

/// Free the cudaArray when this instance is allocated, is not flagged as a
/// copy and holds at least one element. Otherwise nothing is released.
template <class T>
inline Cuda_utils::Device::CuArray<T>::

~CuArray()
{
    // Logical operators on purpose: a bitwise '&' between the masked flag
    // value and the boolean results is only correct when the flag equals 1.
    const bool is_allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool is_copy      = (state & CCA::IS_COPY)      != 0;

    if(is_allocated && !is_copy && CCA::nb_elt > 0){
        CUDA_SAFE_CALL(cudaFreeArray(data));
        data = 0;
    }
}

// -----------------------------------------------------------------------------

/// Free the device memory and reset the array to the empty state.
/// Nothing is done when the array is not allocated, is flagged as a copy or
/// holds no element.
template <class T>
inline void Cuda_utils::Device::CuArray<T>::

erase()
{
    // Logical operators on purpose: a bitwise '&' between the masked flag
    // value and the boolean results is only correct when the flag equals 1.
    const bool is_allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool is_copy      = (state & CCA::IS_COPY)      != 0;

    if(is_allocated && !is_copy && CCA::nb_elt > 0)
    {
        CUDA_SAFE_CALL(cudaFreeArray(data));
        data  = 0;
        state = 0;
        CCA::nb_elt = 0;
        // Do not keep the dimensions of the memory that was just released
        array_extent = make_cudaExtent(0,0,0);
    }
}

// -----------------------------------------------------------------------------

/// (Re)allocate the cudaArray with the extent (dimx, 0, 0).
/// Memory owned by this instance (allocated and not flagged as a copy) is
/// freed first; the new content is not initialized.
/// @param dimx : number of elements to allocate
template <class T>
inline void Cuda_utils::Device::CuArray<T>::

malloc(int dimx)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    const cudaExtent requested = make_cudaExtent(dimx,0,0);
    const int nb_elt = dimx;

    assert(nb_elt >= 0);

    const bool allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool shallow   = (state & CCA::IS_COPY)      != 0;

    // Only memory this instance owns is released; a copy merely drops its
    // reference to the shared cudaArray.
    if(allocated && !shallow){
        CUDA_SAFE_CALL(cudaFreeArray(data));
        data = 0;
    }

    array_extent = requested;
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &desc, array_extent));

    // From now on the instance owns its memory
    state = (state | CCA::IS_ALLOCATED) & (~CCA::IS_COPY);
    CCA::nb_elt = nb_elt;
}

// -----------------------------------------------------------------------------

/// (Re)allocate the cudaArray with the extent (dimx, dimy, 0).
/// Memory owned by this instance (allocated and not flagged as a copy) is
/// freed first; the new content is not initialized.
/// @param dimx : size to allocate along x
/// @param dimy : size to allocate along y
template <class T>
inline void Cuda_utils::Device::CuArray<T>::

malloc(int dimx, int dimy)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    const cudaExtent requested = make_cudaExtent(dimx, dimy,0);
    const int nb_elt = dimx * dimy;

    assert(nb_elt >= 0);

    const bool allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool shallow   = (state & CCA::IS_COPY)      != 0;

    // Only memory this instance owns is released; a copy merely drops its
    // reference to the shared cudaArray.
    if(allocated && !shallow){
        CUDA_SAFE_CALL(cudaFreeArray(data));
        data = 0;
    }

    array_extent = requested;
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &desc, array_extent));

    // From now on the instance owns its memory
    state = (state | CCA::IS_ALLOCATED) & (~CCA::IS_COPY);
    CCA::nb_elt = nb_elt;
}

// -----------------------------------------------------------------------------

/// (Re)allocate the cudaArray with the extent (dimx, dimy, dimz).
/// Memory owned by this instance (allocated and not flagged as a copy) is
/// freed first; the new content is not initialized.
/// @param dimx : size to allocate along x
/// @param dimy : size to allocate along y
/// @param dimz : size to allocate along z
template <class T>
inline void Cuda_utils::Device::CuArray<T>::

malloc(int dimx, int dimy, int dimz)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
    const cudaExtent requested = make_cudaExtent(dimx, dimy, dimz);
    const int nb_elt = dimx * dimy * dimz;

    assert(nb_elt >= 0);

    const bool allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool shallow   = (state & CCA::IS_COPY)      != 0;

    // Only memory this instance owns is released; a copy merely drops its
    // reference to the shared cudaArray.
    if(allocated && !shallow){
        CUDA_SAFE_CALL(cudaFreeArray(data));
        data = 0;
    }

    array_extent = requested;
    CUDA_SAFE_CALL(cudaMalloc3DArray(&data, &desc, array_extent));

    // From now on the instance owns its memory
    state = (state | CCA::IS_ALLOCATED) & (~CCA::IS_COPY);
    CCA::nb_elt = nb_elt;
}

// -----------------------------------------------------------------------------

/// Upload the content of the host array 'h_a' into the cudaArray.
/// The byte size of 'h_a' must match the byte size of this array (checked
/// with an assert only).
/// @param h_a : host array to copy from
/// @return 0 if the copy was issued, 1 if this array is not allocated or
/// 'h_a' is empty
template <class T>
template <class B, bool pg_lk>
inline int Cuda_utils::Device::CuArray<T>::

copy_from(const Host::details::ArrayTemplate<B, pg_lk>& h_a)
{
    assert(CCA::nb_elt * sizeof(T) == h_a.size() * sizeof(B));
    if((state & CCA::IS_ALLOCATED) && h_a.size() > 0)
    {
        // 1D/2D arrays are allocated with a null height and/or depth, but
        // cudaMemcpy3D() is given the extent of the region to transfer:
        // a null dimension there would describe an empty region.
        cudaExtent copy_extent = array_extent;
        if(copy_extent.height == 0) copy_extent.height = 1;
        if(copy_extent.depth  == 0) copy_extent.depth  = 1;

        cudaMemcpy3DParms copyParams = {0};
        // The source is only read: dropping its const qualifier is harmless
        copyParams.srcPtr  = make_cudaPitchedPtr(const_cast<void*>(reinterpret_cast<const void*>(h_a.ptr())),
                                                 copy_extent.width*sizeof(T),
                                                 copy_extent.width,
                                                 copy_extent.height);
        copyParams.dstArray= data;
        copyParams.extent  = copy_extent;
        copyParams.kind    = cudaMemcpyHostToDevice;
        CUDA_SAFE_CALL(cudaMemcpy3D(&copyParams));
        return 0;
    }
    return 1;
}

// -----------------------------------------------------------------------------

/// Copy 'size' elements from the device buffer 'd_a' into the cudaArray
/// (device to device transfer).
/// The byte size of the buffer must match the byte size of this array
/// (checked with an assert only).
/// @param d_a : the device array in the standard global memory space
/// (may point to const elements, it is only read)
/// @param size : number of elements of the array d_a.
/// @return 0 if the copy was issued, 1 if this array is not allocated or
/// 'size' is not strictly positive
template <class T>
template <class B>
inline int Cuda_utils::Device::CuArray<T>::

copy_from(B* d_a, int size)
{
    assert(CCA::nb_elt * sizeof(T) == size * sizeof(B));
    if((state & CCA::IS_ALLOCATED) && size > 0)
    {
        // 1D/2D arrays are allocated with a null height and/or depth, but
        // cudaMemcpy3D() is given the extent of the region to transfer:
        // a null dimension there would describe an empty region.
        cudaExtent copy_extent = array_extent;
        if(copy_extent.height == 0) copy_extent.height = 1;
        if(copy_extent.depth  == 0) copy_extent.depth  = 1;

        cudaMemcpy3DParms copyParams = {0};
        // Going through 'const void*' lets 'B' be const qualified
        copyParams.srcPtr  = make_cudaPitchedPtr(const_cast<void*>(static_cast<const void*>(d_a)),
                                                 copy_extent.width*sizeof(T),
                                                 copy_extent.width,
                                                 copy_extent.height);
        copyParams.dstArray= data;
        copyParams.extent  = copy_extent;
        copyParams.kind    = cudaMemcpyDeviceToDevice;
        CUDA_SAFE_CALL(cudaMemcpy3D(&copyParams));
        return 0;
    }
    return 1;
}

// -----------------------------------------------------------------------------

#endif // CUDA_UTILS_DEVICE_CU_ARRAY_HPP__
