/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef CUDA_UTILS_DEVICE_ARRAY_HPP__
#define CUDA_UTILS_DEVICE_ARRAY_HPP__

// =============================================================================

#include "cuda_utils_traits.hpp"
#include "cuda_utils_vec_expr.hpp"
#include "cuda_utils_common.hpp"
#include "cuda_compiler_interop.hpp"
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>

/** @namespace Cuda_utils::Device
    @brief Utilities to work on device memory with CUDA.
    This file is part of the homemade Cuda_utils toolkit; it handles device
    arrays.

    @see Cuda_utils
*/

// =============================================================================
namespace Cuda_utils{
// =============================================================================

// Forward declaration of the host array template: it is only used here as a
// parameter type of Device::Array::insert() and Device::Array::copy_from().
namespace Host{
namespace details {
/// @tparam T : type of the array data
/// @tparam page_locked : NOTE(review): presumably selects page-locked (pinned)
/// host memory -- confirm against the definition of the template.
template <class T, bool page_locked>
struct ArrayTemplate;
}
}

// =============================================================================
namespace Device{
// =============================================================================

/// @brief Array stored in device global memory.
/// @tparam T : type of the array data
///
/// An instance either owns its device buffer (the destructor then releases
/// it) or is a shallow copy sharing the pointer of another array (flag
/// CCA::IS_COPY set: the buffer is never released by that instance).
template <class T>
struct Array : Cuda_utils::Common::Array<T>, public Vec_expr< Array<T> > {

    // Every instantiation may access the private members of the others
    // (needed by as_array_of<B>())
    template <class B>
    friend struct Cuda_utils::Device::Array;

    typedef T  value_type;
    typedef T& reference;

    /// Shortcut to the base class holding the number of elements
    typedef Cuda_utils::Common::Array<T> CCA;

    // -------------------------------------------------------------------------
    /// @name Constructors
    // -------------------------------------------------------------------------

    /// Empty array: null pointer and no state flag set
    IF_CUDA_DEVICE_HOST
    inline Array(): CCA(), data(0), state(0) { }

    /// @warning this copy constructor only copies the pointer (shallow copy);
    /// the new array is flagged CCA::IS_COPY and never frees the memory
    IF_CUDA_DEVICE_HOST
    inline Array(const Array& d_a);

    /// Create from a user allocated pointer
    /// @param ptr : pointer stored as the array data
    /// @param nb_elt : number of elements of type T behind 'ptr'
    /// @param auto_free : specify whether the memory should be freed at
    /// the destruction of the array or not
    IF_CUDA_DEVICE_HOST
    inline Array(T* ptr, int nb_elt, bool auto_free);

    /// Allocate sizeof(T)*nb_elt bytes of device memory for the array
    inline Array(int nb_elt);

    /// Allocate sizeof(T)*nb_elt bytes of device memory for the array and
    /// initialize it
    /// @param elt : element to fill the array with.
    inline Array(int nb_elt, const T& elt);

    /// Free the device memory unless the array is a copy or is empty
    inline ~Array();

    // -------------------------------------------------------------------------
    /// @name Memory management
    // -------------------------------------------------------------------------

    /// Allocation or reallocation (previous content is not preserved).
    /// @note nothing is done when the array already owns a buffer of
    /// 'nb_elt' elements
    inline void malloc(int nb_elt);

    /// Allocation or reallocation (keep previous data)
    /// @note refused (an error message is printed) on implicit copies
    inline void realloc(int nb_elt);

    /// Erase the ith element
    /// @warning slow method (array is entirely duplicated)
    inline void erase(int i);

    /// Erase elements from the index start to the index end
    /// (start and end included)
    /// @warning slow method (array is entirely duplicated)
    inline void erase(int start, int end);

    /// Erase the array (memory is freed and array size == 0)
    /// @remarks If allocated the array is also freed with the destructor
    inline void erase();


    // -------------------------------------------------------------------------
    /// @name Data insertions
    /// insert values in device memory at the index 'i'
    /// @note insertion at the end is done with:
    /// d_a.insert(d_a.size(), elt_array)
    /// @warning slow methods (array is entirely duplicated).
    // -------------------------------------------------------------------------
    /// @{

    void insert(int i, const Device::Array<T>& d_a);

    template <bool pg_lk>
    void insert(int i, const Host::details::ArrayTemplate<T, pg_lk>& h_a);

    void insert(int i, const std::vector<T>& h_vec);

    /// insert a host value in device memory at the index 'i'
    /// (insertion at the end is done with: d_a.insert(d_a.size(), elt))
    /// @warning slow method (array is entirely duplicated).
    void insert(int i, const T& val);
    /// @}

    // -------------------------------------------------------------------------
    /// @name Copy from another array
    /// The destination must already be allocated; the copy is done byte-wise
    /// and is truncated to the capacity of the destination.
    // -------------------------------------------------------------------------
    template <class B, bool pg_lk>
    inline void copy_from(const Host::details::ArrayTemplate<B, pg_lk>& h_a);

    template <class B>
    inline void copy_from(const std::vector<B>& h_vec);

    template <class B>
    inline void copy_from(const Device::Array<B>& d_a);

    /// swap this array pointer and attributes with the given array
    inline void swap(Array& d);

    // -------------------------------------------------------------------------
    /// @name Accessors
    // -------------------------------------------------------------------------

    /// Device pointer to array memory
    inline       T* ptr()     { return data; }
    inline const T* ptr()const{ return data; }

    /// Reinterpret the array internal data as an arbitrary type 'B'
    template <class B>
    IF_CUDA_DEVICE_HOST
    inline Cuda_utils::Device::Array<B> as_array_of() const;

    /// fetch value at index i from device memory with host code
    /// @warning slow method prefer host array and copy_from() for large fetches
    inline T fetch(int i) const;

    /// fetch value at index i from device memory with host code
    /// @warning slow method prefer host array and copy_from() for large fetches
    inline void fetch(int i, T& var) const;

    /// set value from host code into device memory
    /// @warning slow method prefer copy_from() for large changes
    inline void set(int i, const T& var);

    #ifdef __CUDACC__
    /// access to array elements
    /// @warning only possible from device code
    __device__ __host__
    inline const T& operator[](int i) const {
        FORBID_HOST_CALL();
        return data[i];
    }

    /// access to array elements
    /// @warning only possible from device code
    __device__ __host__
    inline T& operator[](int i) {
        FORBID_HOST_CALL();
        return data[i];
    }

    /// Bind a device array to a linear texture
    /// @return whether the array has been bound (false when it is empty)
    template <class B>
    bool bind_tex(texture<B, 1, cudaReadModeElementType>& tex_ref);
    #endif

    // -------------------------------------------------------------------------
    /// @name Overloaded ops
    // -------------------------------------------------------------------------

    #ifdef __CUDACC__
    /// Assign the array by evaluating a vector expression with a kernel
    /// (blocks of 256 threads, enough blocks to cover every element).
    /// @note To be called only from host and only through a .cu
    /// @note Nothing is launched when the array is empty.
    template <typename E>
    __host__
    void operator=(Vec_expr<E> const& vec)
    {
        // Check if the value type of the expr tree 'vec' match the array type T.
        // WARNING: compiler error will not be very explicit and will likely
        // say something like "error: incomplete type is not allowed" which
        // means static assertion has failed.
        CUDA_STATIC_ASSERT( (is_same<T, typename E::value_type>::value) );

        const int nb_elt_to_compute = CCA::size();
        // A grid of zero blocks is not a valid launch configuration, and
        // there is nothing to compute anyway.
        if(nb_elt_to_compute <= 0)
            return;

        const int block_size = 256;
        // Round up so that the last, partially filled, block is launched too
        const int grid_size = (nb_elt_to_compute + block_size - 1) / block_size;

        E const& tmp = vec;

        compute_expr_kernel<E><<< grid_size, block_size >>>
            (data, tmp, nb_elt_to_compute);
        CUDA_CHECK_ERRORS();
    }
    #endif

private:
    /// @warning assignment operator: forbidden (instead use swap or copy from)
    /// It is private and asserts when reached; the array is left untouched.
    inline Array& operator=(const Array& a) {
        (void)a; // deliberately unused
        assert(false);
        return *this;
    }

    T* data;   ///< pointer to the array memory
    int state; ///< combination of the CCA::IS_ALLOCATED and CCA::IS_COPY flags
};
// END ARRAY CLASS _____________________________________________________________

/// Print every element of a device array, each followed by "; ", then end the
/// line with std::endl.
/// The content is first copied into a temporary host buffer.
template <typename T>
std::ostream& operator<< ( std::ostream& ofs, const Array<T>& vec)
{
    // Temporary host side buffer receiving the device data
    T* h_buffer;
    malloc_h( h_buffer, vec.size() );
    mem_cpy_dth(h_buffer, vec.ptr(), vec.size());

    int idx = 0;
    while( idx < vec.size() )
    {
        ofs << h_buffer[idx] << "; ";
        ++idx;
    }

    free_h( h_buffer );

    ofs << std::endl;
    return ofs;
}

}// END DEVICE NAMESPACE =======================================================

}// END CUDA_UTILS NAMESPACE ===================================================

////////////////////////////////////////////////////////////////////////////////
//Device array methods
////////////////////////////////////////////////////////////////////////////////

/// Allocate nb_elt * sizeof(T) bytes of device memory; the array is flagged
/// as allocated and its content is left uninitialized.
template <class T>
inline Cuda_utils::Device::Array<T>::

Array(int nb_elt) :
    CCA(nb_elt),
    data(0),
    state(CCA::IS_ALLOCATED)
{
    const size_t nb_bytes = nb_elt * sizeof(T);
    CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&data), nb_bytes));
}

// -----------------------------------------------------------------------------

/// Allocate nb_elt * sizeof(T) bytes of device memory and set every element
/// to 'elt'.
template <class T>
inline Cuda_utils::Device::Array<T>::

Array(int nb_elt, const T& elt) :
    CCA(nb_elt),
    data(0),
    state(CCA::IS_ALLOCATED)
{
    CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&data), nb_elt * sizeof(T)));

    // The fill pattern is built on the host and uploaded with copy_from()
    std::vector<T> h_fill(nb_elt, elt);
    copy_from(h_fill);
}

// -----------------------------------------------------------------------------

/// Shallow copy: the new array shares the pointer of 'd_a' and is flagged
/// CCA::IS_COPY, hence it never frees the memory it points to.
template <class T>
inline Cuda_utils::Device::Array<T>::

Array(const Array& d_a) :
    CCA( d_a.nb_elt ),
    data( d_a.data ),
    state( d_a.state | CCA::IS_COPY )
{
    // Nothing else to do: no device memory is allocated nor copied
}

// -----------------------------------------------------------------------------

/// Wrap a user allocated pointer of 'nb_elt' elements.
/// When 'auto_free' is false the array is flagged CCA::IS_COPY, which prevents
/// the destructor from freeing 'ptr'.
template <class T>
inline Cuda_utils::Device::Array<T>::

Array(T* ptr, int nb_elt, bool auto_free) :
    CCA(nb_elt),
    data(ptr),
    state(CCA::IS_ALLOCATED)
{
    if(auto_free)
        return; // the array owns 'ptr'

    state |= CCA::IS_COPY;
}

// -----------------------------------------------------------------------------

/// Release the device memory, but only when the array is allocated, is not an
/// implicit copy and holds at least one element.
template <class T>
inline Cuda_utils::Device::Array<T>::

~Array()
{
    const bool owns_buffer = (state & CCA::IS_ALLOCATED) && !(state & CCA::IS_COPY);
    if( !owns_buffer || CCA::nb_elt <= 0 )
        return;

    CUDA_SAFE_CALL(cudaFree(data));
    data = 0;
}

// -----------------------------------------------------------------------------

/// (Re)allocate the array for 'nb_elt' elements without keeping its content.
/// - An array owning its buffer is left untouched when the size does not
///   change, otherwise its buffer is freed before the new allocation.
/// - An unallocated array or an implicit copy simply gets a fresh buffer (the
///   previous pointer is not freed) and is not a copy anymore.
template <class T>
inline void Cuda_utils::Device::Array<T>::

malloc(int nb_elt)
{
    const bool owns_buffer = (state & CCA::IS_ALLOCATED) && !(state & CCA::IS_COPY);

    if(owns_buffer)
    {
        if(nb_elt == CCA::nb_elt) return;

        CUDA_SAFE_CALL(cudaFree(data));
        data = 0;
    }

    CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&data), nb_elt * sizeof(T)));
    // From now on the array owns the buffer it points to
    state = (state | CCA::IS_ALLOCATED) & (~CCA::IS_COPY);
    CCA::nb_elt = nb_elt;
}

// -----------------------------------------------------------------------------

/// Resize the array to 'nb_elt' elements and keep as many of the previous
/// elements as fit in the new buffer.
/// Implicit copies cannot be reallocated: an error message is printed and the
/// array is left unchanged.
template <class T>
inline void Cuda_utils::Device::Array<T>::

realloc(int nb_elt)
{
    if(state & CCA::IS_COPY)
    {
        fprintf(stderr,"cuda_utils : Can't realloc an implicit copy !\n");
        return;
    }

    if(nb_elt == CCA::nb_elt) return;

    // Number of elements surviving the resize
    const int nb_kept = (nb_elt > CCA::nb_elt) ? CCA::nb_elt : nb_elt;

    T* new_data;
    CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&new_data), nb_elt * sizeof(T)));
    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(new_data),
                              reinterpret_cast<const void*>(data),
                              nb_kept * sizeof(T),
                              cudaMemcpyDeviceToDevice));

    // The old buffer is only released when it was allocated
    if(state & CCA::IS_ALLOCATED){
        CUDA_SAFE_CALL(cudaFree(data));
    }

    data        = new_data;
    CCA::nb_elt = nb_elt;
    state       = CCA::IS_ALLOCATED;
}

// -----------------------------------------------------------------------------

/// Remove the elements of indices [start, end] (both included).
/// The remaining elements are copied into a new, smaller, array which is then
/// swapped with this one. Nothing is done on an unallocated array.
template <class T>
inline void Cuda_utils::Device::Array<T>::

erase(int start, int end)
{
    assert(start >= 0);
    assert(start <= end);
    assert( end < CCA::size() );

    if( !(state & CCA::IS_ALLOCATED) )
        return;

    const int nb_removed = end - start + 1;

    Array<T> shrunk(CCA::size() - nb_removed);
    // Elements located before the erased range ...
    mem_cpy_dtd(shrunk.data        , data          , start                );
    // ... then the elements located after it
    mem_cpy_dtd(shrunk.data + start, data + end + 1, CCA::size() - end - 1);
    shrunk.swap(*this);
}

// -----------------------------------------------------------------------------

/// Remove the single element of index 'i': shortcut for the range [i, i].
template <class T>
inline void Cuda_utils::Device::Array<T>::

erase(int i)
{
    this->erase(i, i);
}

// -----------------------------------------------------------------------------

/// Free the device memory and reset the array to the empty state (null
/// pointer, no flag, zero element).
/// Only arrays owning their memory are concerned: unallocated arrays and
/// implicit copies are left unchanged.
template <class T>
inline void Cuda_utils::Device::Array<T>::

erase()
{
    // The two tests must be combined with a logical 'and': a bitwise '&'
    // between the masked flag and a boolean is only non zero when
    // CCA::IS_ALLOCATED happens to be the lowest bit.
    const bool is_allocated = (state & CCA::IS_ALLOCATED) != 0;
    const bool is_copy      = (state & CCA::IS_COPY     ) != 0;

    if(is_allocated && !is_copy)
    {
        CUDA_SAFE_CALL(cudaFree(data));
        data  = 0;
        state = 0;
        CCA::nb_elt = 0;
    }
}

// -----------------------------------------------------------------------------

/// Insert the content of the host array 'h_a' before the element of index 'i'
/// (i == size() appends). The result is built in a new array which is then
/// swapped with this one. Nothing is done when 'h_a' is empty.
template <class T>
template <bool pg_lk>
inline void Cuda_utils::Device::Array<T>::

insert(int i, const Host::details::ArrayTemplate<T, pg_lk>& h_a)
{
    assert(i >= 0);
    assert(i <= CCA::size());

    if(h_a.size() == 0)
        return;

    Array<T> grown(CCA::size() + h_a.size());
    // [0, i[ : elements preceding the insertion point
    mem_cpy_dtd(grown.data                   , data     , i              );
    // [i, i + h_a.size()[ : inserted elements
    mem_cpy_htd(grown.data + i               , h_a.ptr(), h_a.size()     );
    // remaining elements, shifted by h_a.size()
    mem_cpy_dtd(grown.data + i + h_a.size()  , data + i , CCA::size() - i);
    grown.swap(*this);
}

// -----------------------------------------------------------------------------

/// Insert the content of the host vector 'h_vec' before the element of index
/// 'i' (i == size() appends). The result is built in a new array which is then
/// swapped with this one. Nothing is done when 'h_vec' is empty.
template <class T>
inline void Cuda_utils::Device::Array<T>::

insert(int i, const std::vector<T>& h_vec)
{
    assert(i >= 0);
    assert(i <= CCA::size());

    if( h_vec.size() == 0)
        return;

    Array<T> grown(CCA::size() + h_vec.size());
    // [0, i[ : elements preceding the insertion point
    mem_cpy_dtd(grown.data                    , data     , i              );
    // [i, i + h_vec.size()[ : inserted elements
    mem_cpy_htd(grown.data + i                , &h_vec[0], h_vec.size()   );
    // remaining elements, shifted by h_vec.size()
    mem_cpy_dtd(grown.data + i + h_vec.size() , data + i , CCA::size() - i);
    grown.swap(*this);
}

// -----------------------------------------------------------------------------

/// Insert the content of the device array 'd_a' before the element of index
/// 'i' (i == size() appends). The result is built in a new array which is then
/// swapped with this one. Nothing is done when 'd_a' is empty.
template <class T>
inline void Cuda_utils::Device::Array<T>::

insert(int i, const Device::Array<T>& d_a)
{
    assert(i >= 0);
    assert(i <= CCA::size());

    if(d_a.size() == 0)
        return;

    Array<T> grown(CCA::size() + d_a.size());
    // [0, i[ : elements preceding the insertion point
    mem_cpy_dtd(grown.data                  , data     , i              );
    // [i, i + d_a.size()[ : inserted elements
    mem_cpy_dtd(grown.data + i              , d_a.ptr(), d_a.size()     );
    // remaining elements, shifted by d_a.size()
    mem_cpy_dtd(grown.data + i + d_a.size() , data + i , CCA::size() - i);
    grown.swap(*this);
}

// -----------------------------------------------------------------------------

/// Insert the host value 'val' before the element of index 'i'
/// (i == size() appends). The result is built in a new array, one element
/// larger, which is then swapped with this one.
template <class T>
inline void Cuda_utils::Device::Array<T>::

insert(int i, const T& val)
{
    assert(i >= 0);
    assert(i <= CCA::size());

    Array<T> grown(CCA::size() + 1);
    // [0, i[ : elements preceding the insertion point
    mem_cpy_dtd(grown.data        , data    , i              );
    // the inserted value
    mem_cpy_htd(grown.data + i    , &val    , 1              );
    // remaining elements, shifted by one
    mem_cpy_dtd(grown.data + i + 1, data + i, CCA::size() - i);
    grown.swap(*this);
}

// -----------------------------------------------------------------------------

/// Copy the bytes of the host array 'h_a' at the beginning of this array.
/// The array must already be allocated. When 'h_a' holds more bytes than this
/// array, a warning is printed, the assertion fails and (in builds without
/// assertions) the copy is truncated to the capacity of this array.
template <class T>
template <class B, bool pg_lk>
inline void Cuda_utils::Device::Array<T>::

copy_from(const Host::details::ArrayTemplate<B, pg_lk>& h_a)
{
    assert(state & CCA::IS_ALLOCATED);

    // Sizes are kept in std::size_t: a byte count may exceed what an 'int'
    // can hold even though the number of elements fits in one.
    const std::size_t capacity_in_bytes = CCA::nb_elt * sizeof(T);
    const std::size_t source_in_bytes   = h_a.size()  * sizeof(B);

    std::size_t bytes_to_copy = source_in_bytes;
    if(capacity_in_bytes < source_in_bytes)
    {
        bytes_to_copy = capacity_in_bytes;
        fprintf(stderr,"cuda_utils : warning array capacity exceeded\n");
        assert(false);
    }

    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(data),
                              reinterpret_cast<const void*>(h_a.ptr()),
                              bytes_to_copy,
                              cudaMemcpyHostToDevice));

}

// -----------------------------------------------------------------------------

// Standard function to copy the first 'bytes_to_copy' bytes of a
// std::vector<B> to the device memory pointed to by 'dst'.
// Nothing is done when 'src' is empty (taking the address of its first
// element would be undefined).
template<typename A, typename B>
inline
void cuda_mem_cpy_from_hack(A* dst,
                            const std::vector<B>& src,
                            std::size_t bytes_to_copy)
{
    if(src.empty())
        return;

    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dst),
                              reinterpret_cast<const void*>(&src[0]),
                              bytes_to_copy,
                              cudaMemcpyHostToDevice));
}

// Specific function to copy a std::vector<bool> to device memory.
// std::vector<bool> packs its elements, so they are first unpacked into a
// temporary array of bool whose bytes are then sent to the device.
// 'bytes_to_copy' must not exceed src.size() * sizeof(bool).
template<>
inline
void cuda_mem_cpy_from_hack< bool, bool >(bool* dst,
                                          const std::vector<bool>& src,
                                          std::size_t bytes_to_copy)
{
    if(src.empty())
        return;

    bool *tmp = new bool[src.size()];
    for (std::size_t i = 0; i < src.size(); ++i)
        tmp[i] = src[i];

    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(dst),
                              reinterpret_cast<const void*>(&tmp[0]),
                              bytes_to_copy,
                              cudaMemcpyHostToDevice));
    delete[] tmp;
}

// Same as above for any destination type: this overload is more specialized
// than the generic function, hence it is selected whenever the source is a
// std::vector<bool>. The destination is only used as a raw address, so the
// copy is forwarded to the bool specialization.
template<typename A>
inline
void cuda_mem_cpy_from_hack(A* dst,
                            const std::vector<bool>& src,
                            std::size_t bytes_to_copy)
{
    cuda_mem_cpy_from_hack<bool, bool>(reinterpret_cast<bool*>(dst),
                                       src,
                                       bytes_to_copy);
}

// -----------------------------------------------------------------------------

/// Copy the bytes of the host vector 'h_vec' at the beginning of this array.
/// The array must already be allocated; nothing is copied when 'h_vec' is
/// empty. When 'h_vec' holds more bytes than this array, a warning is printed,
/// the assertion fails and (in builds without assertions) the copy is
/// truncated to the capacity of this array.
template <class T>
template <class B>
inline void Cuda_utils::Device::Array<T>::

copy_from(const std::vector<B>& h_vec)
{
    assert(state & CCA::IS_ALLOCATED);
    if(h_vec.size() == 0)
        return;

    // Sizes are kept in std::size_t: a byte count may exceed what an 'int'
    // can hold even though the number of elements fits in one.
    const std::size_t capacity_in_bytes = CCA::nb_elt   * sizeof(T);
    const std::size_t source_in_bytes   = h_vec.size()  * sizeof(B);

    std::size_t bytes_to_copy = source_in_bytes;
    if(capacity_in_bytes < source_in_bytes)
    {
        bytes_to_copy = capacity_in_bytes;
        fprintf(stderr,"cuda_utils : warning array capacity exceeded\n");
        assert(false);
    }

    // The storage of a std::vector cannot always be copied directly:
    // std::vector<bool> packs its elements as bit flags. The transfer is
    // therefore delegated to cuda_mem_cpy_from_hack(), which is specialized
    // for boolean vectors.
    cuda_mem_cpy_from_hack(data, h_vec, bytes_to_copy);
}

// -----------------------------------------------------------------------------

/// Copy the bytes of the device array 'd_a' at the beginning of this array.
/// The array must already be allocated. When 'd_a' holds more bytes than this
/// array, a warning is printed, the assertion fails and (in builds without
/// assertions) the copy is truncated to the capacity of this array.
template <class T>
template <class B>
inline void Cuda_utils::Device::Array<T>::

copy_from(const Device::Array<B>& d_a)
{
    assert(state & CCA::IS_ALLOCATED);

    // Sizes are kept in std::size_t: a byte count may exceed what an 'int'
    // can hold even though the number of elements fits in one.
    const std::size_t capacity_in_bytes = CCA::nb_elt * sizeof(T);
    const std::size_t source_in_bytes   = d_a.size()  * sizeof(B);

    std::size_t bytes_to_copy = source_in_bytes;
    if(capacity_in_bytes < source_in_bytes)
    {
        bytes_to_copy = capacity_in_bytes;
        fprintf(stderr,"cuda_utils : warning array capacity exceeded\n");
        assert(false);
    }

    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(data),
                              reinterpret_cast<const void*>(d_a.ptr()),
                              bytes_to_copy,
                              cudaMemcpyDeviceToDevice));

}

// -----------------------------------------------------------------------------

#ifdef __CUDACC__
/// Bind the array memory to the texture reference 'tex_ref'.
/// The texture is set up with wrap addressing, point filtering and non
/// normalized coordinates.
/// @return false when the array is empty (nothing is bound), true otherwise
template <class T>
template <class B>
bool Cuda_utils::Device::Array<T>::

bind_tex(texture<B, 1, cudaReadModeElementType>& tex_ref)
{
    const int count = CCA::nb_elt;
    if(count <= 0)
        return false;

    // Texture parameters
    tex_ref.addressMode[0] = cudaAddressModeWrap;
    tex_ref.addressMode[1] = cudaAddressModeWrap;
    tex_ref.filterMode     = cudaFilterModePoint;
    tex_ref.normalized     = false;

    CUDA_SAFE_CALL(cudaBindTexture(0, tex_ref, data, count * sizeof(T)));
    return true;
}
#endif

// -----------------------------------------------------------------------------

/// Read the element of index 'i' from device memory and return it by value.
/// @note the result is stored in a default constructed T before being
/// overwritten by the copy.
template <class T>
inline T Cuda_utils::Device::Array<T>::

fetch(int i) const
{
    assert(i >= 0);
    assert(i < CCA::nb_elt);

    T h_value;
    const T* d_elt = data + i;
    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(&h_value),
                              reinterpret_cast<const void*>(d_elt),
                              sizeof(T),
                              cudaMemcpyDeviceToHost));
    return h_value;
}

// -----------------------------------------------------------------------------

/// Read the element of index 'i' from device memory into the host variable
/// 'var'.
template <class T>
inline void Cuda_utils::Device::Array<T>::

fetch(int i, T& var) const
{
    assert(i >= 0);
    assert(i < CCA::nb_elt);

    const T* d_elt = data + i;
    T*       h_elt = &var;
    CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(h_elt),
                              reinterpret_cast<const void*>(d_elt),
                              sizeof(T),
                              cudaMemcpyDeviceToHost));
}

// -----------------------------------------------------------------------------

/// Write the host value 'var' into the element of index 'i' of the device
/// array.
/// @param i : index of the element, must lie in [0, number of elements[
template <class T>
inline void Cuda_utils::Device::Array<T>::

set(int i, const T& var)
{
    // Same bounds checks as fetch(): a negative index is also rejected
    assert(i >= 0);
    assert(i < CCA::nb_elt);
    mem_cpy_htd(data+i, &var, 1);
}

// -----------------------------------------------------------------------------

/// Exchange the pointer, the state flags and the number of elements of this
/// array with those of 'd'. No device memory is copied.
template <class T>
inline void Cuda_utils::Device::Array<T>::

swap(Array& d)
{
    // Pointers
    T* const my_data = data;
    data   = d.data;
    d.data = my_data;

    // State flags
    const int my_state = state;
    state   = d.state;
    d.state = my_state;

    // Number of elements
    const int my_nb_elt = CCA::nb_elt;
    CCA::nb_elt = d.nb_elt;
    d.nb_elt    = my_nb_elt;
}

// -----------------------------------------------------------------------------

/// Build an array of 'B' sharing the memory of this array: the pointer is
/// reinterpreted and the number of elements is the size in bytes of this
/// array divided by sizeof(B) (integer division).
/// Only the CCA::IS_COPY flag of this array is transmitted to the result.
// NOTE(review): the result of an owning array has no flag at all before it is
// copied out; confirm this is intended rather than 'state | CCA::IS_COPY'.
template <class T>
template <class B>
inline Cuda_utils::Device::Array<B> Cuda_utils::Device::Array<T>::

as_array_of() const
{
    Cuda_utils::Device::Array<B> view;

    view.Cuda_utils::Common::Array<B>::nb_elt = (CCA::nb_elt * sizeof(T) )/sizeof(B);
    view.data  = reinterpret_cast<B*>(data);
    view.state = state & CCA::IS_COPY;

    return view;
}


#endif // CUDA_UTILS_DEVICE_ARRAY_HPP__
