/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef CUDA_UTILS_HD_ARRAY_HPP__
#define CUDA_UTILS_HD_ARRAY_HPP__

#include "cuda_utils_common.hpp"
#include "cuda_utils_host_array.hpp"
#include "cuda_utils_device_array.hpp"
#include <cassert>

// =============================================================================
namespace Cuda_utils{
// =============================================================================

/**
  @class HD_Array

  @brief Host and Device Array holding both device and host array

  Because we often need to maintain both device and host copies, this class
  holds a host array and a device array.

  At construction both host and device arrays are allocated.

  Typical usage:
  @code
  // Allocating device and host memory
  HD_Array<int> hd_array(2);

  // doing stuff on host memory
  hd_array[0] = 3;
  hd_array.erase(1);
  hd_array.realloc(3);

  // !!!
  // reporting on device memory the changes.
  // this will allocate device mem if necessary and copy from host
  // !!!
  hd_array.update_device_mem();

  // Launching kernel on the array
  ker<<< b, g>>>(hd_array.d_ptr(), hd_array.size());

  // reporting on host memory the changes
  hd_array.update_host_mem();

  @endcode

  You can also access device memory from a kernel using operator[]; it will
  fetch the element through the device pointer:
  @code
  // Note: as with device and host arrays, the hd_array copy constructor only
  // copies pointers, which allows passing it by value as a kernel parameter.
  __global__
  void ker(HD_Array<T> array ) {
    ...
    array[i] = ... ;
  }
  @endcode

  TODO describe the hd methods, which operate on both device and host memory

*/
template <class T>
struct HD_Array : public Host::Array<T>, public Vec_expr< HD_Array<T> >{

    /// Build an array without requesting any size: the host base class and the
    /// device array member are both default constructed.
    HD_Array() : Host::Array<T>() { }

    // -------------------------------------------------------------------------
    /// @name Operate on both device and host
    // -------------------------------------------------------------------------

    /// Allocate both device and host memory
    /// @param nb_elt_ : number of elements requested for the host array and
    /// for the device array
    HD_Array(int nb_elt_) :
        Host::Array<T>(nb_elt_),
        _d_array(nb_elt_)
    {   }

    /// Allocate both device and host memory
    /// @param nb_elt_ : number of elements requested for the host array and
    /// for the device array
    /// @param elt : value forwarded to both the host and the device array
    /// constructors
    HD_Array(int nb_elt_, const T &elt) :
        Host::Array<T>(nb_elt_, elt),
        _d_array(nb_elt_, elt)
    {   }

    /// Set both device/host arrays ith element to 'elt'
    /// @param i : index of the element to set
    /// @param elt : value written at index 'i' on the host side and then on
    /// the device side
    /// @warning it's the slowest method to set a value
    void set_hd(int i, const T& elt){
        (*this)[i] = elt;
        _d_array.set(i, elt);
    }

    /// Allocate both host and device (only if necessary for both dev and host)
    /// Each side is (re)allocated only when its current size differs from
    /// 'nb_elt_'.
    /// @param nb_elt_ : requested number of elements
    void malloc_hd(int nb_elt_) {
        if( Host::Array<T>::size() != nb_elt_) this->   malloc( nb_elt_ );
        if( _d_array.size()        != nb_elt_) _d_array.malloc( nb_elt_ );
    }

    /// Allocate both host and device (only if necessary for both dev and host)
    /// @param nb_elt_ : requested number of elements; when it is 0 both arrays
    /// are erased and nothing else is done
    /// @param elt : element used to fill the newly allocated arrays
    template <typename B>
    void malloc_hd(int nb_elt_, const B& elt)
    {
        if(nb_elt_ == 0){
            this->erase();
            _d_array.erase();
            return;
        }

        // Host side: reallocate with 'elt' when the size changes, otherwise
        // just refill the existing storage.
        if( Host::Array<T>::size() != nb_elt_)
            this->malloc( nb_elt_, elt);
        else
            this->fill( elt );

        if( _d_array.size() != nb_elt_)
            _d_array.malloc( nb_elt_ );

        // The device content is taken from the freshly filled host array.
        _d_array.copy_from( *this );
    }

    /// Copy the host array 'h_a' into both the host and the device arrays
    /// @note asserts that host and device arrays currently have the same size
    template <class B, bool pg_lk>
    inline void copy_from_hd(const Host::details::ArrayTemplate<B, pg_lk>& h_a)
    {
        assert(Host::Array<T>::size() == _d_array.size());
        this->copy_from( h_a );
        _d_array.copy_from( h_a );
    }

    /// Copy the std::vector 'h_vec' into both the host and the device arrays
    /// @note asserts that host and device arrays currently have the same size
    template <class B>
    inline void copy_from_hd(const std::vector<B>& h_vec){
        assert(Host::Array<T>::size() == _d_array.size());
        this->copy_from( h_vec );
        _d_array.copy_from( h_vec );
    }

    /// Copy 'nb_elt' elements pointed to by 'h_vec' into the host array, then
    /// copy the host array into the device array
    /// @note asserts that host and device arrays currently have the same size
    template <class B>
    inline void copy_from_hd(const B* h_vec, int nb_elt){
        assert(Host::Array<T>::size() == _d_array.size());
        this->copy_from( h_vec, nb_elt );
        _d_array.copy_from( *this );
    }

    /// Copy the device array 'd_a' into both the host and the device arrays
    /// @note asserts that host and device arrays currently have the same size
    template <class B>
    inline void copy_from_hd(const Cuda_utils::Device::Array<B>& d_a){
        assert(Host::Array<T>::size() == _d_array.size());
        this->copy_from( d_a );
        _d_array.copy_from( d_a );
    }

    /// access to array elements: will fetch from device memory if called from
    /// a kernel or to host if called from CPU code
    /// (the selection is done at compile time with __CUDA_ARCH__)
    IF_CUDA_DEVICE_HOST
    inline const T& operator[](int i) const {
        #ifndef __CUDA_ARCH__
            //assert( i > -1           );
            //assert( i <  CCA::nb_elt );
            return Host::Array<T>::ptr()[i];
        #else
            return _d_array[i];
        #endif
    }

    /// access to array elements: will fetch from device memory if called from
    /// a kernel or to host if called from CPU code
    /// (the selection is done at compile time with __CUDA_ARCH__)
    IF_CUDA_DEVICE_HOST
    inline T& operator[](int i) {
        #ifndef __CUDA_ARCH__
            //assert( i > -1           );
            //assert( i <  CCA::nb_elt );
            return Host::Array<T>::ptr()[i];
        #else
            return _d_array[i];
        #endif
    }

    // -------------------------------------------------------------------------
    /// @name Update if host/device unsynchronised
    // -------------------------------------------------------------------------

    /// Copy a segment of the host mem into device
    /// @param start : index of the first element of the segment
    /// @param nb_elt_ : number of elements in the segment
    /// @note asserts that host and device arrays have the same size and that
    /// the segment [start, start + nb_elt_) lies inside them
    void update_device_mem(int start, int nb_elt_)
    {
        assert(Host::Array<T>::size() == _d_array.size());
        // Both pointers are offset by 'start': reject segments that would
        // read or write outside of the arrays.
        assert(start >= 0 && nb_elt_ >= 0);
        assert(start + nb_elt_ <= Host::Array<T>::size());
        mem_cpy_htd( d_ptr() + start, Host::Array<T>::ptr() + start, nb_elt_ );
    }

    /// Copy a segment of the device mem into host
    /// @param start : index of the first element of the segment
    /// @param nb_elt_ : number of elements in the segment
    /// @note asserts that host and device arrays have the same size and that
    /// the segment [start, start + nb_elt_) lies inside them
    void update_host_mem(int start, int nb_elt_)
    {
        assert(Host::Array<T>::size() == _d_array.size());
        // Both pointers are offset by 'start': reject segments that would
        // read or write outside of the arrays.
        assert(start >= 0 && nb_elt_ >= 0);
        assert(start + nb_elt_ <= Host::Array<T>::size());
        mem_cpy_dth( Host::Array<T>::ptr() + start, d_ptr() + start, nb_elt_ );
    }

    /// Override all device memory with host memory.
    /// Device is automatically reallocated to match host size if needed
    /// (and erased when the host array is empty)
    void update_device_mem()
    {
        if(CCA::nb_elt == 0){
            _d_array.erase();
            return;
        }

        if(_d_array.size() != CCA::nb_elt)
            _d_array.malloc( CCA::nb_elt );

        _d_array.copy_from( *this );
    }

    /// Override all host memory with the device memory
    /// Host is automatically reallocated to match device size if needed
    /// (and erased when the device array is empty)
    void update_host_mem()
    {
        if( _d_array.size() == 0){
            this->erase();
            return;
        }

        if(_d_array.size() != CCA::nb_elt)
            this->malloc( _d_array.size() );

        this->copy_from( _d_array);
    }

    #ifdef __CUDACC__
    /// Affect and compute vector using a kernel
    /// The expression is evaluated into the device array, then the host array
    /// is refreshed with update_host_mem().
    /// @param vec : expression to evaluate; its value_type must be T
    /// @note To be called only from host and only through a .cu
    /// @note asserts that the device array holds at least as many elements as
    /// the host array, since the kernel is given the host element count
    template <typename E>
    __host__
    void operator=(Vec_expr<E> const& vec)
    {
        // Check if the value type of the expr tree 'vec' match the array type T.
        // WARNING: compiler error will not be very explicit and will likely
        // say something like "error: incomplete type is not allowed" which
        // means static assertion has failed.
        CUDA_STATIC_ASSERT( (is_same<T, typename E::value_type>::value) );

        const int nb_elts = CCA::size();

        // The kernel receives 'nb_elts' as its element count and d_ptr() as
        // its output: the device storage must be large enough.
        assert(nb_elts <= _d_array.size());

        // A grid of 0 blocks is not a valid launch configuration, so an empty
        // array must not reach the kernel launch.
        if(nb_elts > 0)
        {
            const int block_size = 256;
            // Ceil division: enough blocks to cover every element.
            const int grid_size = (nb_elts + block_size - 1) / block_size;

            E const& tmp = vec;

            compute_expr_kernel<E><<< grid_size, block_size >>>
                (d_ptr(), tmp, nb_elts);
            CUDA_CHECK_ERRORS();
        }

        update_host_mem();
    }
    #endif

    // -------------------------------------------------------------------------
    /// @name Accessors
    // -------------------------------------------------------------------------

    /// @return a pointer to the device memory
          T* d_ptr()       { return _d_array.ptr(); }
    const T* d_ptr() const { return _d_array.ptr(); }

    /// @return reference to the device array
          Device::Array<T>& device_array()       { return _d_array; }
    const Device::Array<T>& device_array() const { return _d_array; }

private:
    /// Shorthand for the common array base (gives access to nb_elt / size())
    typedef Cuda_utils::Common::Array<T> CCA;
    /// Shorthand for the host array base class
    typedef Host::Array<T> CHA;
    /// Device side copy of the data
    Device::Array<T> _d_array;
};

}// END CUDA_UTILS NAMESPACE ===================================================

#endif // CUDA_UTILS_HD_ARRAY_HPP__
