/*
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use, 
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info" or the LICENCE.txt file present in this project.
*/

#ifndef VEC_EXPR_HPP__
#define VEC_EXPR_HPP__

#include "cuda_utils_traits.hpp"
#include "cuda_assert.hpp"
#include "cuda_compiler_interop.hpp"
#include "math_cu.hpp"
#include "transfo.hpp"
#include "vec3_cu.hpp"
#include "vec2_cu.hpp"

// This is blasphemy, This is madness ...
// NO ! THIS IS METAPROGRAMING AND MACROOOOOOOOOOOOOOOOOOS !!!!!!!!!!!!!!!!!!!!!

/**
 * @file cuda_utils_vec_expr.hpp
 * @brief Handling a static tree of arithmetic expressions such as (a+b*3/c)
 *
 * @warning this file contains a metaprogramming pattern called "Expression
 * Template". If you intend to modify it or understand the implementation
 * please see FIRST
 * <a href="http://en.wikipedia.org/wiki/Expression_templates" >a synthetic
 * example</a> and then read this doc carefully.
 *
 * Let's introduce the goal of the classes implemented here:
 * We want to be able to do arithmetic operations on arrays but in an efficient
 * manner. for instance we would like to compute the expression res = (a+b*3);
 * with 'res', 'a', and 'b' arrays with the same size. The first approach
 * would be to overload the operator '*' '+' and '=' in the following manner:
 * @code
 * Array operator+ (const Array& a) const
 * {
 *      Array temp( this->size() );
 *      for(int i = 0; i < this->size(); ++i)
 *          temp[i] = (*this)[i] + a[i];
 *      return temp;
 * }
 * @endcode
 *
 * With the other operators like '*' defined similarly the computation will be
 * particularly inefficient. Indeed <b>each time</b> we use an overloaded
 * operator we need to traverse the arrays and create a temporary array which
 * is entirely copied (because returned by value).
 *
 * Instead we want to do what's called a lazy evaluation of the expression
 * res = (a+b*3); i.e. we want to memorize the operations and only evaluate them
 * once we do the '=' at the very end. So we'd like the operator '=' to look
 * like this:
 * @code
 * Array operator = ( Arguments that describe the operations to do  )
 * {
 *      Array temp( this->size() );
 *      for(int i = 0; i < this->size(); ++i)
 *          // Retrieve the expression somehow and evaluate it
 *          temp[i] = (*this)[i] = (a[i] + b[i] * 3);
 *
 *      return temp;
 * }
 * @endcode
 *
 * Here we provide classes to implement the lazy evaluation mechanism yourself.
 * This pattern uses template metaprogramming, which ensures everything is
 * resolved at compile time. It means you will get no overhead compared to the
 * example above (that is: a manual optimisation where we write the for loop
 * ourselves to avoid copies and multiple array look ups)
 *
 * First we will describe how to use this mechanism for your own type of arrays
 * and then we'll explain a bit of the MetaProgramming pattern used.
 *
 *----------------
 * HOW TO USE IT
 *----------------
 *
 * To represent an arithmetic expression we use the base class Vec_expr.
 * Expressions are represented as a tree data structure where the leaves are
 * the arrays and the other nodes the operators such as '+', '-' etc.
 * To handle your own array type (say Custom_array) the tree must be aware that
 * Custom_array is a node. We do that by inheriting from Vec_expr as below:
 * @code
 * #include "cuda_utils_vec_expr.hpp"
 *
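 * template <typename Array_type>
 * class Custom_array : public Vec_expr< Custom_array<Array_type> > {
 * public:
 *      // Mandatory typedef to value_type
 *      typedef Array_type value_type;
 *      // Mandatory accessors to the array. The const version is the one
 *      // called by the expression nodes, which read their operands from
 *      // const member functions.
 *      Array_type&       operator[](int i)       { return array_internal_storage[i]; }
 *      const Array_type& operator[](int i) const { return array_internal_storage[i]; }
 *      ...
 * };
 * @endcode
 *
 * Notice that to be a valid node of the expression tree your array must
 * also define (publicly) the type 'value_type' with a typedef and a const
 * array accessor with the operator [].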
 *
 * You are now ready to evaluate the expression tree with your own array type!
 * Usually one will define inside Custom_array the overloaded operator '=':
 * @code
 * template <typename E>
 * Array operator=( const Vec_expr<E>& vec )
 * {
 *      // Vec_expr itself has no operator []: convert it back to the concrete
 *      // expression type first
 *      const E& expr = vec;
 *      Array temp( this->size() );
 *      for(int i = 0; i < this->size(); ++i)
 *          temp[i] = (*this)[i] = expr[i];
 *      return temp;
 * }
 * @endcode
 *
 * Note that the evaluation of 'expr[i]' is entirely replaced by
 * (a[i] + b[i] * 3) at compile time! (considering the earlier example)
 *
 *--------------
 * LIMITATIONS
 *--------------
 *
 * The design pattern used here is closer to a hack than a real feature. To do
 * the lazy eval we must use metaprogramming which is not a C++ feature but
 * a <b>design accident</b>! As such users should not expect the utilities here
 * to be perfectly robust and general to all use cases. There are limitations
 * which strongly depend on how you employ this trick.
 *
 * - We use templates:
 *  This requires all definitions to be in headers. Your array type and this
 *  header will be compiled again everywhere they are used, which can slow
 *  down compilation.
 *
 * - Unclear error messages when misusing this module:
 *  Metaprogramming not being an official feature of C++, it should not be
 *  surprising that related error messages are often cryptic. Debugging will
 *  inevitably be harder. We also use macros, which have the drawback of hiding
 *  the correct line number: the compiler will report the error at the macro
 *  call instead of pointing to the correct line in the macro definition.
 *  To ease this burden here are some common errors you may be confronted with:
 *
 * Type mismatch: your expression type doesn't match the array storing
 * the result:
 * @code
 * Custom_array<float> res;
 * Custom_array<vec3> a, b;
 * res = a + b; // res should contains vec3 or a, b should contains floats ...
 * @endcode
 * In this example compiler might throw you an error like "no operator '=' match
 * these operands" or "no function 'foo_bar' match these operands".
 *
 * No node of the expression tree match your array type:
 * @code
 * Custom_array<float> res;
 * Custom_array<vec3> a;
 * Custom_array_2<vec3> b;
 * // throws an error like "no operator 'xx' match these operands" if you forgot
 * // to inherit from Vec_expr with Custom_array_2
 * res = a + b;
 * @endcode
 *
 * If array sizes don't match in the expression you are likely to segfault
 * as this is not checked at all.
 *
 *
 *   the array use a type that with overloaded operators not defined here
 *
 * - Custom_array copy constructor must only copy pointers <-------------------- TODO force user to implement a method wich only copy pointers
 *
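 * - if arrays store custom classes with overloaded operators you need to
 * specify the binary operator's return type with
 * DEFINE_BIN_OP_SPECIAL_RET_TYPE; otherwise the default declared with
 * DEFINE_BIN_OP_DEFAULT_RET_TYPE (left or right operand type) is used.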
 *
 *-----------------------------------------
 * HOW TO ADD OPERATORS / SPECIALIZE THEM
 *------------------------------------------
 * with macros -> see the inl for an example
 *
 * by yourself defining a new Vec_expr node and constructor with an
 * overloaded operator or a function.
 *
 * THE DESIGN PATTERN
 *
 *
 * Reason expr might not work:
 * How to declare a new expression
 *
 */

/**
 *  SUMMARY OF AVAILABLE OPERATORS/FUNCTIONS FOR ARRAY EXPRESSIONS
 *
 *  @code
    ///////////////////////////////////////
    // List of binary overloaded operators

    // The element wise operations by default use the overloaded operator
    // "T1 operator symbol (T1, T2);". The result type is one of the two operand
    // types, as selected with DEFINE_BIN_OP_DEFAULT_RET_TYPE (shown as T1 below)

    T1      operator * (Vec_expr<T1>     , Vec_expr<T2>     )
    Vec3_cu operator * (Vec_expr<Transfo>, Vec_expr<Vec3_cu>)

    T1      operator - (Vec_expr<T1>     , Vec_expr<T2>     )
    T1      operator + (Vec_expr<T1>     , Vec_expr<T2>     )
    T1      operator / (Vec_expr<T1>     , Vec_expr<T2>     )
    T1      operator % (Vec_expr<T1>     , Vec_expr<T2>     )

    ///////////////////////////////////////////////////////
    // List of non commutative binary overloaded operators
    // (expression on the left, plain scalar on the right)

    Handled base type == B == { float, double, int, unsigned, long }

    T1     operator * (Vec_expr<T1>     , B     )
    T1     operator + (Vec_expr<T1>     , B     )
    T1     operator - (Vec_expr<T1>     , B     )

    ///////////////////////
    // Cast/Map expressions
    Vec_expr<T> res = cast_to< T >::expr( Vec_expr< Old_T > );

    Vec_expr<T> res = map< MapFunctor >::to( Vec_expr< Old_T > );

    ////////////////////
    // Generic functions

    Vec_expr<T> incr_expr( Vec_expr<T> );
    Vec_expr<T> sin_expr ( Vec_expr<T> );
    Vec_expr<T> cos_expr ( Vec_expr<T> );
    Vec_expr<T> tan_expr ( Vec_expr<T> );
    Vec_expr<T> exp_expr ( Vec_expr<T> );
    Vec_expr<T> log_expr ( Vec_expr<T> );

    Vec_expr<T> clamp_expr( Vec_expr<T>, T, T );

    Vec_expr<T> ternary_expr( Vec_expr<bool>, Vec_expr<T>, Vec_expr<T>);

    // Specialized functions
    Vec_expr<float> norm_expr( Vec_expr<Vec3_cu> );
    Vec_expr<float> norm_expr( Vec_expr<Vec2_cu> );

    Vec_expr<float> norm_squared_expr( Vec_expr<Vec3_cu> );
    Vec_expr<float> norm_squared_expr( Vec_expr<Vec2_cu> );

    Vec_expr<float> dot_expr( Vec_expr<Vec3_cu>, Vec_expr<Vec3_cu> );
    Vec_expr<float> dot_expr( Vec_expr<Vec2_cu>, Vec_expr<Vec2_cu> );

    Vec_expr<Vec3_cu> cross_expr( Vec_expr<Vec3_cu>, Vec_expr<Vec3_cu> );
    Vec_expr<Vec3_cu> cross_expr( Vec_expr<Vec2_cu>, Vec_expr<Vec2_cu> );
    @endcode
*/

/**
 *  SUMMARY OF AVAILABLE MACROS

    @code
    ////////////////////////
    // Overloaded operators:
    BIN_OP_LEFT_ARG
    BIN_OP_RIGHT_ARG

    DEFINE_BIN_OP(Op_name_, op_symbol);
    DEFINE_BIN_OP_DEFAULT_RET_TYPE(NAME, ITH_ARG);
    DEFINE_BIN_OP_SPECIAL_RET_TYPE(NAME, RET_TYPE, ARG_TYPE0, ARG_TYPE1);

    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD_ALL(NAME, SYMBOL);

    /////////////////////
    // Generic Functions:
    DEFINE_GENERIC_MAP(fun_name);
    DEFINE_GENERIC_MAP_2ARGS(fun_name);

    /////////////
    // Functions:
    DEFINE_MAP(fun_name, ret_type);
    DEFINE_BIN_OP_FUNC(fun_name, ret_type);
    @endcode
 */


// This pattern is called an expression template

// Note: the vector using lazy evals needs to have a copy constructor which
// copies pointers only, because every node of the expression tree stores its
// operands by value.

// =============================================================================
namespace Cuda_utils{
// =============================================================================

/// @class Vec_expr
/// @brief CRTP (curiously recurring template pattern) base class of every
/// node of an arithmetic expression tree.
///
/// Any leaf (an array) or inner node (an operator or a function) of the
/// expression tree derives from Vec_expr< itself >. This base only knows how
/// to convert itself back to the derived type 'Expr_t'. To be fully compatible
/// the derived type is expected to provide in addition:
/// - a typedef 'value_type' which tells what is the type returned by []
/// - the operator [] evaluating the element at a given index
///
/// @tparam Expr_t : the concrete type deriving from this base
template <typename Expr_t>
class Vec_expr {
public:
    /// @return this object seen as the derived expression 'Expr_t'
    IF_CUDA_DEVICE_HOST
    operator Expr_t&()
    {
        return *static_cast<Expr_t*>(this);
    }

    /// @return this object seen as the derived expression 'Expr_t' (read only)
    IF_CUDA_DEVICE_HOST
    operator const Expr_t&() const
    {
        return *static_cast<const Expr_t*>(this);
    }
};

//------------------------------------------------------------------------------
// Binary operators expression : macros to define the classes
//------------------------------------------------------------------------------

// TODO: As many definitions as possible should be hidden in the details namespace

/// @def BIN_OP_LEFT_ARG
/// @brief Flag to designate the left hand side argument of a binary operator.
/// Meant to be given as 'ITH_ARG' to DEFINE_BIN_OP_DEFAULT_RET_TYPE, where it
/// is combined with 'T' through CUDA_UTILS_JOIN (presumably naming the
/// template parameter T1 -- confirm against CUDA_UTILS_JOIN's definition).
#define BIN_OP_LEFT_ARG 1

/// @def BIN_OP_RIGHT_ARG
/// @brief Flag to designate the right hand side argument of a binary operator.
/// Meant to be given as 'ITH_ARG' to DEFINE_BIN_OP_DEFAULT_RET_TYPE, where it
/// is combined with 'T' through CUDA_UTILS_JOIN (presumably naming the
/// template parameter T2 -- confirm against CUDA_UTILS_JOIN's definition).
#define BIN_OP_RIGHT_ARG 2

/// @def DEFINE_BIN_OP_DEFAULT_RET_TYPE
/// Declare the default result type of the binary operator 'NAME'.
/// This is mandatory for an operator defined with the macro DEFINE_BIN_OP:
/// that macro only forward declares the trait Ret_type::NAME and reads its
/// nested typedef 'type'.
/// @param NAME : name of the operator, as given to DEFINE_BIN_OP
/// @param ITH_ARG : which operand type is the result type, you can use
/// BIN_OP_LEFT_ARG or BIN_OP_RIGHT_ARG
#define DEFINE_BIN_OP_DEFAULT_RET_TYPE(NAME, ITH_ARG)                          \
namespace Ret_type {                                                           \
    /* T1 and T2 must keep these names: 'type' is built by joining T and */    \
    /* the value of ITH_ARG                                               */   \
    template <typename T1, typename T2>                                        \
    struct NAME { typedef CUDA_UTILS_JOIN(T, ITH_ARG) type; };                 \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

//------------------------------------------------------------------------------

/// @def DEFINE_BIN_OP_SPECIAL_RET_TYPE
/// Override the result type of the binary operator 'NAME' for one specific
/// pair of operand types by fully specializing the trait Ret_type::NAME.
/// The primary template Ret_type::NAME must already be declared.
/// @param NAME : name of the operator
/// @param RET_TYPE : result type of 'ARG_TYPE0 op ARG_TYPE1'
/// @param ARG_TYPE0 : element type of the left hand side operand
/// @param ARG_TYPE1 : element type of the right hand side operand
#define DEFINE_BIN_OP_SPECIAL_RET_TYPE(NAME, RET_TYPE, ARG_TYPE0, ARG_TYPE1)   \
namespace Ret_type {                                                           \
    template <>                                                                \
    struct NAME< ARG_TYPE0, ARG_TYPE1 > { typedef RET_TYPE type; };            \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

//------------------------------------------------------------------------------

/// @def DEFINE_BIN_OP
/// Generate what is needed to lazily evaluate 'expr op_symbol expr':
/// - the forward declaration of the trait Ret_type::Op_name_ giving the
///   element type of the result (its definition has to be provided apart),
/// - the node class details::Op_name_ which stores both operands and
///   evaluates 'lhs[i] op_symbol rhs[i]' on demand,
/// - the overloaded operator 'op_symbol' building that node.
///
/// @note binary operators defined with this macro won't handle base types like
/// int, double, float etc. but only operations between two Vec_expr.
#define DEFINE_BIN_OP(Op_name_, op_symbol)                                     \
namespace Ret_type {                                                           \
    template <typename T1, typename T2> struct Op_name_;                       \
}                                                                              \
                                                                               \
namespace details {                                                            \
    template <typename Lhs_t, typename Rhs_t>                                  \
    class Op_name_ : public Vec_expr< Op_name_<Lhs_t, Rhs_t> > {               \
        /* Operands are copied into the node */                                \
        Lhs_t _lhs;                                                            \
        Rhs_t _rhs;                                                            \
    public:                                                                    \
        typedef typename Lhs_t::value_type Ret1;                               \
        typedef typename Rhs_t::value_type Ret2;                               \
        typedef typename Ret_type::Op_name_<Ret1, Ret2>::type value_type;      \
                                                                               \
        Op_name_(const Vec_expr<Lhs_t>& lhs, const Vec_expr<Rhs_t>& rhs) :     \
            _lhs( lhs ),                                                       \
            _rhs( rhs )                                                        \
        { }                                                                    \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return _lhs[i] op_symbol _rhs[i];                                  \
        }                                                                      \
    };                                                                         \
} /* END details namespace */                                                  \
                                                                               \
template <typename Lhs_t, typename Rhs_t>                                      \
details::Op_name_<Lhs_t, Rhs_t> const                                          \
operator op_symbol (const Vec_expr<Lhs_t>& lhs, const Vec_expr<Rhs_t>& rhs)    \
{                                                                              \
    return details::Op_name_<Lhs_t, Rhs_t>(lhs, rhs);                          \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'


//------------------------------------------------------------------------------
// Define scale operator expr * real
//------------------------------------------------------------------------------

/// @def DEFINE_BIN_OP_RHS_BASE_TYPE_EXPR
/// Define the node class details::NAME of a binary operator whose right hand
/// side is a single value of a base type (int, float etc.) instead of an
/// expression. The node evaluates 'expr[i] OP_SYMBOL value' and its
/// value_type is the value_type of the left hand side expression.
#define DEFINE_BIN_OP_RHS_BASE_TYPE_EXPR(NAME, OP_SYMBOL)                      \
namespace details {                                                            \
    template <typename Operand_t, typename Scalar_t>                           \
    class NAME : public Vec_expr< NAME<Operand_t, Scalar_t> > {                \
        Operand_t _operand;                                                    \
        Scalar_t  _scalar;                                                     \
    public:                                                                    \
        typedef typename Operand_t::value_type value_type;                     \
                                                                               \
        NAME(const Vec_expr<Operand_t>& operand, Scalar_t scalar) :            \
            _operand( operand ),                                               \
            _scalar ( scalar  )                                                \
        { }                                                                    \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return _operand[i] OP_SYMBOL _scalar;                              \
        }                                                                      \
    };                                                                         \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

//------------------------------------------------------------------------------

/// @def DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD
/// Overload the operator 'OP_SYMB' so that 'expression OP_SYMB value', with
/// 'value' of type 'TYPE', automatically builds a details::FUNCTOR node from
/// the expression tree and the value.
/// @param OP_SYMB : symbol of the operator to overload
/// @param TYPE : type of the right hand side value
/// @param FUNCTOR : name of the node class template in namespace details
/// (for instance one defined with DEFINE_BIN_OP_RHS_BASE_TYPE_EXPR)
#define DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(OP_SYMB, TYPE, FUNCTOR)           \
    template <typename Operand_t>                                              \
    const details::FUNCTOR<Operand_t, TYPE>                                    \
    operator OP_SYMB (const Vec_expr<Operand_t>& operand, TYPE scalar)         \
    {                                                                          \
        typedef details::FUNCTOR<Operand_t, TYPE> Node_t;                      \
        return Node_t(operand, scalar);                                        \
    }                                                                          \
    struct Dummy_forward_def // Hack to be able to use ';'

/// @def DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD_ALL
/// Define the node class details::NAME and overload the operator 'SYMBOL' for
/// an expression on the left and, on the right, a value of each handled base
/// type: int, unsigned, long, float and double.
#define DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD_ALL(NAME, SYMBOL)                 \
    /* Node class shared by every overload below */                            \
    DEFINE_BIN_OP_RHS_BASE_TYPE_EXPR(NAME, SYMBOL);                            \
    /* Integral right hand sides */                                            \
    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(SYMBOL, int     , NAME);              \
    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(SYMBOL, unsigned, NAME);              \
    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(SYMBOL, long    , NAME);              \
    /* Floating point right hand sides */                                      \
    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(SYMBOL, float   , NAME);              \
    DEFINE_BIN_OP_RHS_BASE_TYPE_OVERLOAD(SYMBOL, double  , NAME)

//------------------------------------------------------------------------------

/**
 * @class cast_to
 * @brief utility to cast an arithmetic expression to a specific type
 *
 * Sometimes the element type of an expression is not the one of the array
 * receiving the result (float expression stored into an int array, etc.).
 * cast_to< T >::expr() wraps an expression into a node which converts every
 * evaluated element to 'T'. Any conversion is accepted as long as it is valid
 * or correctly overloaded for the element type.
 * An example:
 * @code
 * Custom_array<int> res;
 * Custom_array<float> a, b;
 * res = cast_to< int >::expr( a * b + b / a);
 * @endcode
 *
 * @tparam Cast_t : type every element is converted to
*/
template <typename Cast_t>
struct cast_to {

    /// Node of the expression tree converting to 'Cast_t' each element of the
    /// expression 'Expr_t' it holds (by copy).
    template <typename Expr_t>
    class Casted_expr : public Vec_expr< Casted_expr<Expr_t> > {
        Expr_t _source;
    public:
        typedef Cast_t value_type;

        Casted_expr(const Vec_expr<Expr_t>& source) : _source( source ) { }

        IF_CUDA_DEVICE_HOST
        value_type operator[](int i) const
        {
            return Cast_t( _source[i] );
        }
    };

    /// Build the cast node around the expression 'e'
    template <typename Expr_t>
    static const Casted_expr<Expr_t> expr(const Vec_expr<Expr_t>& e)
    {
        const Expr_t& concrete = static_cast<const Expr_t&>( e );
        return Casted_expr<Expr_t>( concrete );
    }
};

//------------------------------------------------------------------------------

/**
 * @class map
 * @brief utility to map an arithmetic expression with a specific function
 *
 * Use it when a particular function must be applied to every element [i] of
 * an arithmetic expression. The function is described by a class 'Map_t':
 * @code
 * // First define a class which describes the operation for one element
 * struct MyIncr {
 *      typedef int value_type;
 *      IF_CUDA_DEVICE_HOST static
 *      value_type map(int val) { return val+1; }
 * };
 *
 * Custom_array<int> res;
 * Custom_array<int> a, b;
 * res = map< MyIncr >::to( a * b + b / a);
 * @endcode
 *
 * 'Map_t' must define the typedef 'value_type' and a static function named
 * 'map' which takes a single parameter and whose return type must be
 * value_type.
 *
 * @note this utility is not very generic. For instance if you need to define
 * the map T cos(T) with T=float|double|etc. you will have to add a template to
 * the struct and call it with the correct template instantiation.
 * There are more generic utilities below for this particular use case.
 *
 * @tparam Map_t : class describing the operation applied to one element
*/
template <typename Map_t>
struct map {

    /// Node of the expression tree applying Map_t::map() to each element of
    /// the expression 'Expr_t' it holds (by copy).
    template <typename Expr_t>
    class Mapped_expr : public Vec_expr< Mapped_expr<Expr_t> > {
        Expr_t _source;
    public:
        typedef typename Map_t::value_type value_type;

        Mapped_expr(const Vec_expr<Expr_t>& source) : _source( source ) { }

        IF_CUDA_DEVICE_HOST
        value_type operator[](int i) const
        {
            return Map_t::map( _source[i] );
        }
    };

    /// Build the map node around the expression 'e'
    template <typename Expr_t>
    static const Mapped_expr<Expr_t> to(const Vec_expr<Expr_t>& e)
    {
        const Expr_t& concrete = static_cast<const Expr_t&>( e );
        return Mapped_expr<Expr_t>( concrete );
    }
};

//------------------------------------------------------------------------------
// Map shortcuts
//------------------------------------------------------------------------------

/**
 * @def DEFINE_GENERIC_MAP
 * Define a function map over an expression from the function 'fun_name'.
 * The generated map is named 'fun_name_expr' and the element type of its
 * result is the element type of the mapped expression, hence 'fun_name' must
 * have symmetric types (T -> T). As long as 'T fun_name(T)' is defined for
 * every needed type 'T' the expression map 'expr<T> fun_name_expr(expr<T>)'
 * will be correctly defined as well.
 * Example of use:
 * @code
 * // First define your map to treat a single element of the expression:
 * template<typename Real>
 * IF_CUDA_DEVICE_HOST
 * Real incr(Real arg){ return arg + 1; }
 *
 * // The macro will then define the map incr_expr() using 'incr()'
 * DEFINE_GENERIC_MAP(incr);
 *
 * {
 *      // Use your map on any type handled by 'incr()'
 *      Custom_array<double> a, b;
 *      Custom_array<float>  c, d;
 *      a = incr_expr( b );
 *      c = incr_expr( d );
 * }
 * @endcode
 */
#define DEFINE_GENERIC_MAP(fun_name)                                           \
namespace details {                                                            \
    template <typename Arg_t>                                                  \
    class Map_functor_gen_##fun_name :                                         \
        public Vec_expr< Map_functor_gen_##fun_name<Arg_t> >                   \
    {                                                                          \
        Arg_t _arg;                                                            \
    public:                                                                    \
        typedef typename Arg_t::value_type value_type;                         \
                                                                               \
        Map_functor_gen_##fun_name(const Vec_expr<Arg_t>& arg) :               \
            _arg( arg )                                                        \
        { }                                                                    \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return fun_name( _arg[i] );                                        \
        }                                                                      \
    };                                                                         \
}                                                                              \
                                                                               \
template <typename Arg_t>                                                      \
const details::Map_functor_gen_##fun_name< Arg_t >                             \
CUDA_UTILS_JOIN(fun_name,_expr)( const Vec_expr<Arg_t>& arg )                  \
{                                                                              \
    return details::Map_functor_gen_##fun_name< Arg_t >( arg );                \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

// -----------------------------------------------------------------------------
// Define clamp
// -----------------------------------------------------------------------------

/// @def DEFINE_GENERIC_MAP_2ARGS
/// Define the expression map 'fun_name_expr(expr, a, b)' which evaluates
/// 'fun_name(expr[i], a, b)' for each element [i]: an operator of the form
/// real fun(vec<real>[i], real, real).
/// The element type of the result is the element type of the mapped
/// expression: the return type must be generically the same as the args.
#define DEFINE_GENERIC_MAP_2ARGS(fun_name)                                     \
namespace details {                                                            \
    template <typename Arg_t, typename Extra1_t, typename Extra2_t>            \
    class Map_functor_2args##fun_name :                                        \
        public Vec_expr<                                                       \
            Map_functor_2args##fun_name<Arg_t, Extra1_t, Extra2_t> >           \
    {                                                                          \
        Arg_t    _arg;                                                         \
        Extra1_t _extra1;                                                      \
        Extra2_t _extra2;                                                      \
    public:                                                                    \
        typedef typename Arg_t::value_type value_type;                         \
                                                                               \
        Map_functor_2args##fun_name(const Vec_expr<Arg_t>& arg,                \
                                    Extra1_t extra1,                           \
                                    Extra2_t extra2) :                         \
            _arg   ( arg    ),                                                 \
            _extra1( extra1 ),                                                 \
            _extra2( extra2 )                                                  \
        { }                                                                    \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return fun_name( _arg[i], _extra1, _extra2 );                      \
        }                                                                      \
    };                                                                         \
}                                                                              \
                                                                               \
template <typename Arg_t, typename Extra1_t, typename Extra2_t>                \
const details::Map_functor_2args##fun_name< Arg_t, Extra1_t, Extra2_t >        \
CUDA_UTILS_JOIN(fun_name,_expr)                                                \
( const Vec_expr<Arg_t>& arg, Extra1_t a, Extra2_t b )                         \
{                                                                              \
    typedef details::Map_functor_2args##fun_name< Arg_t, Extra1_t, Extra2_t >  \
        Node_t;                                                                \
    return Node_t( arg, a, b );                                                \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'



// -----------------------------------------------------------------------------
// Ternary op
// -----------------------------------------------------------------------------

namespace details {
    /// Node of the expression tree selecting, element by element, between two
    /// expressions according to a boolean expression:
    /// node[i] == condition[i] ? if_true[i] : if_false[i]
    /// The three operands are copied into the node.
    /// @tparam Cond_t : type of the tested expression (elements must be bool)
    /// @tparam Then_t : type of the expression chosen when the test is true
    /// @tparam Else_t : type of the expression chosen when the test is false
    /// (its elements must have the same type as the ones of 'Then_t')
    template <typename Cond_t, typename Then_t, typename Else_t>
    class Vec_ternary_expr :
        public Vec_expr< Vec_ternary_expr<Cond_t, Then_t, Else_t> >
    {
        Cond_t _condition;
        Then_t _if_true;
        Else_t _if_false;
    public:
        typedef typename Cond_t::value_type value_type0;
        typedef typename Then_t::value_type value_type1;
        typedef typename Else_t::value_type value_type2;
        typedef value_type1                 value_type;

        Vec_ternary_expr(const Vec_expr<Cond_t>& condition,
                         const Vec_expr<Then_t>& if_true,
                         const Vec_expr<Else_t>& if_false) :
            _condition( condition ),
            _if_true  ( if_true   ),
            _if_false ( if_false  )
        {
            // Both branches have to produce the same element type
            CUDA_STATIC_ASSERT( (is_same<value_type1, value_type2>::value) );
            // The tested expression has to produce booleans
            CUDA_STATIC_ASSERT( (is_same<value_type0, bool       >::value) );
        }

        IF_CUDA_DEVICE_HOST
        value_type operator[](int i) const
        {
            return _condition[i] ? _if_true[i] : _if_false[i];
        }
    };
}

/// Build a ternary expression: calling ternary_expr(a, b, c) is the lazy,
/// element wise equivalent of the traditional (a ? b : c)
/// @param bool_expr : expression tested for each element
/// @param ret1 : expression evaluated where 'bool_expr' is true
/// @param ret2 : expression evaluated where 'bool_expr' is false
template <typename Cond_t, typename Then_t, typename Else_t>
const details::Vec_ternary_expr< Cond_t, Then_t, Else_t >
ternary_expr( const Vec_expr<Cond_t>& bool_expr,
              const Vec_expr<Then_t>& ret1,
              const Vec_expr<Else_t>& ret2)
{
    typedef details::Vec_ternary_expr< Cond_t, Then_t, Else_t > Node_t;
    return Node_t( bool_expr, ret1, ret2 );
}

// -----------------------------------------------------------------------------

/// @def DEFINE_MAP
/// Define the expression map 'fun_name_expr(expr)' which evaluates
/// 'fun_name(expr[i])' for each element [i] and whose element type is
/// 'ret_type'. The element type of the mapped expression must match the
/// argument type of 'fun_name'.
#define DEFINE_MAP(fun_name, ret_type)                                         \
namespace details {                                                            \
    template <typename Arg_t>                                                  \
    class Vec_##fun_name : public Vec_expr< Vec_##fun_name<Arg_t> > {          \
        Arg_t _arg;                                                            \
    public:                                                                    \
        typedef ret_type value_type;                                           \
                                                                               \
        Vec_##fun_name(const Vec_expr<Arg_t>& arg) : _arg( arg ) { }           \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return fun_name( _arg[i] );                                        \
        }                                                                      \
    };                                                                         \
}                                                                              \
                                                                               \
template <typename Arg_t>                                                      \
const details::Vec_##fun_name<Arg_t>                                           \
CUDA_UTILS_JOIN(fun_name,_expr)( const Vec_expr<Arg_t>& arg )                  \
{                                                                              \
    return details::Vec_##fun_name<Arg_t>( arg );                              \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

//------------------------------------------------------------------------------
// Define binary functions
//------------------------------------------------------------------------------

/// @def DEFINE_BIN_OP_FUNC
/// Define the binary expression function 'fun_name_expr(expr1, expr2)' which
/// evaluates 'fun_name(expr1[i], expr2[i])' for each element [i] and whose
/// element type is 'ret_type'.
#define DEFINE_BIN_OP_FUNC(fun_name, ret_type)                                 \
namespace details {                                                            \
    template <typename Lhs_t, typename Rhs_t>                                  \
    class Vec_##fun_name :                                                     \
        public Vec_expr< Vec_##fun_name<Lhs_t, Rhs_t> >                        \
    {                                                                          \
        Lhs_t _lhs;                                                            \
        Rhs_t _rhs;                                                            \
    public:                                                                    \
        typedef ret_type value_type;                                           \
                                                                               \
        Vec_##fun_name(const Vec_expr<Lhs_t>& lhs,                             \
                       const Vec_expr<Rhs_t>& rhs) :                           \
            _lhs( lhs ),                                                       \
            _rhs( rhs )                                                        \
        { }                                                                    \
                                                                               \
        IF_CUDA_DEVICE_HOST                                                    \
        value_type operator[](int i) const                                     \
        {                                                                      \
            return fun_name( _lhs[i], _rhs[i] );                               \
        }                                                                      \
    };                                                                         \
}                                                                              \
                                                                               \
template <typename Lhs_t, typename Rhs_t>                                      \
const details::Vec_##fun_name<Lhs_t, Rhs_t>                                    \
CUDA_UTILS_JOIN(fun_name,_expr)( const Vec_expr<Lhs_t>& lhs,                   \
                                 const Vec_expr<Rhs_t>& rhs )                  \
{                                                                              \
    return details::Vec_##fun_name<Lhs_t, Rhs_t>( lhs, rhs );                  \
}                                                                              \
struct Dummy_forward_def // Hack to be able to use ';'

//------------------------------------------------------------------------------

// TODO: unary operators '!' ''

#ifdef __CUDACC__
/// Cuda kernel to evaluate a vector expression 'Expr_t' on GPU
///
/// Every thread evaluates the elements p, p + T, p + 2T, ... where p is its
/// global index along x and T the total number of launched threads
/// (grid-stride loop). All 'size' elements are therefore evaluated whatever
/// the launch configuration is; a grid with at least 'size' threads makes
/// each thread evaluate at most one element.
/// Only the x dimension of the grid and of the blocks is used. No shared
/// memory is needed.
///
/// @tparam Expr_t : the expression to evaluate
/// (for instance (a + x/5 - d) with a, x and d arrays of same size),
/// it must overload the operator [] which returns the value of the expression
/// For instance: to_eval[i] is equivalent to (a[i] + x[i]/5 - d[i]).
/// In addition Expr_t must define the typedef Expr_t::value_type
/// which is the final type of the expression. For instance if we consider
/// (mat * vec) with mat an array of matrices and vec an array of vectors we
/// usually expect the returned type Expr_t::value_type of (mat[i] * vec[i])
/// to be a vector.
/// @param d_input : device array receiving the result of the expression
/// @param to_eval : static tree of the expression defined with template
/// metaprogramming. It must define the operator [] and the typedef
/// Expr_t::value_type which is the return type of [].
/// @param size : number of elements of 'd_input' and 'to_eval'.
template <typename Expr_t>
__global__
void compute_expr_kernel(typename Expr_t::value_type* d_input,
                         Expr_t to_eval,
                         int size)
{
    // Indices are computed with 64 bits integers: with 32 bits the global
    // index of the last threads could wrap around for sizes close to INT_MAX
    // and wrongly pass the bound check.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long p = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for( ; p < size; p += stride){
        // p < size here, so it always fits in an int
        const int i = static_cast<int>(p);
        d_input[i] = to_eval[i];
    }
}
#endif

}// END CUDA_UTILS NAMESPACE ===================================================

#include "cuda_utils_vec_expr.inl"

#endif // VEC_EXPR_HPP__
