/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2018 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/*! \file wielding.h

    \brief Implementation of vspline::transform

    wielding.h provides code to process all 1D subarrays of nD views.
    This is similar to using vigra::Navigator, which also iterates over
    1D subarrays of nD arrays. Here, this access is hand-coded to have
    complete control over the process, and to work with range-based
    code rather than the iterator-based approach vigra uses.
    
    The code is structured so that separable aspects of the process
    are coded as separate entities:
    
    the top-level object in the wielding code is class wield.
    class wield offers operator() overloads taking information about
    the data which are to be processed, and std::functions defining
    the specific processing which is intended for the 1D subarrays.
    When one of wield's operator() overloads is called, it iterates
    over the 1D subarrays, calling the std::function for each subarray
    in turn - the std::function is used as a callback function.
    
    Once inside the callback function, what's now seen is a specific
    1D subarray (or a pair of them, when two arrays are processed
    in sync), plus any additional information specifically needed
    by the callback function, like the starting index in the nD
    array, which is needed for index-based transforms.
    
    The callback 'functions' passed to the wield object in this body
    of code are actually functors. They are set up to 'contain' an
    adapted vspline::unary_functor, which is capable of processing
    data contained in the arrays.
    
    If vectorization is not used, the processing is trivial: it 'collapses'
    to a simple traversal of the 1D subarray(s), using the unvectorized
    evaluation code in the vspline::unary_functor. But the whole point
    of 'aggregation' is to feed the *vectorized* evaluation code:
    
    Here, the data are reworked to be suited for vectorized processing.
    This is done by copying incoming data into a small buffer, using
    techniques like SIMD gathering, SIMD loads and possibly Vc-provided
    deinterleaving, then processing the buffer with vectorized code,
    and finally writing the result back to target memory using the
    reverse operations: SIMD scatters or stores, or Vc's interleaving
    code. The 'magic' is that all of this is transparent to calling
    code: to the caller it's merely a call into code processing arrays
    of data, and all the complex buffering and unbuffering is done
    in a 'black box', encapsulated in class wield and the callback
    functions.
    
    If Vc is used, the code provides specialized routines for cases
    where Vc can speed things up. Without Vc, this code will not be
    compiled (it's inside #ifdef USE_VC ... #endif preprocessor
    statements). Without Vc, the code will still be vectorized by
    a technique I call 'goading': The data are repackaged into small
    SoAs with vector-friendly array sizes and the expectation is that
    the compiler will recognize that the resulting inner loops are
    candidates for autovectorization. Using this technique has the
    advantage that - if the compiler 'gets it' - code will be generated
    for every target the *compiler* can produce autovectorized code for,
    rather than being limited to what Vc covers. And since the Vc types
    may mystify the compiler, not using them may also allow the compiler
    to optimize the code better. The 'goading' is done by using a 'mock'
    SIMD type (vspline::simd_tv, derived from vigra::TinyVector).
    The actual SIMD or pseudo-SIMD data types used by the wielding code
    are not fixed, though - what's used is inferred from the functor
    passed to the wielding code, and the idea is to widen the feeding
    spectrum simply to other vectorized data types. If there is no
    specialized code for these types (like the Vc code for Vc data),
    there are only very few requirements for these types and adapting
    to new variants should be simple. TODO: concretize interface
    
    After the aggregation code, wielding.h provides two functions
    using the mechanism described above to process arrays of data.
    These functions (index_wield and coupled_wield) take care of
    multithreading the operation and feeding subranges of the data
    to wield objects. They are used in turn to implement 'transform'
    routines, which are the top-level code user code calls. These
    top-level routines take care of argument checking and presenting
    the arguments to the wielding code in the form it needs. That
    code is in transform.h.
    
    So by now, the use of the term 'wielding' should be obvious.
    We have a 'tool', namely the vspline::unary_functor, and we have
    data on which we intend to use the unary_functor. What's left to
    do? Wielding the tool! And since this operation can be factored
    out, I've done so and labeled it the 'wielding' code. There is
    another place in vspline which also provides 'wielding' code:
    it's the code in filter.h, which is used to 'wield' specific
    digital filters (like convolution or b-spline prefiltering),
    applying them to arrays of data. The requirements there are
    quite different from the requirements here, so these two bodies
    of wielding code are separate, but the design method is the same:
    we use two conceptual entities, the tool and its use.
    
    The 'magic' in vspline's wielding code is the automatic
    multithreading and vectorization, which is done transparently and
    makes the code fast. But seen from the outside, by a caller of
    one of the 'transform' functions, all the complexity is hidden.
    And, at the same time, if code is needed for targets which can't
    use vector code or multithreading, enabling or disabling these
    capabilities is as simple as passing a preprocessor definition
    to the compiler.
*/

#include "vspline.h"

namespace wielding
{

template < int dimension , class in_type , class out_type = in_type >
struct wield
{
  typedef vigra::MultiArrayView < dimension , in_type > in_view_type ;
  typedef vigra::MultiArrayView < dimension , out_type > out_view_type ;
  typedef typename in_view_type::difference_type shape_type ;
  typedef typename in_view_type::difference_type_1 index_type ;
  
  // wielding, using two arrays. It's assumed that both arrays have
  // the same shape. The std::function takes a pointer and stride
  // for each array and will be matched by a coupled_aggregator.
  // Note how the first view is taken by const&, indicating that
  // it can not be modified. Only the second view, the target of
  // the operation, is non-const.
  // The aggregator is taken as a std::function of this type:

  typedef std::function < void ( const in_type * ,
                                 index_type ,
                                 out_type *,
                                 index_type ,
                                 index_type )
                        > coupled_aggregator_type ;
  
  void operator() ( const in_view_type & in_view ,
                    out_view_type & out_view ,
                    coupled_aggregator_type func ,
                    const shape_type & begin ,
                    const shape_type & end ,
                    int axis
                  )
  {
    auto stride1 = in_view.stride ( axis ) ;
    auto segment1 = in_view.subarray ( begin , end ) ;
    auto slice1 = segment1.bindAt ( axis , 0 ) ;
    
    auto stride2 = out_view.stride ( axis ) ;
    auto segment2 = out_view.subarray ( begin , end ) ;
    auto slice2 = segment2.bindAt ( axis , 0 ) ;
    
    auto iter1 = slice1.begin() ;
    auto _end1 = slice1.end() ;
    
    auto iter2 = slice2.begin() ;
    auto _end2 = slice2.end() ;
    
    assert ( in_view.shape() == out_view.shape() ) ;
    auto length = segment1.shape ( axis ) ;    
    
    while ( iter1 != _end1 )
    {
      func ( &(*iter1) , stride1 , &(*iter2) , stride2 , length ) ;
      ++iter1 ;
      ++iter2 ;
    }
  }
  
  // overload of operator() which will work with an object
  // of type indexed_aggregator for the std::function it expects. This
  // object presents the nD index into the target array as input to its'
  // inner functor, which produces the output from this nD index, rather
  // than looking at the array (which is only written to).
  // The view coming in is non-const and will receive the result data.
  // The aggregator is taken as a std::function of this type:
  
  typedef std::function < void ( const shape_type & ,
                                 int ,
                                 in_type * ,
                                 index_type ,
                                 index_type )
                        > indexed_aggregator_type ;
                                 
  void operator() ( in_view_type & view ,
                    indexed_aggregator_type func ,
                    const shape_type & begin ,
                    const shape_type & end ,
                    int axis
                  )
  {
    auto stride = view.stride ( axis ) ;
    auto length = end[axis] - begin[axis] ;
    
    auto slice_shape = end - begin ; // shape of whole subarray
    slice_shape[axis] = 1 ;          // shape of slice with start positions
    
    // we iterate over the coordinates in slice_shape. This produces
    // nD indexes into the view's subarray from 'begin' to 'end', so we
    // need to offset the indexes with 'begin' to receive indexes
    // into the view itself.
    // vigra::MultiCoordinateIterator's two-argument constructor
    // has a bug, which prevents me from coding the straightforward
    // mci_type it ( begin , end ).
    
    typedef vigra::MultiCoordinateIterator
            < in_view_type::actual_dimension > mci_type ;

    mci_type it ( slice_shape ) , e = it.getEndIterator() ;
    
    while ( it != e )
    {
      auto s = begin + *it ;
      func ( s , axis , &(view[s]) , stride , length ) ;
      ++it ;
    }
  }
} ;

/// specialization for 1D arrays. Here, the aggregation functions
/// are called directly for their share of the data defined by
/// 'begin' and 'end'

template < class in_type , class out_type >
struct wield < 1 , in_type , out_type >
{
  enum { dimension = 1 } ;
  
  typedef vigra::MultiArrayView < dimension , in_type > in_view_type ;
  typedef vigra::MultiArrayView < dimension , out_type > out_view_type ;
  typedef typename in_view_type::difference_type shape_type ;
  typedef typename in_view_type::difference_type_1 index_type ;
  
  typedef std::function < void ( const in_type * ,
                                 index_type ,
                                 out_type *,
                                 index_type ,
                                 index_type )
                        > coupled_aggregator_type ;
  
  void operator() ( const in_view_type & in_view ,
                    out_view_type & out_view ,
                    coupled_aggregator_type func ,
                    const shape_type & begin ,
                    const shape_type & end ,
                    int axis
                  )
  {
    auto segment1 = in_view.subarray ( begin , end ) ;
    auto stride1 = in_view.stride ( axis ) ;
    auto length = segment1.shape ( axis ) ;    
    
    auto segment2 = out_view.subarray ( begin , end ) ;
    auto stride2 = out_view.stride ( axis ) ;
    
    assert ( in_view.shape() == out_view.shape() ) ;
    
    func ( segment1.data() , stride1 , segment2.data() , stride2 , length ) ;
  }
  
  typedef std::function < void ( const shape_type & ,
                                 int ,
                                 in_type * ,
                                 index_type ,
                                 index_type )
                        > indexed_aggregator_type ;
                                 
  void operator() ( in_view_type & view ,
                    indexed_aggregator_type func ,
                    const shape_type & begin ,
                    const shape_type & end ,
                    int axis
                  )
  {
    auto segment = view.subarray ( begin , end ) ;
    auto stride = view.stride ( axis ) ;
    auto length = end[axis] - begin[axis] ;
    
    func ( begin , axis , segment.data() , stride , length ) ;
  }
} ;

#ifdef USE_VC

namespace detail
{

// Here we have some collateral code to use Vc's InterleavedMemoryWrapper.
// This is a specialized way of accessing interleaved but unstrided data,
// which uses several SIMD loads, then reshuffles the data. This should
// be quicker than using a set of gather operations.

// fetch of interleaved, but unstrided data located at _data
// into a TinyVector of vspline::simdized_types using InterleavedMemoryWrapper.
// uses SimdArrays containing K full hardware SIMD Vectors

// Parameters of 'fetch':
// v     - target: N simdized values with K * Vc::Vector<T>::size()
//         lanes each, one per channel
// _data - base address of the interleaved source data, handed to the
//         Vc::InterleavedMemoryWrapper
// sz    - offset added to the index used with the wrapper's operator[]
//         (presumably counted in TinyVector<T,N> units - TODO confirm
//         against the Vc documentation)
// the trailing index_sequence argument is unnamed, it only serves to
// deduce 'seq', which is used to address the channels of 'v'. The
// caller in aggregator_base passes Vc::make_index_sequence<chn>().

template < typename T , size_t N , size_t K , size_t ... seq >
void fetch ( vigra::TinyVector
             < vspline::simdized_type < T , K * Vc::Vector<T>::size() > , N > & v ,
             const vigra::TinyVector < T , N > * _data ,
             const size_t & sz ,
             Vc::index_sequence < seq ... > )
{
  const Vc::InterleavedMemoryWrapper < const vigra::TinyVector < T , N > ,
                                       Vc::Vector<T> > data ( _data ) ;

  // as_v1_type is a type holding nv Vc::Vector<T> in a TinyVector.
  // We reinterpret the incoming reference to have as_v1_type
  // as value_type - instead of an equally-sized SimdArray. With
  // this interpretation of the data we can use the
  // InterleavedMemoryWrapper, which operates on Vc::Vectors
  // only.

  // KFJ 2018-02-20 given VS as the size of a Vc::Vector<T>, I had initially
  // coded as if a SimdArray<T,VS*K> had a size of VS*K, so just as much as
  // K Vc::Vector<T> occupy. This is not necessarily so, the SimdArray may
  // be larger. Hence this additional bit of size arithmetic to make the
  // reinterpret_cast below succeed for all K: nv is the number of
  // Vc::Vectors which occupy the same space as the SimdArray.

  enum { nv =   sizeof ( vspline::simdized_type < T , K * Vc::Vector < T > :: size() > )
              / sizeof ( Vc::Vector < T > ) } ;
                    
  typedef typename vigra::TinyVector < Vc::Vector < T > , nv > as_v1_type ;
  typedef typename vigra::TinyVector < as_v1_type , N > as_vn_type ;
  
  // NOTE(review): this overlay relies on the simdized type storing its
  // first K hardware vectors contiguously at its start - confirm.

  as_vn_type & as_vn = reinterpret_cast < as_vn_type & > ( v ) ;
  
  // we fill the SimdArrays in as_vn round-robin: iteration k loads the
  // k-th hardware vector of every channel. Note the use of
  // Vc::tie - this makes the transition effortless.
  
  for ( size_t k = 0 ; k < K ; k++ )
  {
    Vc::tie ( as_vn [ seq ] [ k ] ... )
      = ( data [ sz + k * Vc::Vector<T>::size() ] ) ;
  }
}

// 'stash' writes a TinyVector of simdized values to interleaved, but
// unstrided memory located at _data, using Vc's InterleavedMemoryWrapper.
// Parameters:
// v     - source: N simdized values with K * Vc::Vector<T>::size()
//         lanes each, one per channel
// _data - base address of the interleaved target memory, handed to the
//         Vc::InterleavedMemoryWrapper
// sz    - offset added to the index used with the wrapper's operator[]
//         (presumably counted in TinyVector<T,N> units - TODO confirm
//         against the Vc documentation)
// the trailing index_sequence argument is unnamed, it only serves to
// deduce 'seq', which is used to address the channels of 'v'. The
// caller in aggregator_base passes Vc::make_index_sequence<chn>().

template < typename T , size_t N , size_t K , size_t ... seq >
void stash ( const vigra::TinyVector
             < vspline::simdized_type < T , K * Vc::Vector<T>::size() > , N > & v ,
             vigra::TinyVector < T , N > * _data ,
             const size_t & sz ,
             Vc::index_sequence < seq ... > )
{
  Vc::InterleavedMemoryWrapper < vigra::TinyVector < T , N > ,
                                 Vc::Vector<T> > data ( _data ) ;
  
  // we reinterpret the incoming reference to have as_v1_type (a
  // TinyVector of nv Vc::Vector<T>) as value_type, instead of an
  // equally-sized SimdArray, because the InterleavedMemoryWrapper
  // operates on Vc::Vectors only.

  // KFJ 2018-02-20 given VS as the size of a Vc::Vector<T>, I had initially
  // coded as if a SimdArray<T,VS*K> had a size of VS*K, so just as much as
  // K Vc::Vector<T> occupy. This is not necessarily so, the SimdArray may
  // be larger. Hence this additional bit of size arithmetic to make the
  // reinterpret_cast below succeed for all K: nv is the number of
  // Vc::Vectors which occupy the same space as the SimdArray.

  enum { nv =   sizeof ( vspline::simdized_type < T , K * Vc::Vector < T > :: size() > )
              / sizeof ( Vc::Vector < T > ) } ;
                    
  typedef typename vigra::TinyVector < Vc::Vector < T > , nv > as_v1_type ;
  typedef typename vigra::TinyVector < as_v1_type , N > as_vn_type ;
  
  // NOTE(review): this overlay relies on the simdized type storing its
  // first K hardware vectors contiguously at its start - confirm.

  const as_vn_type & as_vn = reinterpret_cast < const as_vn_type & > ( v ) ;
  
  // we unpack the SimdArrays in as_vn round-robin: iteration k stores
  // the k-th hardware vector of every channel. Note, again, the use
  // of Vc::tie - I found no other way to assign to data[...] at all.
  
  for ( size_t k = 0 ; k < K ; k++ )
  {
    data [ sz + k * Vc::Vector<T>::size() ]
      = Vc::tie ( as_vn [ seq ] [ k ] ... ) ;
  }
}

} ; // end of namespace detail

#endif // #ifdef USE_VC

/// aggregator_base has the formulation of the types involved
/// in aggregation and the two methods 'bunch' and 'fluff', which
/// move data from (possibly strided) interleaved memory to a buffer
/// structured as vector-friendly SoA and back.

// template arguments of aggregator_base:
// vsz     - number of values per channel in one vectorized package;
//           used as the SimdArray size in the Vc overloads and as the
//           trip count of the buffering loops in the generic overloads
// ic_type - integral type used for strides
// All members are protected: the routines are meant to be used by
// derived aggregator classes, which pull them in with using
// declarations.

template < size_t vsz , typename ic_type >
struct aggregator_base
{
protected:
  
#ifdef USE_VC
  
  // here we have the versions of bunch and fluff using specialized
  // Vc operations to access the buffer. These routines take Vc data
  // types, and they are only present if USE_VC is defined at all.
  // Further down we have less specific signatures which will be chosen
  // if either Vc is not used at all or if the data types passed are
  // not Vc types.

  /// bunch picks up data from interleaved, strided memory and stores
  /// them in a data type representing a package of vector data.
  
  /// The first overload of 'bunch' uses a gather operation to obtain
  /// the data from memory. This overload is used if the source data
  /// are strided and are therefore not contiguous in memory. It's
  /// also used if unstrided data are multi-channel and the vector width
  /// is not a multiple of the hardware vector width, because I haven't
  /// fully implemented using Vc::InterleavedMemoryWrapper for SimdArrays.
  /// This first routine can be used for all situations, the two overloads
  /// below are optimizations, increasing performance for specific
  /// cases.
  
  template < typename ele_type , int chn >
  void bunch ( const vigra::TinyVector < ele_type , chn > * & src ,
               vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , chn > & trg ,
               const ic_type & stride ) const
  {
   // gather indexes, counted in ele_type units: consecutive
   // TinyVectors are stride * chn fundamentals apart.
   typedef typename Vc::SimdArray < ele_type , vsz > :: index_type index_type ;
   index_type ix = index_type::IndexesFromZero() * stride * chn ;
    
    // one gather per channel, channel d starts d fundamentals
    // after the first TinyVector's base address.
    for ( int d = 0 ; d < chn ; d++ )
      trg[d].gather ( ((ele_type*)src) + d , ix ) ;
  }
  
  /// overload for unstrided single-channel data.
  /// here we can use an SIMD load, the implementation is very
  /// straightforward, and the performance gain is large.
  /// The unnamed std::true_type argument is a dispatch tag only.
  
  template < typename ele_type >
  void bunch ( const vigra::TinyVector < ele_type , 1 > * & src ,
               vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , 1 > & trg ,
               std::true_type
             ) const
  {
    trg[0].load ( (const ele_type*) src ) ;
  }

  /// the third overload, which is only enabled if vsz is a multiple
  /// of the SIMD vector capacity, delegates to detail::fetch, which
  /// handles the data acquisition with a Vc::InterleavedMemoryWrapper.
  /// This overload is only for unstrided multichannel data.
  /// The unnamed std::false_type argument is a dispatch tag only.

  template < typename ele_type , int chn >
  typename std::enable_if < vsz % Vc::Vector<ele_type>::size() == 0 > :: type 
  bunch ( const vigra::TinyVector < ele_type , chn > * & src ,
          vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , chn > & trg ,
          std::false_type
        ) const
  {
    // K is the number of hardware vectors per channel
    enum { K = vsz / Vc::Vector<ele_type>::size() } ;
    
    detail::fetch < ele_type , chn , K >
      ( trg , src , 0 , Vc::make_index_sequence<chn>() ) ;
  }
  
  /// reverse operation: a package of vectorized data is written to
  /// interleaved, strided memory. We have the same sequence
  /// of overloads as for 'bunch'. This first overload uses one
  /// scatter per channel and can be used for all situations.
  
  template < typename ele_type , int chn >
  void fluff ( vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , chn > & src ,
               vigra::TinyVector < ele_type , chn > * & trg ,
               const ic_type & stride ) const
  {
    // scatter indexes, counted in ele_type units, as in 'bunch'
    typedef typename Vc::SimdArray < ele_type , vsz > :: index_type index_type ;
    index_type ix = index_type::IndexesFromZero() * stride * chn ;

    for ( int d = 0 ; d < chn ; d++ )
      src[d].scatter ( ((ele_type*)trg) + d , ix ) ;
  }

  /// overload for unstrided single-channel data, using a SIMD store.
  /// The unnamed std::true_type argument is a dispatch tag only.

  template < typename ele_type >
  void fluff ( vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , 1 > & src ,
               vigra::TinyVector < ele_type , 1 > * & trg ,
               std::true_type
             ) const
  {
    src[0].store ( (ele_type*) trg ) ;
  }

  /// overload for unstrided multichannel data, only enabled if vsz
  /// is a multiple of the SIMD vector capacity. Delegates to
  /// detail::stash, which uses a Vc::InterleavedMemoryWrapper.
  /// The unnamed std::false_type argument is a dispatch tag only.

  template < typename ele_type , int chn >
  typename std::enable_if < vsz % Vc::Vector<ele_type>::size() == 0 > :: type 
  fluff ( vigra::TinyVector < Vc::SimdArray < ele_type , vsz > , chn > & src ,
          vigra::TinyVector < ele_type , chn > * & trg ,
          std::false_type
        ) const
  {
    // K is the number of hardware vectors per channel
    enum { K = vsz / Vc::Vector<ele_type>::size() } ;
    
    detail::stash < ele_type , chn , K >
      ( src , trg , 0 , Vc::make_index_sequence<chn>() ) ;
  }
  
#endif // USE_VC
  
  // when not processing Vc data , bunch and fluff use simple loops
  // for buffering and unbuffering, or SIMD load/store operations if
  // the data are single-channel and unstrided.

  /// generic 'bunch': copies vsz TinyVectors, which are 'stride'
  /// TinyVectors apart, into trg, so that trg[d][e] receives channel d
  /// of the e-th TinyVector. Note that src is taken by value here,
  /// so advancing it inside the loop does not affect the caller's
  /// pointer.

  template < typename target_type , typename ele_type , int chn >
  void bunch ( const vigra::TinyVector < ele_type , chn > * src ,
               target_type & trg ,
               const ic_type & stride ) const
  {
    for ( int e = 0 ; e < vsz ; e++ )
    {
      for ( int d = 0 ; d < chn ; d++ )
      {
        trg[d][e] = (*src)[d] ;
      }
      src += stride ;
    }
  }

  // data are unstrided and single-channel, issue a SIMD load operation.
  // The unnamed std::true_type argument is a dispatch tag only.
  
  template < typename target_type , typename ele_type >
  void bunch ( const vigra::TinyVector < ele_type , 1 > * & src ,
               target_type & trg ,
               std::true_type
             ) const
  {
    // safeguard against improper use
    static_assert
    ( target_type::static_size == 1 ,
      "this variant of bunch must not be called with multichannel data" ) ;
    
    // NOTE(review): the C-style cast drops the const qualifier of src,
    // whereas the Vc overload above casts to const ele_type* - check
    // whether target_type's load() accepts a pointer to const.
    trg[0].load ( (ele_type*) src ) ;    
  }

  /// generic 'fluff': reverse operation of the generic 'bunch'. Writes
  /// the vsz values per channel held in src to vsz TinyVectors which
  /// are 'stride' TinyVectors apart. trg is taken by value, so
  /// advancing it inside the loop does not affect the caller's pointer.

  template < typename ele_type , typename source_type , int chn >
  void fluff ( source_type & src ,
               vigra::TinyVector < ele_type , chn > * trg ,
               const ic_type & stride ) const
  {
    for ( int e = 0 ; e < vsz ; e++ )
    {
      for ( int d = 0 ; d < chn ; d++ )
      {
        (*trg)[d] = src[d][e] ;
      }
      trg += stride ;
    }
  }
    
  // data are unstrided and single-channel, issue a SIMD store.
  // The unnamed std::true_type argument is a dispatch tag only.
  
  template < typename ele_type , typename source_type >
  void fluff ( source_type & src ,
               vigra::TinyVector < ele_type , 1 > * & trg ,
               std::true_type
             ) const
  {
    // safeguard against improper use
    static_assert
    ( source_type::static_size == 1 ,
      "this variant of fluff must not be called with multichannel data" ) ;
      
    src[0].store ( (ele_type*) trg ) ;    
  }

} ;

// we have two concrete aggregators derived from aggregator_base:
// indexed_aggregator and coupled_aggregator.

/// indexed_aggregator receives the start coordinate and processing axis
/// along with the data to process, this is meant for index-transforms.
/// The coordinate is updated for every call to the 'inner' functor
/// so that the inner functor has the current coordinate as input.
/// The code in this template will only be used for vectorized operation,
/// without vectorization, only the specialization for vsize == 1 below
/// is used.

template < size_t vsz , typename ic_type , class functor_type ,
           typename = std::enable_if < ( vsz > 1 ) > >
struct indexed_aggregator
: public aggregator_base < vsz , ic_type >
{
  typedef typename functor_type::in_type in_type ;
  typedef typename functor_type::in_ele_type in_ele_type ;
  
  typedef typename functor_type::out_type out_type ;
  typedef typename functor_type::out_ele_type out_ele_type ;
  
  enum { dim_in = functor_type::dim_in } ;
  enum { dim_out = functor_type::dim_out } ;

  typedef aggregator_base < vsz , ic_type > base_type ;
  
  using base_type::fluff ;
  
  // the 'inner' functor, which is held by value (a const copy of
  // the functor passed to the constructor)
  
  const functor_type functor ;
  
  // get the data types the functor expects
  
  typedef typename functor_type::in_v in_v ;
  typedef typename functor_type::out_v out_v ;
  typedef typename functor_type::in_ele_v in_ele_v ;
  typedef typename functor_type::out_ele_v out_ele_v ;
  
  indexed_aggregator ( const functor_type & _functor )
  : functor ( _functor )
  { }
  
  /// process one 1D subarray.
  /// crd    - nD index of the first element of the 1D subarray
  /// axis   - processing axis, the component of crd which is
  ///          counted up as we go along
  /// trg    - pointer to the first target element
  /// stride - distance between consecutive target elements,
  ///          in units of out_type
  /// length - number of elements to process; values <= 0 result
  ///          in no processing at all
  ///
  /// note how we use the functor's in_type as the coordinate type,
  /// rather than using a TinyVector of some integral type. This way
  /// we have the index already in the type needed by the functor and
  /// arithmetic on the coordinate uses this type as well. The actual
  /// call passes an integral type; if in_type is real, this overload
  /// is nevertheless picked and the argument converted to the real
  /// coordinate type.

  void operator() ( in_type crd ,
                    int axis ,
                    out_type * trg ,
                    ic_type stride ,
                    ic_type length )
  {
    // all counting and offset arithmetic is done in ic_type. Mixing
    // in vsz as a size_t would silently make these expressions
    // unsigned: a negative stride would then be turned into a huge
    // unsigned pointer offset, and a negative length into a huge
    // number of iterations.

    const ic_type lanes = static_cast < ic_type > ( vsz ) ;
    const ic_type aggregates = length / lanes ;
    const ic_type leftover = length - aggregates * lanes ;
    
    // the buffer and the md coordinate are created as the data types which
    // the functor expects. They can be filled with specific vector code
    // or alternatively we can use 'goading' to leave this trivial operation
    // to the compiler's autovectorization.
    
    out_v buffer ;
    in_v md_crd ;
    
    // initialize the vectorized coordinate. This coordinate will
    // remain constant except for the component indexing the
    // processing axis, which will be counted up as we go along.
    // This makes the index calculations very efficient.
    
    for ( int d = 0 ; d < dim_in ; d++ )
    {
      if ( d != axis )
        md_crd[d] = crd[d] ;
      else
      {
        for ( int e = 0 ; e < int ( vsz ) ; e++ )
          md_crd[d][e] = crd[d] + e ;
      }
    }

#ifdef USE_VC

    // flag which is true if vsz is a multiple of the hardware
    // vector size for out_ele_type. This flag will activate the use
    // of specialized memory access code (Vc::InterleavedMemoryWrapper)
    // If this is unwanted, the easiest way to deactivate that code
    // is by setting this flag to false. Then, all access which can't
    // use straight SIMD store operations will use scatters.

    static const bool out_n_vecsz
      = (    vspline::vector_traits<out_ele_type>::hsize > 0
          && vsz % vspline::vector_traits<out_ele_type>::hsize == 0 ) ;

#else
    
    static const bool out_n_vecsz = false ;
    
#endif

    // process a bunch of coordinates: apply the 'inner' functor,
    // then write result to memory using 'fluff'.

    // flag used to dispatch to either of the unstrided fluff overloads:
    
    typedef typename std::integral_constant < bool , dim_out == 1 > use_store_t ;
    
    // if the stride is 1, we can use specialized 'fluff' variants,
    // provided the data are single-channel (or the vector width
    // is a multiple of the hardware vector width when Vc is used).
    // All other cases are handled with the variant of 'fluff'
    // taking a stride.

    if ( stride == 1 && ( dim_out == 1 || out_n_vecsz ) )
    {
      for ( ic_type a = 0 ; a < aggregates ; a++ )
      {
        functor ( md_crd , buffer ) ;
        fluff ( buffer , trg , use_store_t() ) ;
        trg += lanes ;
        md_crd[axis] += vsz ;
      }
    }
    else
    {
      for ( ic_type a = 0 ; a < aggregates ; a++ )
      {
        functor ( md_crd , buffer ) ;
        fluff ( buffer , trg , stride ) ;
        trg += lanes * stride ;
        md_crd[axis] += vsz ;
      }
    }

    // peeling is done, any leftovers are processed one-by-one with
    // the unvectorized evaluation. First the scalar coordinate is
    // brought up to the position just after the vectorized part.
    
    crd[axis] += aggregates * lanes ;

    for ( ic_type r = 0 ; r < leftover ; r++ )
    {
      functor ( crd , *trg ) ;
      trg += stride ;
      crd[axis]++ ;
    }
  }
} ; // struct indexed_aggregator

/// specialization for vsz == 1. Here the data are simply
/// processed one by one in a loop, without vectorization.
/// Since there is no aggregation with vsize == 1, this class
/// does not inherit from aggregator_base.

template < typename ic_type , class functor_type >
struct indexed_aggregator < 1 , ic_type , functor_type >
{
  // the 'inner' functor, held as a const copy of the functor
  // passed to the constructor

  const functor_type functor ;

  indexed_aggregator ( const functor_type & _functor )
  : functor ( _functor )
  { }

  // the coordinate is held in the functor's in_type rather than in
  // a TinyVector of some integral type: it can be fed to the functor
  // as it is, and the arithmetic on it is done in this type as well.

  typedef typename functor_type::in_type sd_coordinate_type ;

  /// process one 1D subarray element by element: the functor is
  /// called with the current coordinate and writes to the current
  /// target element, then the target pointer is advanced by 'stride'
  /// and the coordinate's 'axis' component is counted up by one.
  /// This is repeated 'length' times.

  void operator() ( sd_coordinate_type crd ,
                    int axis ,
                    typename functor_type::out_type * trg ,
                    ic_type stride ,
                    ic_type length )
  {
    for ( ic_type todo = length ; todo > 0 ; --todo )
    {
      functor ( crd , *trg ) ;
      trg += stride ;
      ++crd[axis] ;
    }
  }
} ;

/// an aggregator for separate - possibly different - source and target.
/// If source and target are in fact different, the inner functor will
/// read data from source, process them and then write them to target.
/// If source and target are the same, the operation will be in-place,
/// but not explicitly so. vspline uses this style of two-argument functor,
/// and this is the aggregator we use for vspline's array-based transforms.
/// The code in this template will only be used for vectorized operation.
/// If vectorization is not used, only the specialization for vsize == 1
/// below is used.

template < size_t vsz , typename ic_type , class functor_type ,
           typename = std::enable_if < ( vsz > 1 ) > >
struct coupled_aggregator
: public aggregator_base < vsz , ic_type >
{
  typedef typename functor_type::in_type in_type ;
  typedef typename functor_type::in_ele_type in_ele_type ;
  
  enum { dim_in = functor_type::dim_in } ;
  enum { dim_out = functor_type::dim_out } ;

  typedef typename functor_type::out_type out_type ;
  typedef typename functor_type::out_ele_type out_ele_type ;
  
  typedef aggregator_base < vsz , ic_type > base_type ;

  // bunch and fluff are inherited from the base class; they move
  // data between memory and the vectorized buffers used below.

  using base_type::bunch ;
  using base_type::fluff ;
  
  /// the aggregator keeps its own copy of the functor

  const functor_type functor ;
  
  // get the vectorized data types the functor expects
  
  typedef typename functor_type::in_v in_v ;
  typedef typename functor_type::out_v out_v ;

  coupled_aggregator ( const functor_type & _functor )
  : functor ( _functor )
  { } ;
  
  /// process 'length' values, reading from src and writing to trg.
  ///
  /// src:        address of the first input value
  /// in_stride:  distance, in in_type units, between input values
  /// trg:        address receiving the first result
  /// out_stride: distance, in out_type units, between results
  /// length:     number of values to process
  ///
  /// As many values as possible are processed vsz at a time through
  /// the vectorized buffers; what is left over after that is passed
  /// to the functor one value at a time.

  void operator() ( const in_type * src ,
                    ic_type in_stride ,
                    out_type * trg ,
                    ic_type out_stride ,
                    ic_type length
                  )
  {
    // all counting and stepping is done in the signed type which
    // pointer subtraction yields. vsz is a size_t; multiplying or
    // dividing ic_type values by it directly would convert them to
    // unsigned, so that a negative stride would turn into a huge
    // positive offset and the loop counters would be compared
    // signed-against-unsigned.

    typedef decltype ( trg - trg ) step_type ;

    const step_type lanes = static_cast < step_type > ( vsz ) ;
    const step_type aggregates = length / lanes ;
    const step_type leftover = length - aggregates * lanes ;
    const step_type in_step = in_stride * lanes ;
    const step_type out_step = out_stride * lanes ;
    
    in_v in_buffer ;
    out_v out_buffer ;
    
    // first we perform a peeling run, processing data vectorized
    // as long as there are enough data to fill the vectorized
    // buffers (in_buffer and out_buffer)
    
#ifdef USE_VC

    // flags which are true if vsz is a multiple of the hardware
    // vector size for the elementary types involved. This works like
    // an opt-in: even if dim_in or dim_out are not 1, if these flags
    // are true, specialized load/store variants are called. If, then,
    // use_load_t or use_store_t are std::false_type, we'll end up in
    // the specialized Vc code using InterleavedMemoryWrapper.
    
    static const bool in_n_vecsz
      = (    vspline::vector_traits<in_ele_type>::hsize > 0
          && vsz % vspline::vector_traits<in_ele_type>::hsize == 0 ) ;
      
    static const bool out_n_vecsz
      = (    vspline::vector_traits<out_ele_type>::hsize > 0
          && vsz % vspline::vector_traits<out_ele_type>::hsize == 0 ) ;

#else
    
    static const bool in_n_vecsz = false ;
    static const bool out_n_vecsz = false ;
    
#endif
    
    // used to dispatch to either of the unstrided bunch/fluff overloads:
    // std::true_type is passed for single-channel data, std::false_type
    // otherwise.

    typedef typename std::integral_constant < bool , dim_in == 1 > use_load_t ;
    
    typedef typename std::integral_constant < bool , dim_out == 1 > use_store_t ;

    // depending on whether the input/output is strided or not,
    // and on the vector size and number of channels,
    // we pick different overloads of 'bunch' and 'fluff'. The
    // overloads without stride may use InterleavedMemoryWrapper,
    // or, for single-channel data, SIMD load/store operations,
    // which is most efficient. We can only pick the variants
    // using InterleavedMemoryWrapper if vsz is a multiple of
    // the hardware SIMD register size, hence the rather complex
    // conditionals. But the complexity is rewarded with optimal
    // performance.

    const bool load_unstrided
      = ( in_stride == 1 && ( dim_in == 1 || in_n_vecsz ) ) ;

    const bool store_unstrided
      = ( out_stride == 1 && ( dim_out == 1 || out_n_vecsz ) ) ;

    // the decision is made once, outside the loops, so that each of
    // the four loops contains only the calls it actually needs.
    
    if ( load_unstrided )
    {
      if ( store_unstrided )
      {
        // unstrided load, unstrided store. Both strides are 1 here,
        // so the pointers advance by vsz.

        for ( step_type a = 0 ; a < aggregates ; a++ )
        {
          bunch ( src , in_buffer , use_load_t() ) ;
          src += lanes ;
          functor ( in_buffer , out_buffer ) ;
          fluff ( out_buffer , trg , use_store_t() ) ;
          trg += lanes ;
        }
      }
      else
      {
        // unstrided load, strided store

        for ( step_type a = 0 ; a < aggregates ; a++ )
        {
          bunch ( src , in_buffer , use_load_t() ) ;
          src += lanes ;
          functor ( in_buffer , out_buffer ) ;
          fluff ( out_buffer , trg , out_stride ) ;
          trg += out_step ;
        }
      }
    }
    else
    {
      if ( store_unstrided )
      {
        // strided load, unstrided store

        for ( step_type a = 0 ; a < aggregates ; a++ )
        {
          bunch ( src , in_buffer , in_stride ) ;
          src += in_step ;
          functor ( in_buffer , out_buffer ) ;
          fluff ( out_buffer , trg , use_store_t() ) ;
          trg += lanes ;
        }
      }
      else
      {
        // this is the 'generic' case: strided load, strided store

        for ( step_type a = 0 ; a < aggregates ; a++ )
        {
          bunch ( src , in_buffer , in_stride ) ;
          src += in_step ;
          functor ( in_buffer , out_buffer ) ;
          fluff ( out_buffer , trg , out_stride ) ;
          trg += out_step ;
        }
      }
    }
  
    // peeling is done, we mop up the remainder with scalar code
    
    for ( step_type r = 0 ; r < leftover ; r++ )
    {
      functor ( *src , *trg ) ;
      src += in_stride ;
      trg += out_stride ;
    }    
  }  
} ; // struct coupled_aggregator

/// coupled_aggregator specialized for vsz == 1: the unvectorized case.
/// Source values are passed to the functor one at a time and each
/// result is written straight to the target. With nothing to
/// aggregate, this specialization has no base class.

template < typename ic_type , class functor_type >
struct coupled_aggregator < 1 , ic_type , functor_type >
{
  /// the aggregator keeps its own copy of the functor

  const functor_type functor ;

  coupled_aggregator ( const functor_type & _functor )
  : functor ( _functor )
  { }

  /// process 'length' values.
  ///
  /// src:        address of the first input value
  /// in_stride:  distance, in in_type units, between input values
  /// trg:        address receiving the first result
  /// out_stride: distance, in out_type units, between results
  /// length:     number of values to process

  void operator() ( const typename functor_type::in_type * src ,
                    ic_type in_stride ,
                    typename functor_type::out_type * trg ,
                    ic_type out_stride ,
                    ic_type length
                  )
  {
    for ( ic_type todo = length ; todo > 0 ; todo-- )
    {
      functor ( *src , *trg ) ;
      trg += out_stride ;
      src += in_stride ;
    }
  }
} ;

// indexed_aggregator and coupled_aggregator can be replaced by two
// separate stages each, separating the pointer manipulation and
// the call to the functor. This code is usable but not quite as
// evolved (no specialization using InterleavedMemoryWrapper).

// /// indexed_stepper works like indexed_aggregator, but the functor
// /// receives its second argument as a pointer, so the functor is
// /// responsible for storing the data it produces.
// 
// template < size_t vsz , typename ic_type , class functor_type ,
//            typename = std::enable_if < ( vsz > 1 ) > >
// struct indexed_stepper
// {
//   typedef typename functor_type::in_type in_type ;
//   typedef typename functor_type::in_v in_v ;
//   enum { dim_in = functor_type::dim_in } ;
//   
//   typedef typename functor_type::out_type out_type ;
//   
//   const functor_type functor ;
//   
//   indexed_stepper ( const functor_type & _functor )
//   : functor ( _functor )
//   { } ;
//   
//   void operator() ( in_type crd ,
//                     int axis ,
//                     out_type * trg ,
//                     ic_type stride ,
//                     ic_type length )
//   {
//     auto aggregates = length / vsz ;
//     auto leftover = length - aggregates * vsz ;
//     
//     in_v md_crd ;
//     
//     for ( int d = 0 ; d < dim_in ; d++ )
//     {
//       if ( d != axis )
//         md_crd[d] = crd[d] ;
//       else
//       {
//         for ( int e = 0 ; e < vsz ; e++ )
//           md_crd[d][e] = crd[d] + e ;
//       }
//     }
// 
//     while ( aggregates-- )
//     {
//       functor ( md_crd , trg , stride ) ;
//       trg += vsz * stride ;
//       md_crd[axis] += vsz ;
//     }
// 
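//     // peeling is done, any leftovers are processed one-by-one.
//     // NOTE: the 'while ( aggregates-- )' loop above leaves 'aggregates'
//     // decremented past zero, so the line below no longer sees the
//     // number of aggregates which were processed - keep a copy of the
//     // count if this code is ever revived.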
//     
//     crd[axis] += aggregates * vsz ;
// 
//     while ( leftover-- )
//     {
//       functor ( crd , trg ) ;
//       trg += stride ;
//       crd[axis]++ ;
//     }    
//   } 
// } ; // struct indexed_stepper
// 
// template < typename ic_type , class functor_type >
// struct indexed_stepper < 1 , ic_type , functor_type >
// {
//   typedef typename functor_type::in_type in_type ;
//   typedef typename functor_type::out_type out_type ;  
//   const functor_type functor ;
//   
//   indexed_stepper ( const functor_type & _functor )
//   : functor ( _functor )
//   { } ;
//   
//   void operator() ( in_type crd ,
//                     int axis ,
//                     out_type * trg ,
//                     ic_type stride ,
//                     ic_type length )
//   {
//     while ( length-- )
//     {
//       functor ( crd , trg ) ;
//       trg += stride ;
//       crd[axis]++ ;
//     }    
//   } 
// } ; // struct indexed_stepper
// 
// /// coupled_stepper is like coupled_aggregator, but it passes pointers
// /// to source and target data to the functor, which is responsible for
// /// reading from source and depositing at target.
// 
// template < size_t vsz , typename ic_type , class functor_type ,
//            typename = std::enable_if < ( vsz > 1 ) > >
// struct coupled_stepper
// {
//   typedef typename functor_type::in_type in_type ;
//   typedef typename functor_type::out_type out_type ;
//    
//   const functor_type functor ;
// 
//   coupled_stepper ( const functor_type & _functor )
//   : functor ( _functor )
//   { } ;
//   
//   void operator() ( const in_type * src ,
//                     ic_type in_stride ,
//                     out_type * trg ,
//                     ic_type out_stride ,
//                     ic_type length
//                   )
//   {
//     auto aggregates = length / vsz ;
//     auto leftover = length - aggregates * vsz ;
//     
//     while ( aggregates-- )
//     {
//       functor ( src , in_stride , trg , out_stride ) ;
//       src += in_stride * vsz ;
//       trg += out_stride * vsz ;
//     }
//   
//     // peeling is done, we mop up the remainder with scalar code
//     
//     while ( leftover-- )
//     {
//       functor ( src , trg ) ;
//       src += in_stride ;
//       trg += out_stride ;
//     }    
//   }  
// } ; // struct coupled_stepper
// 
// template < typename ic_type , class functor_type >
// struct coupled_stepper < 1 , ic_type , functor_type >
// {
//   typedef typename functor_type::in_type in_type ;
//   typedef typename functor_type::out_type out_type ;
//    
//   const functor_type functor ;
// 
//   coupled_stepper ( const functor_type & _functor )
//   : functor ( _functor )
//   { } ;
//   
//   void operator() ( const in_type * src ,
//                     ic_type in_stride ,
//                     out_type * trg ,
//                     ic_type out_stride ,
//                     ic_type length
//                   )
//   {
//     while ( length-- )
//     {
//       functor ( src , trg ) ;
//       src += in_stride ;
//       trg += out_stride ;
//     }    
//   }  
// } ; // struct coupled_stepper
// 
// /// indexed_stepper_adapter inherits 'fluff' from class aggregator_base,
// /// which it uses to store to the target pointers it receives (from an
// /// indexed_stepper object). This is a common way of dealing with incoming
// /// pointers and used to be the only one before the stepping code was
// /// factored out. Conceptually, the stepper acts as an iterator, while
// /// this class is an accessor.
// 
// template < class inner_type >
// struct indexed_stepper_adapter
// : public aggregator_base < inner_type::vsize , int >
// {
//   typedef typename inner_type::in_type in_type ;
//   typedef typename inner_type::out_type out_type ;
//   typedef typename inner_type::in_v in_v ;
//   typedef typename inner_type::out_v out_v ;
//   enum { dim_in = inner_type::dim_in } ;
//   
//   typedef aggregator_base < inner_type::vsize , int > base_type ;
//   using base_type::fluff ;
//   
//   const inner_type inner ;
//   
//   indexed_stepper_adapter ( const inner_type & _inner )
//   : inner ( _inner )
//   { } ;
//   
//   void operator() ( const in_v & in , out_type * trg , int stride ) const
//   {
//     out_v buffer ;
//     inner ( in , buffer ) ;
//     fluff ( buffer , trg , stride ) ;
//   }
//   
//   void operator() ( const in_type & in , out_type * trg ) const
//   {
//     inner ( in , *trg ) ;
//   }
// } ;
// 
// /// coupled_stepper_adapter inherits methods bunch and fluff from class
// /// aggregator_base and uses them to read arguments from src and store
// /// results to trg.
// 
// template < class inner_type >
// struct coupled_stepper_adapter
// : public aggregator_base < inner_type::vsize , int >
// {
//   typedef typename inner_type::in_type in_type ;
//   typedef typename inner_type::out_type out_type ;
//   typedef typename inner_type::in_v in_v ;
//   typedef typename inner_type::out_v out_v ;
//   
//   typedef aggregator_base < inner_type::vsize , int > base_type ;
//   using base_type::bunch ;
//   using base_type::fluff ;
//   
//   const inner_type inner ;
//   
//   coupled_stepper_adapter ( const inner_type & _inner )
//   : inner ( _inner )
//   { } ;
//   
//   void operator() ( const in_type * src , int in_stride ,
//                     out_type * trg , int out_stride ) const
//   {
//     in_v in_buffer ;
//     out_v out_buffer ;
//     bunch ( src , in_buffer , in_stride ) ;
//     inner ( in_buffer , out_buffer ) ;
//     fluff ( out_buffer , trg , out_stride ) ;
//   }
//   
//   void operator() ( const in_type * in , out_type * trg ) const
//   {
//     inner ( *in , *trg ) ;
//   }
// } ;

/// vs_adapter makes a vspline::unary_functor usable by the wielding code.
/// vspline's unary_functors take 'naked' arguments if the data are 1D,
/// whereas the wielding code always passes TinyVectors - possibly with
/// only one element. vs_adapter is the central place where the two views
/// are reconciled: its operator() overloads accept the 'nd' types and
/// hand references to the very same objects, re-typed via
/// reinterpret_cast, to the wrapped functor's eval(). No data are copied
/// or converted, so the wrapper is not expected to cost anything at
/// run-time. Passing the unary_functor through unwrapped would instead
/// force the code further down to handle the naked/TinyVector distinction
/// itself.
/// vs_adapter inherits from inner_type, so everything in inner_type's
/// type system which is not explicitly redefined here remains available.

template < class inner_type >
struct vs_adapter
: public inner_type
{
  using typename inner_type::in_ele_v ;
  using typename inner_type::out_ele_v ;

  // the argument types seen by the wielding code are inner_type's
  // 'nd' variants

  using in_type = typename inner_type::in_nd_ele_type ;
  using out_type = typename inner_type::out_nd_ele_type ;
  using in_v = typename inner_type::in_nd_ele_v ;
  using out_v = typename inner_type::out_nd_ele_v ;

  vs_adapter ( const inner_type & _inner )
  : inner_type ( _inner )
  { }

  /// operator() overload for unvectorized arguments: re-type both
  /// references to inner_type's own in_type/out_type and call eval.

  void operator() ( const in_type & in ,
                         out_type & out ) const
  {
    typedef typename inner_type::in_type naked_in_type ;
    typedef typename inner_type::out_type naked_out_type ;

    const naked_in_type & naked_in
      = reinterpret_cast < const naked_in_type & > ( in ) ;

    naked_out_type & naked_out
      = reinterpret_cast < naked_out_type & > ( out ) ;

    inner_type::eval ( naked_in , naked_out ) ;
  }

  /// operator() overload for vectorized arguments, intended for
  /// vsize > 1. Note that the defaulted template argument names
  /// std::enable_if < ... > itself, not its nested 'type', so it makes
  /// this overload a member template but does not by itself remove
  /// the overload when vsize is 1.

  template < typename = std::enable_if < ( inner_type::vsize > 1 ) > >
  void operator() ( const in_v & in ,
                         out_v & out ) const
  {
    typedef typename inner_type::in_v naked_in_v ;
    typedef typename inner_type::out_v naked_out_v ;

    const naked_in_v & naked_in
      = reinterpret_cast < const naked_in_v & > ( in ) ;

    naked_out_v & naked_out
      = reinterpret_cast < naked_out_v & > ( out ) ;

    inner_type::eval ( naked_in , naked_out ) ;
  }
} ;

/// index_wield invokes an index-transformation functor for all indexes
/// into an array, using vspline's 'multithread' function to split the
/// work into chunks which are processed by several worker threads. The
/// functors are vector-capable; typically they will be derived from
/// vspline::unary_functor.
/// The function serves two purposes, told apart by 'njobs':
/// - the initial call passes the number of jobs multithread should
///   produce. The range is partitioned and handed to multithread.
/// - multithread calls index_wield again for every part, now with
///   njobs == 1, and this is where the work is actually done, by way
///   of a 'wield' object.
/// Note that 'output' is passed as a pointer rather than a reference,
/// and the functor is passed by value. This saves trickery with
/// std::refs, and since we're not very inner-loop here, performance
/// is less of an issue.

template < class functor_type , int dimension >
void index_wield ( vspline::shape_range_type < dimension > range ,
                   int njobs ,
                   const functor_type functor ,
                   vigra::MultiArrayView < dimension ,
                                           typename functor_type::out_type
                                         > * output )
{
  typedef typename functor_type::out_type out_type ;

  if ( njobs != 1 )
  {
    // we're not in a worker thread yet: partition the incoming range
    // into roughly equal-sized chunks, then pass a pointer to
    // index_wield itself, the partitioning and the remaining
    // arguments to 'multithread'. multithread will call index_wield
    // for every range in the partitioning with njobs == 1, which
    // takes the call to the code further down.

    auto partitioning = VSPLINE_DEFAULT_PARTITIONER<dimension> ( range , njobs ) ;
    auto recall = & ( index_wield < functor_type , dimension > ) ;
    vspline::multithread ( recall , partitioning , 1 , functor , output ) ;
    return ;
  }

  // njobs == 1 signals that we are now in the worker thread. We need
  // a wield object and an indexed_aggregator applying the functor.

  wield < dimension , out_type > wld ;

  typedef indexed_aggregator < functor_type::vsize ,
                               int , // std::ptrdiff_t ,
                               functor_type > aggregator_type ;

  aggregator_type agg ( functor ) ;

// might instead use:

//     typedef indexed_stepper_adapter < functor_type > isa_type ;
//     isa_type isa ( functor ) ;
//     
//     indexed_stepper < functor_type::vsize , int , isa_type >
//       agg ( isa ) ;

  // 'wield' the aggregator to fill the part of 'output' inside the
  // range with the result of the coordinate transformation functor.
  // Operate collinear to axis 0, which will normally be the fastest
  // way (if the array is set up vigra style).
  // TODO: determine optimal axis

  wld ( *output , agg , range[0] , range[1] , 0 ) ;
}

/// coupled_wield processes two arrays: the first one provides the input,
/// the second one receives the output. Both arrays must have the same
/// dimensionality and shape, and their data types have to be the
/// 'in_type' and the 'out_type' of the functor which was passed in.
/// As with index_wield, a call with njobs != 1 partitions the range and
/// passes the parts to vspline::multithread together with a pointer to
/// coupled_wield itself and njobs == 1; the call with njobs == 1 does
/// the actual processing.

template < class functor_type , int dimension >
void coupled_wield ( vspline::shape_range_type < dimension > range ,
                     int njobs ,
                     const functor_type functor ,
                     const vigra::MultiArrayView < dimension ,
                                                   typename functor_type::in_type
                                                 > * input ,
                     vigra::MultiArrayView < dimension ,
                                             typename functor_type::out_type
                                           > * output
                 )
{
  typedef typename functor_type::in_type in_type ;
  typedef typename functor_type::out_type out_type ;

  if ( njobs != 1 )
  {
    // dispatch: split the range and have multithread call us back
    // once per part, with njobs == 1

    auto partitioning = VSPLINE_DEFAULT_PARTITIONER<dimension> ( range , njobs ) ;
    auto recall = & ( coupled_wield < functor_type , dimension > ) ;
    vspline::multithread ( recall , partitioning , 1 , functor , input , output ) ;
    return ;
  }

  // njobs == 1: set up a wield object for the two arrays and a
  // coupled_aggregator applying the functor.

  wield < dimension , in_type , out_type > wld ;

  typedef coupled_aggregator < functor_type::vsize ,
                               int , // std::ptrdiff_t ,
                               functor_type > aggregator_type ;

  aggregator_type agg ( functor ) ;

// might instead use:

//     typedef coupled_stepper_adapter < functor_type > csa_type ;
//     csa_type csa ( functor ) ;
//     
//     coupled_stepper < functor_type::vsize , int , csa_type >
//       agg ( csa ) ;

  // process the part of the arrays inside the range, along axis 0

  wld ( *input , *output , agg , range[0] , range[1] , 0 ) ;
}

} ; // namespace wielding

