// Fast Lanczos-2 scaler for SSE2 in C and intrinsics.
// Written by Nils Liaeen Corneliusen 2013.
// License: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication license
// Read the 2023 article here: https://www.ignorantus.com/pages/neon_filters/
// Read the 2018 article here: https://www.ignorantus.com/pages/image_transformation/

#include <stdint.h>
#include <alloca.h>
#include <x86intrin.h>

#include "bmp_planar.h"
#include "coeffs.h"

#define alloca_aligned(size, align) ((void *)((((uint64_t)alloca((size)+(align)))+(align)-1)&(~((align)-1))))

#define GETOFFSET(p) ( ((p)>>3)*64+((p)&0x07) )

static inline int clamp( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi: v;
}

static void h2v( uint8_t *in, int cnt )
{
    __m128i *inw = (__m128i *)in;
    __m128i *outw = inw;

    do {
        __m128i in01 = _mm_load_si128( inw++ ); // 00000000.11111111
        __m128i in23 = _mm_load_si128( inw++ ); // 22222222.33333333
        __m128i in45 = _mm_load_si128( inw++ ); // 44444444.55555555
        __m128i in67 = _mm_load_si128( inw++ ); // 66666666.77777777

        __m128i in02 = _mm_unpacklo_epi8( in01, in23 ); // 02020202.02020202
        __m128i in13 = _mm_unpackhi_epi8( in01, in23 ); // 13131313.13131313
        __m128i in46 = _mm_unpacklo_epi8( in45, in67 ); // 46464646.46464646
        __m128i in57 = _mm_unpackhi_epi8( in45, in67 ); // 57575757.57575757

        __m128i in0123l = _mm_unpacklo_epi8( in02, in13 ); // 01230123.01230123
        __m128i in0123h = _mm_unpackhi_epi8( in02, in13 ); // ..
        __m128i in4567l = _mm_unpacklo_epi8( in46, in57 ); // 45674567.45674567
        __m128i in4567h = _mm_unpackhi_epi8( in46, in57 ); // ..

        __m128i out0 = _mm_unpacklo_epi32( in0123l, in4567l ); // 01234567.01234567
        __m128i out1 = _mm_unpackhi_epi32( in0123l, in4567l ); // ..
        __m128i out2 = _mm_unpacklo_epi32( in0123h, in4567h ); // ..
        __m128i out3 = _mm_unpackhi_epi32( in0123h, in4567h ); // ..

        _mm_store_si128( outw++, out0 );
        _mm_store_si128( outw++, out1 );
        _mm_store_si128( outw++, out2 );
        _mm_store_si128( outw++, out3 );

        cnt -= 8;

    } while( cnt > 0 );
}

// 2x 4 to 1 so 128 bit stores can be used
static void scale_vertical_x2( uint8_t *src, int srcw, int srch, int srcstr, uint8_t *dst0, uint32_t *yco, int yy, int yadd )
{
    int ypos;
    uint32_t vco;

    srch -= 1;

    ypos = yy>>16;
    __m128i *src00 = (__m128i *)(src + srcstr * clamp( ypos  , 0, srch ));
    __m128i *src01 = (__m128i *)(src + srcstr * clamp( ypos+1, 0, srch ));
    __m128i *src02 = (__m128i *)(src + srcstr * clamp( ypos+2, 0, srch ));
    __m128i *src03 = (__m128i *)(src + srcstr * clamp( ypos+3, 0, srch ));

    vco = *(yco + __bextr_u32( yy, ((COEFFS_STOPBIT-COEFFS_STARTBIT+1)<<8) | COEFFS_STARTBIT ) );
    __m128i vco00 = _mm_set1_epi16( (int8_t)(vco & 0xff) );
    __m128i vco01 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 8  ) );
    __m128i vco02 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 16 ) );
    __m128i vco03 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 24 ) );

    yy += yadd;

    ypos = yy>>16;
    __m128i *src10 = (__m128i *)(src + srcstr * clamp( ypos  , 0, srch ));
    __m128i *src11 = (__m128i *)(src + srcstr * clamp( ypos+1, 0, srch ));
    __m128i *src12 = (__m128i *)(src + srcstr * clamp( ypos+2, 0, srch ));
    __m128i *src13 = (__m128i *)(src + srcstr * clamp( ypos+3, 0, srch ));

    vco = *(yco + __bextr_u32( yy, ((COEFFS_STOPBIT-COEFFS_STARTBIT+1)<<8) | COEFFS_STARTBIT ) );
    __m128i vco10 = _mm_set1_epi16( (int8_t)(vco & 0xff) );
    __m128i vco11 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 8  ) );
    __m128i vco12 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 16 ) );
    __m128i vco13 = _mm_set1_epi16( (int8_t)__bextr_u32( vco, (8<<8) | 24 ) );

    uint8_t *dst00 = dst0;

    __m128i zero     = _mm_set1_epi32( 0x00000000 );
    __m128i roundval = _mm_set1_epi16( COEFFS_ROUNDVAL );

    for( int x = 0; x < srcw; x += 16 ) {
        __m128i i0, i1, i2, i3;
        __m128i lo, hi;
        __m128i r00, r10;

        i0 = _mm_load_si128( src00++ );
        i1 = _mm_load_si128( src01++ );
        i2 = _mm_load_si128( src02++ );
        i3 = _mm_load_si128( src03++ );

        lo = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( i0, zero ), vco00 ),
                                           _mm_mullo_epi16( _mm_unpacklo_epi8( i1, zero ), vco01 ) ),
                            _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( i2, zero ), vco02 ),
                                           _mm_mullo_epi16( _mm_unpacklo_epi8( i3, zero ), vco03 ) ) );

        hi = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpackhi_epi8( i0, zero ), vco00 ),
                                           _mm_mullo_epi16( _mm_unpackhi_epi8( i1, zero ), vco01 ) ),
                            _mm_add_epi16( _mm_mullo_epi16( _mm_unpackhi_epi8( i2, zero ), vco02 ),
                                           _mm_mullo_epi16( _mm_unpackhi_epi8( i3, zero ), vco03 ) ) );

        r00 = _mm_packus_epi16( _mm_srai_epi16( _mm_add_epi16( lo, roundval ), COEFFS_SUM_BITS ),
                                _mm_srai_epi16( _mm_add_epi16( hi, roundval ), COEFFS_SUM_BITS ) );

        i0 = _mm_load_si128( src10++ );
        i1 = _mm_load_si128( src11++ );
        i2 = _mm_load_si128( src12++ );
        i3 = _mm_load_si128( src13++ );

        lo = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( i0, zero ), vco10 ),
                                           _mm_mullo_epi16( _mm_unpacklo_epi8( i1, zero ), vco11 ) ),
                            _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( i2, zero ), vco12 ),
                                           _mm_mullo_epi16( _mm_unpacklo_epi8( i3, zero ), vco13 ) ) );

        hi = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpackhi_epi8( i0, zero ), vco10 ),
                                           _mm_mullo_epi16( _mm_unpackhi_epi8( i1, zero ), vco11 ) ),
                            _mm_add_epi16( _mm_mullo_epi16( _mm_unpackhi_epi8( i2, zero ), vco12 ),
                                           _mm_mullo_epi16( _mm_unpackhi_epi8( i3, zero ), vco13 ) ) );

        r10 = _mm_packus_epi16( _mm_srai_epi16( _mm_add_epi16( lo, roundval ), COEFFS_SUM_BITS ),
                                _mm_srai_epi16( _mm_add_epi16( hi, roundval ), COEFFS_SUM_BITS ) );

        _mm_store_si128( (__m128i *)dst00, _mm_unpacklo_epi64( r00, r10 ) ); dst00 += 64;
        _mm_store_si128( (__m128i *)dst00, _mm_unpackhi_epi64( r00, r10 ) ); dst00 += 64;

    }

    // pad left
    __m128i left;
    left = _mm_set1_epi8( dst0[0] );
    _mm_storel_epi64( (__m128i *)(dst0-64), left );
    left = _mm_set1_epi8( dst0[8] );
    _mm_storel_epi64( (__m128i *)(dst0+8-64), left );

    // pad right
    uint8_t right;
    right = *(dst0+GETOFFSET(srcw-1));
    *(dst0+GETOFFSET(srcw+0)) = right;
    *(dst0+GETOFFSET(srcw+1)) = right;
    right = *(dst0+8+GETOFFSET(srcw-1));
    *(dst0+8+GETOFFSET(srcw+0)) = right;
    *(dst0+8+GETOFFSET(srcw+1)) = right;
}


static void scale_horizontal_vertical( uint8_t *src, uint8_t *dst, int dstw, int dststr, int xadd, int xoffset, uint32_t *xco )
{
    int xx = xoffset + (-1<<16);

    __m128i zero     = _mm_set1_epi32( 0x00000000 );
    __m128i roundval = _mm_set1_epi16( COEFFS_ROUNDVAL );

    for( int x = 0; x < dstw; x += 16 ) {

        __m128i dbuf[8];

        for( int i = 0; i < 8; i++ ) {
            uint32_t hco;
            __m128i hco0, hco1, hco2, hco3;
            __m128i in01, in23;
            uint8_t *in;

            hco = *(xco + __bextr_u32( xx, ((COEFFS_STOPBIT-COEFFS_STARTBIT+1)<<8) | COEFFS_STARTBIT ) );
            hco0 = _mm_set1_epi16( (int8_t)(hco & 0xff) );
            hco1 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 8  ) );
            hco2 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 16 ) );
            hco3 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 24 ) );

            in  = src + ((xx>>16)<<3);
            xx += xadd;

            in01 = _mm_loadu_si128( (__m128i *)in ); in += 16;
            in23 = _mm_loadu_si128( (__m128i *)in );

            __m128i lo = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( in01, zero ), hco0 ),
                                                       _mm_mullo_epi16( _mm_unpackhi_epi8( in01, zero ), hco1 ) ),
                                        _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( in23, zero ), hco2 ),
                                                       _mm_mullo_epi16( _mm_unpackhi_epi8( in23, zero ), hco3 ) ) );

            hco = *(xco + __bextr_u32( xx, ((COEFFS_STOPBIT-COEFFS_STARTBIT+1)<<8) | COEFFS_STARTBIT ) );
            hco0 = _mm_set1_epi16( (int8_t)(hco & 0xff) );
            hco1 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 8  ) );
            hco2 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 16 ) );
            hco3 = _mm_set1_epi16( (int8_t)__bextr_u32( hco, (8<<8) | 24 ) );

            in  = src + ((xx>>16)<<3);
            xx += xadd;

            in01 = _mm_loadu_si128( (__m128i *)in ); in += 16;
            in23 = _mm_loadu_si128( (__m128i *)in );

            __m128i hi = _mm_add_epi16( _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( in01, zero ), hco0 ),
                                                       _mm_mullo_epi16( _mm_unpackhi_epi8( in01, zero ), hco1 ) ),
                                        _mm_add_epi16( _mm_mullo_epi16( _mm_unpacklo_epi8( in23, zero ), hco2 ),
                                                       _mm_mullo_epi16( _mm_unpackhi_epi8( in23, zero ), hco3 ) ) );


            dbuf[i] = _mm_packus_epi16( _mm_srai_epi16( _mm_add_epi16( lo, roundval ), COEFFS_SUM_BITS ),
                                        _mm_srai_epi16( _mm_add_epi16( hi, roundval ), COEFFS_SUM_BITS ) );

        }

        __m128i in02, in13, in46, in57;
        __m128i in0123l, in0123h, in4567l, in4567h;
        __m128i out0l, out1l, out2l, out3l;
        __m128i out0h, out1h, out2h, out3h;

        in02 = _mm_unpacklo_epi8( dbuf[0], dbuf[1] );
        in13 = _mm_unpackhi_epi8( dbuf[0], dbuf[1] );
        in46 = _mm_unpacklo_epi8( dbuf[2], dbuf[3] );
        in57 = _mm_unpackhi_epi8( dbuf[2], dbuf[3] );

        in0123l = _mm_unpacklo_epi8( in02, in13 );
        in0123h = _mm_unpackhi_epi8( in02, in13 );
        in4567l = _mm_unpacklo_epi8( in46, in57 );
        in4567h = _mm_unpackhi_epi8( in46, in57 );

        out0l = _mm_unpacklo_epi32( in0123l, in4567l );
        out1l = _mm_unpackhi_epi32( in0123l, in4567l );
        out2l = _mm_unpacklo_epi32( in0123h, in4567h );
        out3l = _mm_unpackhi_epi32( in0123h, in4567h );

        in02 = _mm_unpacklo_epi8( dbuf[4], dbuf[5] );
        in13 = _mm_unpackhi_epi8( dbuf[4], dbuf[5] );
        in46 = _mm_unpacklo_epi8( dbuf[6], dbuf[7] );
        in57 = _mm_unpackhi_epi8( dbuf[6], dbuf[7] );

        in0123l = _mm_unpacklo_epi8( in02, in13 );
        in0123h = _mm_unpackhi_epi8( in02, in13 );
        in4567l = _mm_unpacklo_epi8( in46, in57 );
        in4567h = _mm_unpackhi_epi8( in46, in57 );

        out0h = _mm_unpacklo_epi32( in0123l, in4567l );
        out1h = _mm_unpackhi_epi32( in0123l, in4567l );
        out2h = _mm_unpacklo_epi32( in0123h, in4567h );
        out3h = _mm_unpackhi_epi32( in0123h, in4567h );

        uint8_t *out = dst + x;

        _mm_stream_si128( (__m128i *)out, _mm_unpacklo_epi64( out0l, out0h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpackhi_epi64( out0l, out0h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpacklo_epi64( out1l, out1h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpackhi_epi64( out1l, out1h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpacklo_epi64( out2l, out2h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpackhi_epi64( out2l, out2h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpacklo_epi64( out3l, out3h ) ); out += dststr;
        _mm_stream_si128( (__m128i *)out, _mm_unpackhi_epi64( out3l, out3h ) );

    }

}


void scale_plane( uint8_t *src, int srcw, int srch, int srcstr,
                  uint8_t *dst, int dstw, int dsth, int dststr,
                  uint32_t *xco, uint32_t *yco )
{
    int xadd = (srcw<<16)/dstw;
    int yadd = (srch<<16)/dsth;

    int xoffset = (xadd>>1)-(1<<15);
    int yoffset = (yadd>>1)-(1<<15);

    int yy = yoffset + (-1<<16);

    uint8_t *buf = alloca_aligned( (srcstr+8+8)*8, 16 );
    buf += 64;

    for( int y = 0; y < dsth; y += 8 ) {

        for( int i = 0; i < 8; i += 2 ) {

            scale_vertical_x2( src, srcw, srch, srcstr, buf + i*8, yco, yy, yadd );

            yy += yadd*2;

        }

        h2v( buf-64, srcw+16 );

        scale_horizontal_vertical( buf, dst + y * dststr, dstw, dststr, xadd, xoffset, xco );

    }

}
