// Fast Lanczos-2 scaler for ARMv7 Neon in Assembler.
// Written by Nils Liaeen Corneliusen 2014. Updated 2023.
// License: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication license
// Read the 2023 article here: https://www.ignorantus.com/pages/neon_scaler/
// Read the 2018 article here: https://www.ignorantus.com/pages/image_transformation/

// This is the classic Lanczos-2 scaler that I wrote in 2014 updated to actually
// compile and, well, scale images. It might not look like a scaler, but it is.
// Please read the articles for info about the method.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "coeffs.h"

//#define GETOFFSET(p) ( ((p)>>3)*64+((p)&0x07) )

#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

static inline int _min_s32( int a, int b )
{
    return a < b ? a : b;
}

static inline int _max_s32( int a, int b )
{
    return a > b ? a : b;
}

static void vert_interpolate_4to1_8( uint8_t *src, int src_stride, uint32_t srch, int yy0, uint8_t *dst, int width, uint32_t *ycoeffs )
{
    uint8_t *sp0 = src + src_stride * _min_s32(_max_s32((yy0>>16)  ,0), srch);
    uint8_t *sp1 = src + src_stride * _min_s32(_max_s32((yy0>>16)+1,0), srch);
    uint8_t *sp2 = src + src_stride * _min_s32(_max_s32((yy0>>16)+2,0), srch);
    uint8_t *sp3 = src + src_stride * _min_s32(_max_s32((yy0>>16)+3,0), srch);

    uint8_t *dst0;
    uint32_t ws;

    asm volatile (

        "pld       [%[sp0], #0]\n\t"
        "pld       [%[sp1], #0]\n\t"
        "pld       [%[sp2], #0]\n\t"
        "pld       [%[sp3], #0]\n\t"
        "pld       [%[sp0], #64]\n\t"
        "pld       [%[sp1], #64]\n\t"
        "pld       [%[sp2], #64]\n\t"
        "pld       [%[sp3], #64]\n\t"

        "sub      %[ws],   %[width], #1\n\t"
        "mov      %[dst0], %[dst]\n\t"

        // co0 = *(coeffs + ((((uint32_t)yy0)>>10)&63));
        "ubfx     %[yy0], %[yy0], #10, #6\n\t"
        "add      %[yy0], %[ycoeffs], %[yy0], lsl #2\n\t"
        "vld1.32  d2, [%[yy0]]\n\t"
        "vmovl.s8 q1, d2\n\t"

        "vmov.u16 q0, #0x0020\n\t"

        "vld1.64   {d16-d19}, [%[sp0]:64]!\n\t"
        "vld1.64   {d20-d23}, [%[sp1]:64]!\n\t"

        "32421:\n\t"

        "pld       [%[sp0], #128]\n\t"
        "pld       [%[sp1], #128]\n\t"
        "pld       [%[sp2], #128]\n\t"
        "pld       [%[sp3], #128]\n\t"

        "vmovl.u8  q12, d16\n\t"
        "vmovl.u8  q13, d17\n\t"
        "vmovl.u8  q14, d18\n\t"
        "vmovl.u8  q15, d19\n\t"
        "vmul.s16  q4,  q12, d2[0]\n\t"
        "vmul.s16  q5,  q13, d2[0]\n\t"
        "vmul.s16  q6,  q14, d2[0]\n\t"
        "vmul.s16  q7,  q15, d2[0]\n\t"

        "vmovl.u8  q12, d20\n\t"
        "vmovl.u8  q13, d21\n\t"
        "vmovl.u8  q14, d22\n\t"
        "vmovl.u8  q15, d23\n\t"
        "vmla.s16  q4,  q12, d2[1]\n\t"
        "vmla.s16  q5,  q13, d2[1]\n\t"
        "vmla.s16  q6,  q14, d2[1]\n\t"
        "vmla.s16  q7,  q15, d2[1]\n\t"

        "vld1.64   {d24-d27}, [%[sp2]:64]!\n\t"
        "vld1.64   {d28-d31}, [%[sp3]:64]!\n\t"

        "vmovl.u8  q8,  d24\n\t"
        "vmovl.u8  q9,  d25\n\t"
        "vmovl.u8  q10, d26\n\t"
        "vmovl.u8  q11, d27\n\t"
        "vmla.s16  q4,  q8,  d2[2]\n\t"
        "vmla.s16  q5,  q9,  d2[2]\n\t"
        "vmla.s16  q6,  q10, d2[2]\n\t"
        "vmla.s16  q7,  q11, d2[2]\n\t"

        "vmovl.u8  q8,  d28\n\t"
        "vmovl.u8  q9,  d29\n\t"
        "vmovl.u8  q10, d30\n\t"
        "vmovl.u8  q11, d31\n\t"
        "vmla.s16  q4,  q8,  d2[3]\n\t"
        "vmla.s16  q5,  q9,  d2[3]\n\t"
        "vmla.s16  q6,  q10, d2[3]\n\t"
        "vmla.s16  q7,  q11, d2[3]\n\t"

        "vld1.64   {d16-d19}, [%[sp0]:64]!\n\t"
        "vld1.64   {d20-d23}, [%[sp1]:64]!\n\t"

        // round and pack
        "vadd.s16 q4, q4, q0\n\t"
        "vadd.s16 q5, q5, q0\n\t"
        "vadd.s16 q6, q6, q0\n\t"
        "vadd.s16 q7, q7, q0\n\t" //n.71-0   1c n0 q3:3

        "vqshrun.s16 d8,  q4, #6\n\t"
        "vqshrun.s16 d9,  q5, #6\n\t"
        "vqshrun.s16 d10, q6, #6\n\t"
        "vqshrun.s16 d11, q7, #6\n\t"

        "vstr     d8,  [%[dst], #0]\n\t"
        "vstr     d9,  [%[dst], #64]\n\t"
        "vstr     d10, [%[dst], #128]\n\t"
        "vstr     d11, [%[dst], #192]\n\t"

        "subs     %[width], #32\n\t"
        "add      %[dst], #256\n\t"
        "bgt      32421b\n\t"

        // left pad
        // *(uint64_t *)(dst0-64) = dup8( dst0[0] );
        "ldrb    %[sp0], [%[dst0]]\n\t"
        "vdup.8  d0, %[sp0]\n\t"
        "vstr    d0, [%[dst0], #-64]\n\t"
        "vstr    d0, [%[dst0], #-8-64]\n\t"

        // right pad
        // avoid unaligned stores when srcw is odd
        // pad = *(dst0+64+GETOFFSET(ws));
        "mov     %[sp0], %[ws], lsr #3\n\t"
        "lsl     %[sp0], %[sp0], #6\n\t"
        "and     %[sp2], %[ws], #7\n\t"
        "add     %[sp0], %[sp0], %[sp2]\n\t"
        "ldrb    %[sp1], [%[dst0], %[sp0]]\n\t"

        // *(dst0+64+GETOFFSET(ws+1)) = pad;
        "add     %[ws],  %[ws], #1\n\t"
        "lsr     %[sp0], %[ws], #3\n\t"
        "lsl     %[sp0], %[sp0], #6\n\t"
        "and     %[sp2], %[ws], #7\n\t"
        "add     %[sp0], %[sp0], %[sp2]\n\t"
        "strb    %[sp1], [%[dst0], %[sp0]]\n\t"

        // *(dst0+64+GETOFFSET(ws+2)) = pad;
        "add     %[ws],  %[ws], #1\n\t"
        "lsr     %[sp0], %[ws], #3\n\t"
        "lsl     %[sp0], %[sp0], #6\n\t"
        "and     %[sp2], %[ws], #7\n\t"
        "add     %[sp0], %[sp0], %[sp2]\n\t"
        "strb    %[sp1], [%[dst0], %[sp0]]\n\t"

    : [dst] "+r" (dst), [dst0] "+r" (dst0),
      [sp0] "+r" (sp0), [sp1] "+r" (sp1), [sp2] "+r" (sp2), [sp3] "+r" (sp3),
      [ws] "+r" (ws), [yy0] "+r" (yy0), [width] "+r" (width)
    : [ycoeffs] "r" (ycoeffs)
    :  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
       "d8", "d9","d10","d11","d12","d13","d14","d15",
      "d16","d17","d18","d19","d20","d21","d22","d23",
      "d24","d25","d26","d27","d28","d29","d30","d31"
    );

}

static void h2v( uint8_t *src, int cnt )
{
    uint8_t *dst;

    asm volatile (
        "mov     %[dst], %[src]\n\t"

        "vld4.8  {d0-d3},   [%[src]:64]!\n\t"
        "vld4.8  {d4-d7},   [%[src]:64]!\n\t"

        "9016:\n\t"
        "subs    %[cnt], #16\n\t"

        "vtrn.8  q0, q2\n\t"
        "vtrn.8  q1, q3\n\t"

        "pld     [%[src], #192]\n\t"
        "pld     [%[src], #256]\n\t"

        "vld4.8  {d8-d11},  [%[src]:64]!\n\t"
        "vld4.8  {d12-d15}, [%[src]:64]!\n\t"

        "vst1.64 {d0-d3},   [%[dst]:64]!\n\t"
        "vst1.64 {d4-d7},   [%[dst]:64]!\n\t"

        "vtrn.8  q4, q6\n\t"
        "vtrn.8  q5, q7\n\t"

        "vld4.8  {d0-d3},   [%[src]:64]!\n\t"
        "vld4.8  {d4-d7},   [%[src]:64]!\n\t"

        "vst1.64 {d8-d11},   [%[dst]:64]!\n\t"
        "vst1.64 {d12-d15},  [%[dst]:64]!\n\t"

        "bgt     9016b\n\t"
    : [src] "+r" (src), [dst] "+r" (dst), [cnt] "+r" (cnt)
    :
    :  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
       "d8", "d9","d10","d11","d12","d13","d14","d15"
    );

}

static void horiz_interpolate_8_8( uint8_t *src, int xadd, int xoffset, uint32_t *xcoeffs, int width, uint8_t *dst, int dst_stride )
{
    int xx_0;
    uint32_t tmp;

    asm volatile (

        // xx_0 = xoffset + (-1<<16);
        "mov     %[xx_0], %[xoffset]\n\t"
        "sub     %[xx_0], %[xx_0], #65536\n\t"

        "88:\n\t"

        // 0
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q8, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q8, q1, d0[0]\n\t"
        "vmla.s16 q8, q2, d0[1]\n\t"
        "vmla.s16 q8, q6, d0[2]\n\t"
        "vmla.s16 q8, q7, d0[3]\n\t"

        // 1
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q9, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q9, q1, d0[0]\n\t"
        "vmla.s16 q9, q2, d0[1]\n\t"
        "vmla.s16 q9, q6, d0[2]\n\t"
        "vmla.s16 q9, q7, d0[3]\n\t"

        // 2
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q10, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q10, q1, d0[0]\n\t"
        "vmla.s16 q10, q2, d0[1]\n\t"
        "vmla.s16 q10, q6, d0[2]\n\t"
        "vmla.s16 q10, q7, d0[3]\n\t"

        // 3
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q11, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q11, q1, d0[0]\n\t"
        "vmla.s16 q11, q2, d0[1]\n\t"
        "vmla.s16 q11, q6, d0[2]\n\t"
        "vmla.s16 q11, q7, d0[3]\n\t"

        // 4
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q12, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q12, q1, d0[0]\n\t"
        "vmla.s16 q12, q2, d0[1]\n\t"
        "vmla.s16 q12, q6, d0[2]\n\t"
        "vmla.s16 q12, q7, d0[3]\n\t"

        // 5
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q13, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q13, q1, d0[0]\n\t"
        "vmla.s16 q13, q2, d0[1]\n\t"
        "vmla.s16 q13, q6, d0[2]\n\t"
        "vmla.s16 q13, q7, d0[3]\n\t"

        // 6
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q14, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q14, q1, d0[0]\n\t"
        "vmla.s16 q14, q2, d0[1]\n\t"
        "vmla.s16 q14, q6, d0[2]\n\t"
        "vmla.s16 q14, q7, d0[3]\n\t"

        // 7
        "ubfx     %[tmp], %[xx_0], #10, #6\n\t"
        "vmov.i16 q15, #0x0020\n\t"
        "add      %[tmp], %[xcoeffs], %[tmp], lsl #2\n\t"
        "vld1.32  d0, [%[tmp]]\n\t"
        "asr      %[tmp], %[xx_0], #16\n\t"
        "add      %[tmp], %[src], %[tmp], lsl #3\n\t"
        "vmovl.s8 q0, d0\n\t"
        "vld1.64  {d8-d11}, [%[tmp]:64]\n\t"
        "add      %[xx_0], %[xx_0], %[xadd]\n\t"
        "vmovl.u8 q1, d8\n\t"
        "vmovl.u8 q2, d9\n\t"
        "vmovl.u8 q6, d10\n\t"
        "vmovl.u8 q7, d11\n\t"
        "vmla.s16 q15, q1, d0[0]\n\t"
        "vmla.s16 q15, q2, d0[1]\n\t"
        "vmla.s16 q15, q6, d0[2]\n\t"
        "vmla.s16 q15, q7, d0[3]\n\t"

        // very speculative...
        "pld     [%[tmp], #96]\n\t"
        "pld     [%[tmp], #160]\n\t"

        "vqshrun.s16 d16, q8,  #6\n\t"
        "vqshrun.s16 d17, q9,  #6\n\t"
        "vqshrun.s16 d18, q10, #6\n\t"
        "vqshrun.s16 d19, q11, #6\n\t"
        "vqshrun.s16 d20, q12, #6\n\t"
        "vqshrun.s16 d21, q13, #6\n\t"
        "vqshrun.s16 d22, q14, #6\n\t"
        "vqshrun.s16 d23, q15, #6\n\t"

        // Undo h2v
        // or 6 vzip, but higher latency so it's slower
        "vtrn.8   d16, d17\n\t"
        "vtrn.8   d18, d19\n\t"
        "vtrn.8   d20, d21\n\t"
        "vtrn.8   d22, d23\n\t" // n.183-0  1c n0 d23:3
        "vtrn.16  q8,  q9\n\t"
        "vtrn.16  q10, q11\n\t"
        "vtrn.32  q8,  q10\n\t"
        "vtrn.32  q9,  q11\n\t"

        "mov      %[tmp], %[dst]\n\t"

        "vst1.64  d16, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d18, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d20, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d22, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d17, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d19, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d21, [%[tmp]:64], %[dst_stride]\n\t"
        "vst1.64  d23, [%[tmp]:64], %[dst_stride]\n\t"

        "subs     %[width], %[width], #8\n\t"
        "add      %[dst], %[dst], #8\n\t"
        "bgt      88b\n\t"

    : [tmp] "+r" (tmp), [xx_0] "+r" (xx_0), [dst] "+r" (dst), [width] "+r" (width)
    : [src] "r" (src), [xcoeffs] "r" (xcoeffs), [xadd] "r" (xadd), [xoffset] "r" (xoffset), [dst_stride] "r" (dst_stride)
    :   "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9","d10","d11","d12","d13","d14","d15",
       "d16","d17","d18","d19","d20","d21","d22","d23",
       "d24","d25","d26","d27","d28","d29","d30","d31"
    );

}

void scale_plane( uint8_t *src, int srcw, int srch, int srcstr,
                  uint8_t *dst, int dstw, int dsth, int dststr,
                  uint32_t *xco, uint32_t *yco )
{
    int xadd = (srcw<<16)/dstw;
    int yadd = (srch<<16)/dsth;

    int xoffset = (xadd>>1)-(1<<15);
    int yoffset = (yadd>>1)-(1<<15);

    int yy = yoffset + (-1<<16);

    uint8_t *buf = (uint8_t *)alloca( ((srcw+16+32+63)&(~63))*8 );
    buf += 64;

    for( int y = 0; y < dsth; y += 8 ) {

        for( int i = 0; i < 8; i++ ) {

            vert_interpolate_4to1_8( src, srcstr, srch-1, yy, buf+i*8, srcw, yco );

            yy += yadd;

        }

        h2v( buf-64, srcw+16 );

        horiz_interpolate_8_8( buf, xadd, xoffset, xco, dstw, dst + y * dststr, dststr );

    }

}
