// Edgehog: Nvidia Jetson Nano 1080p60 Fractals
// Written by Nils Liaaen Corneliusen 2023.
// License: CC BY 4.0 https://creativecommons.org/licenses/by/4.0/
// Read the 2023 article here: https://www.ignorantus.com/pages/nano_fractals/
#include <stdio.h>
#include <arm_neon.h>
#include "edgehog.h"

static float ex, ey, ez;
static float ezfac;

static float32x4x4_t wf0, hf0;
static float32x4x4_t wf1, hf1;

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wsign-compare"
#endif

void edgehog_setpos( float x, float y, float z )
{
    ex = x;
    ey = y;
    ez = z;

    ezfac = 1.0f / (0.5f * z);

    for( int i = 0; i < 16; i++ ) {
        wf0.val[i/4][i%4] =     i /(float)DISPW * 1.7778f * ezfac;
        hf0.val[i/4][i%4] =     i /(float)DISPH           * ezfac;
        wf1.val[i/4][i%4] = (16+i)/(float)DISPW * 1.7778f * ezfac;
        hf1.val[i/4][i%4] = (16+i)/(float)DISPH           * ezfac;   
    }
}

static int edgehog32_left( float leftx, float topy )
{
    float32x4_t yco0 = vaddq_f32( hf0.val[0], vdupq_n_f32( topy ) );
    float32x4_t yco1 = vaddq_f32( hf0.val[1], vdupq_n_f32( topy ) );
    float32x4_t yco2 = vaddq_f32( hf0.val[2], vdupq_n_f32( topy ) );
    float32x4_t yco3 = vaddq_f32( hf0.val[3], vdupq_n_f32( topy ) );
    float32x4_t yco4 = vaddq_f32( hf1.val[0], vdupq_n_f32( topy ) );
    float32x4_t yco5 = vaddq_f32( hf1.val[1], vdupq_n_f32( topy ) );
    float32x4_t yco6 = vaddq_f32( hf1.val[2], vdupq_n_f32( topy ) );
    float32x4_t yco7 = vaddq_f32( hf1.val[3], vdupq_n_f32( topy ) );

    float32x4_t x0  = vdupq_n_f32( 0.0f ); float32x4_t x1  = x0;
    float32x4_t x2  = x0;                  float32x4_t x3  = x0;
    float32x4_t x4  = x0;                  float32x4_t x5  = x0;
    float32x4_t x6  = x0;                  float32x4_t x7  = x0;
    float32x4_t y0  = x0;                  float32x4_t y1  = x0;
    float32x4_t y2  = x0;                  float32x4_t y3  = x0;
    float32x4_t y4  = x0;                  float32x4_t y5  = x0;
    float32x4_t y6  = x0;                  float32x4_t y7  = x0;

    int i = 0;
    for( ; i < MAXITER; i++ ) {
        float32x4_t xt0 = vaddq_f32( vsubq_f32( vmulq_f32( x0, x0 ), vmulq_f32( y0, y0 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt1 = vaddq_f32( vsubq_f32( vmulq_f32( x1, x1 ), vmulq_f32( y1, y1 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt2 = vaddq_f32( vsubq_f32( vmulq_f32( x2, x2 ), vmulq_f32( y2, y2 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt3 = vaddq_f32( vsubq_f32( vmulq_f32( x3, x3 ), vmulq_f32( y3, y3 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt4 = vaddq_f32( vsubq_f32( vmulq_f32( x4, x4 ), vmulq_f32( y4, y4 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt5 = vaddq_f32( vsubq_f32( vmulq_f32( x5, x5 ), vmulq_f32( y5, y5 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt6 = vaddq_f32( vsubq_f32( vmulq_f32( x6, x6 ), vmulq_f32( y6, y6 ) ), vdupq_n_f32( leftx ) );
        float32x4_t xt7 = vaddq_f32( vsubq_f32( vmulq_f32( x7, x7 ), vmulq_f32( y7, y7 ) ), vdupq_n_f32( leftx ) );

        y0 = vaddq_f32( vmulq_n_f32( vmulq_f32( x0, y0 ), 2.0f ), yco0 ); x0  = xt0;
        y1 = vaddq_f32( vmulq_n_f32( vmulq_f32( x1, y1 ), 2.0f ), yco1 ); x1  = xt1;
        y2 = vaddq_f32( vmulq_n_f32( vmulq_f32( x2, y2 ), 2.0f ), yco2 ); x2  = xt2;
        y3 = vaddq_f32( vmulq_n_f32( vmulq_f32( x3, y3 ), 2.0f ), yco3 ); x3  = xt3;
        y4 = vaddq_f32( vmulq_n_f32( vmulq_f32( x4, y4 ), 2.0f ), yco4 ); x4  = xt4;
        y5 = vaddq_f32( vmulq_n_f32( vmulq_f32( x5, y5 ), 2.0f ), yco5 ); x5  = xt5;
        y6 = vaddq_f32( vmulq_n_f32( vmulq_f32( x6, y6 ), 2.0f ), yco6 ); x6  = xt6;
        y7 = vaddq_f32( vmulq_n_f32( vmulq_f32( x7, y7 ), 2.0f ), yco7 ); x7  = xt7;

        uint32x4_t resa = vcgtq_f32( vaddq_f32( vmulq_f32( x0, x0 ), vmulq_f32( y0, y0 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resb = vcgtq_f32( vaddq_f32( vmulq_f32( x1, x1 ), vmulq_f32( y1, y1 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resc = vcgtq_f32( vaddq_f32( vmulq_f32( x2, x2 ), vmulq_f32( y2, y2 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resd = vcgtq_f32( vaddq_f32( vmulq_f32( x3, x3 ), vmulq_f32( y3, y3 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t rese = vcgtq_f32( vaddq_f32( vmulq_f32( x4, x4 ), vmulq_f32( y4, y4 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resf = vcgtq_f32( vaddq_f32( vmulq_f32( x5, x5 ), vmulq_f32( y5, y5 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resg = vcgtq_f32( vaddq_f32( vmulq_f32( x6, x6 ), vmulq_f32( y6, y6 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resh = vcgtq_f32( vaddq_f32( vmulq_f32( x7, x7 ), vmulq_f32( y7, y7 ) ), vdupq_n_f32( 4.0f ) );

        uint32x4_t abcd;
        abcd = vandq_u32( vandq_u32( vandq_u32( resa, resb ), vandq_u32( resc, resd ) ),
                          vandq_u32( vandq_u32( rese, resf ), vandq_u32( resg, resh ) ) );
        if( (abcd[0]&abcd[1]&abcd[2]&abcd[3]) == -1 ) break;

        abcd = vorrq_u32( vorrq_u32( vorrq_u32( resa, resb ), vorrq_u32( resc, resd ) ),
                          vorrq_u32( vorrq_u32( rese, resf ), vorrq_u32( resg, resh ) ) );
        if( (abcd[0]|abcd[1]|abcd[2]|abcd[3]) == -1 ) { i = -1; break; }
    }

    return i;
}

static int edgehog32_top( float leftx, float topy )
{
    float32x4_t xco0 = vaddq_f32( wf0.val[0], vdupq_n_f32( leftx ) );
    float32x4_t xco1 = vaddq_f32( wf0.val[1], vdupq_n_f32( leftx ) );
    float32x4_t xco2 = vaddq_f32( wf0.val[2], vdupq_n_f32( leftx ) );
    float32x4_t xco3 = vaddq_f32( wf0.val[3], vdupq_n_f32( leftx ) );
    float32x4_t xco4 = vaddq_f32( wf1.val[0], vdupq_n_f32( leftx ) );
    float32x4_t xco5 = vaddq_f32( wf1.val[1], vdupq_n_f32( leftx ) );
    float32x4_t xco6 = vaddq_f32( wf1.val[2], vdupq_n_f32( leftx ) );
    float32x4_t xco7 = vaddq_f32( wf1.val[3], vdupq_n_f32( leftx ) );

    float32x4_t x0 = vdupq_n_f32( 0.0f ); float32x4_t x1 = x0;
    float32x4_t x2 = x0;                  float32x4_t x3 = x0;
    float32x4_t x4 = x0;                  float32x4_t x5 = x0;
    float32x4_t x6 = x0;                  float32x4_t x7 = x0;
    float32x4_t y0 = x0;                  float32x4_t y1 = x0;
    float32x4_t y2 = x0;                  float32x4_t y3 = x0;
    float32x4_t y4 = x0;                  float32x4_t y5 = x0;
    float32x4_t y6 = x0;                  float32x4_t y7 = x0;

    int i = 0;
    for( ; i < MAXITER; i++ ) {
        float32x4_t xt0 = vaddq_f32( vsubq_f32( vmulq_f32( x0, x0 ), vmulq_f32( y0, y0 ) ), xco0 );
        float32x4_t xt1 = vaddq_f32( vsubq_f32( vmulq_f32( x1, x1 ), vmulq_f32( y1, y1 ) ), xco1 );
        float32x4_t xt2 = vaddq_f32( vsubq_f32( vmulq_f32( x2, x2 ), vmulq_f32( y2, y2 ) ), xco2 );
        float32x4_t xt3 = vaddq_f32( vsubq_f32( vmulq_f32( x3, x3 ), vmulq_f32( y3, y3 ) ), xco3 );
        float32x4_t xt4 = vaddq_f32( vsubq_f32( vmulq_f32( x4, x4 ), vmulq_f32( y4, y4 ) ), xco4 );
        float32x4_t xt5 = vaddq_f32( vsubq_f32( vmulq_f32( x5, x5 ), vmulq_f32( y5, y5 ) ), xco5 );
        float32x4_t xt6 = vaddq_f32( vsubq_f32( vmulq_f32( x6, x6 ), vmulq_f32( y6, y6 ) ), xco6 );
        float32x4_t xt7 = vaddq_f32( vsubq_f32( vmulq_f32( x7, x7 ), vmulq_f32( y7, y7 ) ), xco7 );

        y0 = vaddq_f32( vmulq_n_f32( vmulq_f32( x0, y0 ), 2.0f ), vdupq_n_f32( topy ) ); x0  = xt0;
        y1 = vaddq_f32( vmulq_n_f32( vmulq_f32( x1, y1 ), 2.0f ), vdupq_n_f32( topy ) ); x1  = xt1;
        y2 = vaddq_f32( vmulq_n_f32( vmulq_f32( x2, y2 ), 2.0f ), vdupq_n_f32( topy ) ); x2  = xt2;
        y3 = vaddq_f32( vmulq_n_f32( vmulq_f32( x3, y3 ), 2.0f ), vdupq_n_f32( topy ) ); x3  = xt3;
        y4 = vaddq_f32( vmulq_n_f32( vmulq_f32( x4, y4 ), 2.0f ), vdupq_n_f32( topy ) ); x4  = xt4;
        y5 = vaddq_f32( vmulq_n_f32( vmulq_f32( x5, y5 ), 2.0f ), vdupq_n_f32( topy ) ); x5  = xt5;
        y6 = vaddq_f32( vmulq_n_f32( vmulq_f32( x6, y6 ), 2.0f ), vdupq_n_f32( topy ) ); x6  = xt6;
        y7 = vaddq_f32( vmulq_n_f32( vmulq_f32( x7, y7 ), 2.0f ), vdupq_n_f32( topy ) ); x7  = xt7;

        uint32x4_t resa = vcgtq_f32( vaddq_f32( vmulq_f32( x0, x0 ), vmulq_f32( y0, y0 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resb = vcgtq_f32( vaddq_f32( vmulq_f32( x1, x1 ), vmulq_f32( y1, y1 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resc = vcgtq_f32( vaddq_f32( vmulq_f32( x2, x2 ), vmulq_f32( y2, y2 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resd = vcgtq_f32( vaddq_f32( vmulq_f32( x3, x3 ), vmulq_f32( y3, y3 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t rese = vcgtq_f32( vaddq_f32( vmulq_f32( x4, x4 ), vmulq_f32( y4, y4 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resf = vcgtq_f32( vaddq_f32( vmulq_f32( x5, x5 ), vmulq_f32( y5, y5 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resg = vcgtq_f32( vaddq_f32( vmulq_f32( x6, x6 ), vmulq_f32( y6, y6 ) ), vdupq_n_f32( 4.0f ) );
        uint32x4_t resh = vcgtq_f32( vaddq_f32( vmulq_f32( x7, x7 ), vmulq_f32( y7, y7 ) ), vdupq_n_f32( 4.0f ) );

        uint32x4_t abcd;
        abcd = vandq_u32( vandq_u32( vandq_u32( resa, resb ), vandq_u32( resc, resd ) ),
                          vandq_u32( vandq_u32( rese, resf ), vandq_u32( resg, resh ) ) );
        if( (abcd[0]&abcd[1]&abcd[2]&abcd[3]) == -1 ) break;

        abcd = vorrq_u32( vorrq_u32( vorrq_u32( resa, resb ), vorrq_u32( resc, resd ) ),
                          vorrq_u32( vorrq_u32( rese, resf ), vorrq_u32( resg, resh ) ) );
        if( (abcd[0]|abcd[1]|abcd[2]|abcd[3]) == -1 ) { i = -1; break; }
    }

    return i;
}

void edgehog_top_left( int blockid, int *rowdst, int *coldst )
{
    int bx = blockid%COLS;
    int by = blockid/COLS;

    float leftx = (-0.5f + bx/(DISPW/(float)BLOCKWH)) * 1.7778f * ezfac + ex;
    float topy  = (-0.5f + by/(DISPH/(float)BLOCKWH))           * ezfac + ey;

    *rowdst     = edgehog32_top(  leftx, topy );
    *coldst     = edgehog32_left( leftx, topy );
}

int edgehog_generate( const int *rbuf, const int *cbuf, int *dst, int invert )
{
    int hogs = 0;

    for( int y = 0; y < BLOCKSY; y++ ) {
        const int *row0 = rbuf + y*COLS;
        const int *col0 = cbuf + y*COLS;
        const int *row1 = row0 +   COLS;
        const int *col1 = col0 +      1;

        int *dstrow = dst + y*BLOCKSX;

        for( int x = 0; x < BLOCKSX; x++ ) {
            int topval   = *row0++;
            int botval   = *row1++;
            int leftval  = *col0++;
            int rightval = *col1++;

            int v = topval == botval && topval == leftval && topval == rightval ? topval : -1;
            if( invert && v != -1  ) v = 256 - v;

            hogs += v != -1 ? 1 : 0;

            *dstrow++ = v;
        }
    }

    return hogs;
}
