// Edgehog: Nvidia Jetson Nano 1080p60 Fractals
// Written by Nils Liaaen Corneliusen & Sjur Julin 2023.
// License: CC BY 4.0 https://creativecommons.org/licenses/by/4.0/
// Read the 2023 article here: https://www.ignorantus.com/pages/nano_fractals/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#define _USE_MATH_DEFINES
#include <math.h>

#include <GL/glew.h>

#include "nanofrax_gl.h"
#include "edgehog.h"
#include "txt8x8.h"

// Pack four 8-bit channels into one 32-bit word with R in the low byte and A in
// the high byte. Each channel is widened to uint32_t before it is shifted, so a
// plain int alpha such as 0xff is never shifted into the sign bit.
#define RGBA(r,g,b,a) ((uint32_t)(a)<<24|(uint32_t)(b)<<16|(uint32_t)(g)<<8|(uint32_t)(r))

typedef struct {
    float r,g,b;
} vec3;
// Number of key colours in the gradient table.
#define COLZ 6
// Gradient key colours. The last entry repeats the first.
static const vec3 cols2[COLZ] = {
    { 1.0f, 0.0f, 0.0f },
    { 1.0f, 1.0f, 0.0f },
    { 0.0f, 1.0f, 0.0f },
    { 0.0f, 1.0f, 1.0f },
    { 1.0f, 0.0f, 1.0f },
    { 1.0f, 0.0f, 0.0f },
};
// Map h onto the cols2 gradient by linear interpolation between the two
// neighbouring key colours and return the result as an opaque RGBA word.
//
// h is expected in [0,1]; values outside that range (and NaN) are clamped, so
// the table lookup can never leave cols2. In particular h == 1.0 selects the
// last key colour instead of indexing one entry past the end of the table.
static uint32_t getRGB( float h )
{
    const int lastseg = COLZ-2;         // index of the final [i, i+1] segment

    // Negated comparison so that NaN is also folded to 0.
    if( !(h > 0.0f) ) h = 0.0f;
    if( h > 1.0f ) h = 1.0f;

    float Slice = (COLZ-1) * h;
    int i = (int)Slice;                 // Slice >= 0, so truncation equals floor
    if( i > lastseg ) i = lastseg;      // h == 1.0: stay on the last segment, frac becomes 1
    float SliceFrac = Slice - (float)i;

    vec3 res;
    res.r = cols2[i].r*(1.0f-SliceFrac) + cols2[i+1].r*SliceFrac;
    res.g = cols2[i].g*(1.0f-SliceFrac) + cols2[i+1].g*SliceFrac;
    res.b = cols2[i].b*(1.0f-SliceFrac) + cols2[i+1].b*SliceFrac;
    return RGBA((uint32_t)(res.r*255.0f),(uint32_t)(res.g*255.0f),(uint32_t)(res.b*255.0f),0xff);
}

// Read the whole file fn into a freshly malloc'ed, NUL-terminated buffer.
//
// Returns the buffer on success; the caller owns it and must free() it.
// Returns NULL after printing a diagnostic to stdout when the file cannot be
// opened, its size cannot be determined, it is empty, memory runs out, or
// fewer bytes than expected could be read.
char *loadtxt( char *fn )
{
    FILE *fp = fopen( fn, "rb" );
    if( fp == NULL ) {
        printf( "File %s: Not found\n", fn );
        return NULL;
    }

    char *txt = NULL;

    // Both fseek and ftell can fail (e.g. on a non-seekable stream). ftell
    // signals that with -1, which must never reach malloc/fread as a size.
    long size = -1;
    if( fseek( fp, 0, SEEK_END ) == 0 ) size = ftell( fp );
    if( size < 0 || fseek( fp, 0, SEEK_SET ) != 0 ) {
        printf( "File %s: Cannot determine size\n", fn );
        goto fail;
    }
    if( size == 0 ) {
        printf( "File %s: Empty\n", fn );
        goto fail;
    }

    txt = malloc( (size_t)size + 1 );   // +1 for the terminating NUL
    if( txt == NULL ) {
        printf( "File %s: Out of memory (size=%ld)\n", fn, size );
        goto fail;
    }

    size_t got = fread( txt, 1, (size_t)size, fp );
    if( got != (size_t)size ) {
        printf( "File %s: Short read. Got %zu, expected %ld\n", fn, got, size );
        goto fail;
    }

    txt[size] = 0;
    fclose( fp );
    return txt;

fail:
    free( txt );    // free(NULL) is a no-op
    fclose( fp );
    return NULL;
}

// Return the number of seconds elapsed since the first call to GetTime().
//
// The first call latches the current CLOCK_REALTIME reading as the origin and
// returns 0. Both the seconds and the nanoseconds of the origin are subtracted,
// and the subtraction is done in double before narrowing to float, so the
// result is a true elapsed time rather than one offset by the origin's
// sub-second part.
static float GetTime( void ) {
#ifndef CLOCK_REALTIME
#define CLOCK_REALTIME 0
#endif
    static struct timespec origin;
    static int have_origin;

    struct timespec now;
    clock_gettime( CLOCK_REALTIME, &now );

    if( !have_origin ) {
        origin = now;
        have_origin = 1;
    }

    double secs = (double)(now.tv_sec - origin.tv_sec)
                + (double)(now.tv_nsec - origin.tv_nsec) * 1e-9;
    return (float)secs;
}

// Can't recall where I copied this from. Seems to work, though.
// Accumulated tick counters for one CPU, as parsed by read_cpu_load().
struct cpu_load {
    long long noidle;   // user + nice + system + irq + softirq + steal
    long long total;    // noidle + idle + iowait
};

// Parse the "cpu3" line of a /proc/stat style file into *load.
//
// fnam  file to read.
// load  receives the counters; left untouched on failure.
// ferr  set to true when the file cannot be opened, when no complete "cpu3"
//       line is found, or when that line has fewer than eight counters.
//       It is never cleared here, so the caller must initialise it to false.
static void read_cpu_load( const char* fnam, struct cpu_load * load, bool * ferr)
{
    FILE *fp = fopen( fnam, "r" );
    if( fp == NULL ) {
        // Report the failure instead of silently handing back all-zero counters.
        *ferr = true;
        return;
    }

    long long user = 0, nice = 0, sys = 0, idle = 0;
    long long iowait = 0, irq = 0, softirq = 0, steal = 0;
    bool found = false;

    for( ;; ) {
        char cpuid[80];
        // %79s bounds the label to the buffer; the two trailing fields are skipped.
        int rc = fscanf( fp, "%79s %lld %lld %lld %lld %lld %lld %lld %lld %*d %*d\n",
                         cpuid,
                         &user, &nice, &sys, &idle,
                         &iowait, &irq, &softirq, &steal);
        if( rc <= 0 ) break;                // EOF or unreadable input
        if( strcmp( cpuid, "cpu3" ) == 0 ) {
            found = ( rc == 9 );            // label plus all eight counters
            break;
        }
    }
    fclose( fp );

    if( !found ) {
        *ferr = true;
        return;
    }

    load->noidle = user + nice + sys + irq + softirq + steal;
    load->total = load->noidle + idle + iowait;
}

// Return the busy percentage reported by read_cpu_load() for the interval
// since the previous successful call.
//
// Two static samples are kept and swapped after each successful read. On the
// first call the previous sample is all zeros, so the result covers the whole
// time the counters have been running. Returns 0 when the counters cannot be
// read, or when no ticks elapsed between the two samples (the division would
// otherwise yield NaN/Inf, which a running average in the caller never
// recovers from).
static float get_cpu_load( void )
{
    static struct cpu_load cpuload_1;
    static struct cpu_load cpuload_2;
    static struct cpu_load * curr = &cpuload_1;
    static struct cpu_load * prev = &cpuload_2;

    bool mferr = false;
    read_cpu_load( "/proc/stat", curr, &mferr );

    if (mferr) {
        // Keep the previous sample; curr was not modified.
        return 0.0f;
    }

    long long loaddiff  = curr->noidle - prev->noidle;
    long long totaldiff = curr->total - prev->total;

    // The fresh sample becomes the reference for the next call.
    struct cpu_load * tmp = prev;
    prev = curr;
    curr = tmp;

    if( totaldiff <= 0 || loaddiff < 0 ) {
        return 0.0f;
    }

    return ((float) loaddiff / (float) totaldiff)*100.0f;
}
//

// File that get_gpu_load() reads its value from.
#define LOAD_NAME_NANO "/sys/devices/gpu.0/load"

// Read a single integer from LOAD_NAME_NANO and return it divided by 10.
// Returns 0 when the file cannot be opened or does not start with an integer.
static float get_gpu_load( void )
{
    int raw = 0;

    FILE *node = fopen( LOAD_NAME_NANO, "r" );
    if( !node ) {
        return 0;
    }

    if( fscanf( node, "%d", &raw ) != 1 ) {
        raw = 0;
    }
    fclose( node );

    return raw/10.0f;
}

// Block data handed to the GPU through copyToUniformBuffer(): one int per
// block, BLOCKCNT in total. Filled by edgehog_generate() in main() and by the
// test pattern in PlayIntro().
typedef struct {
    int edgy[BLOCKCNT];
} gpufrac;


// Number of worker threads started by main().
#define THREADCNT 4
// Worker i is pinned to core i%CORECNT.
#define CORECNT   4

// Per-block outputs: workers write entry [blockid] through edgehog_top_left(),
// main() resets them to -1 each frame and passes them to edgehog_generate().
static int rowbuf[ROWCOLCNT];
static int colbuf[ROWCOLCNT];

// Barrier shared by the THREADCNT workers and the main thread (THREADCNT + 1 parties).
static pthread_barrier_t bar;
// Number of frames every worker loops over; written by main() before the
// second startup barrier, read by the workers after it.
static volatile int frames;
// Next block id to hand out. main() resets it to 0 before each frame; workers
// claim ids from it with atomic_fetchadd().
static volatile int blockctr;

// Current gettimeofday() reading expressed as a single microsecond count.
static uint64_t get_usec( void )
{
    struct timeval now;
    gettimeofday( &now, NULL );

    uint64_t usec = (uint64_t)now.tv_sec;
    usec *= 1000000;
    usec += (uint64_t)now.tv_usec;
    return usec;
}

// Atomically add val to *dst and return the value *dst held before the add.
//
// Implemented with the compiler's __atomic builtin rather than a hand-written
// AArch64 ldaxr/stlxr loop: it builds on every target the compiler supports,
// and the sequentially consistent ordering also stops the compiler from moving
// other memory accesses across the operation (the asm version declared no
// "memory" clobber).
static inline int atomic_fetchadd( volatile int *dst, int val )
{
    return __atomic_fetch_add( dst, val, __ATOMIC_SEQ_CST );
}

static int disable_timer = 0;
static volatile uint64_t framestart;

static void *thread_func( void *arg )
{
    int me = (int)(size_t)arg;
    int ctr = 0;

    pthread_barrier_wait( &bar );
    printf( "%d: Ready\n", me);
    pthread_barrier_wait( &bar );

    int frcnt = frames;

    for( int i = 0; i < frcnt; i++ ) {
        pthread_barrier_wait( &bar );

        uint64_t frstart = framestart;

        while( 1 ) {
            int blockid = atomic_fetchadd( &blockctr, 1 );
            if( blockid >= ROWCOLCNT ) break;

            edgehog_top_left( blockid, &rowbuf[blockid], &colbuf[blockid] );
            ctr++;

            if( !disable_timer && !(ctr&0x07) && get_usec() - frstart >= 15000 ) break;
        }

        pthread_barrier_wait( &bar );
    }

//    printf( "%d: Done\n", me );
    return (void *)(size_t)ctr;
}

// Crapimation
// One segment of the scripted camera path: move from (x0,y0,z0) to (x1,y1,z1)
// over `frames` frames. getnextpos() blends x and y itself and hands z0/z1 to
// getz().
typedef struct {
    float x0, y0, z0;
    float x1, y1, z1;
    int frames;
} Mower;

// Each POSn macro expands to three comma-separated floats (x, y, z), so two of
// them in a row fill the six position fields of a Mower initializer.
// The _OUT and _IN variants of a position share x/y and differ only in z.
#define POS0_OUT -1.00f,        0.00f,          3.00f
#define POS0_IN  -1.00f,        0.00f,         10.00f
#define POS1_OUT -1.50211167f,  0.00000000f,    3.00f
#define POS1_IN  -1.50211167f,  0.00000000f, 4096.00f
#define POS2_OUT -0.74515796f,  0.11257483f,    3.00f
#define POS2_IN  -0.74515796f,  0.11257483f,  128.00f
#define POS3_OUT -0.34292933f, -0.63927984f,    3.00f
#define POS3_IN  -0.34292933f, -0.63927984f,   16.00f
#define POS4_OUT  0.28004330f,  0.00900090f,    3.00f
#define POS4_IN   0.28004330f,  0.00900090f,  256.00f

// The camera script, played in order by getnextpos(): { from, to, frames }.
// Apart from the opening hold, each group of four is OUT->IN, a hold at IN,
// IN->OUT, then a move at the OUT z value to the next position's x/y.
// The frame counts add up to 4800.
static Mower mowertbl[] = {
    { POS0_OUT, POS0_OUT, 60,  },

    { POS0_OUT, POS0_IN,  120, },
    { POS0_IN,  POS0_IN,  180, },
    { POS0_IN,  POS0_OUT, 180, },
    { POS0_OUT, POS1_OUT, 180, },

    { POS1_OUT, POS1_IN,  600, },
    { POS1_IN,  POS1_IN,   60, },
    { POS1_IN,  POS1_OUT, 600, },
    { POS1_OUT, POS2_OUT, 180, },

    { POS2_OUT, POS2_IN,  360, },
    { POS2_IN,  POS2_IN,   60, },
    { POS2_IN,  POS2_OUT, 360, },
    { POS2_OUT, POS3_OUT, 180, },

    { POS3_OUT, POS3_IN,  300, },
    { POS3_IN,  POS3_IN,   60, },
    { POS3_IN,  POS3_OUT, 300, },
    { POS3_OUT, POS4_OUT, 180, },

    { POS4_OUT, POS4_IN,  300, },
    { POS4_IN,  POS4_IN,   60, },
    { POS4_IN,  POS4_OUT, 300, },
    { POS4_OUT, POS0_OUT, 180, },
};

// Number of entries in mowertbl.
#define MOWERS ((int)(sizeof(mowertbl)/sizeof(mowertbl[0])))

#define sechf(X) (1.0f/coshf(X))

// This is why you should pay attention in trigonometry.
static float getz( float z0, float z1, float delta )
{
    float secfac = 12.5f;
    float tf;

    if( z0 > z1 ) {
        tf = 1.0f - sechf( -secfac + (1.0f-delta)*secfac );
    } else {
        tf = sechf( -secfac + delta*secfac );
    }
    float zv = z0 * (1.0f-tf) + z1 * tf;

    return zv;
}

// Advance the scripted camera path by one frame and store the new position in
// *x1, *y1, *z1.
//
// State is kept in function-local statics, so every call moves the animation
// forward by exactly one frame. Returns 1 on the call that steps from the last
// mowertbl entry back to the first, 0 otherwise.
static int getnextpos( float *x1, float *y1, float *z1 )
{
    int wrap = 0;
    static int framectr = 0;        // frames handed out so far (never reset)
    static int mowerframes = 0;     // sum of the frame counts of all completed segments
    static int mowerctr = 0;        // index of the current segment in mowertbl

    Mower *m = &mowertbl[mowerctr];

    // Switch to the next segment on what would be the last frame of the current one.
    if( framectr == mowerframes + m->frames -1 ) {
        mowerframes += m->frames;
        mowerctr++;
        if( mowerctr >= MOWERS ) { mowerctr = 0; wrap = 1; }
        m = &mowertbl[mowerctr];
    }

    // Frame index within the current segment and its fraction of the segment length.
    // NOTE(review): on a call that just switched segments, framectr is one less
    // than mowerframes, so mframe is -1 and fr is slightly negative for that frame.
    int mframe = framectr-mowerframes;
    float fr = mframe/(float)m->frames;

    // sin(pi/2 + fr*pi) equals cos(fr*pi), so fr0 = (1 - cos(fr*pi))/2:
    // a cosine ease that goes from 0 at fr == 0 to 1 at fr == 1.
    float fr0 = 1.0f - (sinf( M_PI*0.5f + fr*M_PI ) + 1.0f)*0.5f;
    *x1 = m->x0 * (1.0-fr0) + m->x1 * fr0;
    *y1 = m->y0 * (1.0-fr0) + m->y1 * fr0;
    // z uses its own curve and is driven by the raw fraction, not the eased one.
    *z1 = getz( m->z0, m->z1, fr );

    framectr++;
    return wrap;
}

// Store the 32-bit value v into cnt consecutive words starting at p.
// Does nothing when cnt is zero or negative.
static void memset32( void *p, uint32_t v, int cnt )
{
    uint32_t *p32 = (uint32_t *)p;
    for( int i = 0; i < cnt; i++ ) {
        *p32++ = v;
    }
}

// Draw a solid horizontal bar, 16 rows tall, into an image whose rows are
// 1920 pixels apart.
//
// dst     top-left pixel of the bar.
// val     bar length as a percentage; clamped to [0,100]. NaN draws nothing.
// maxlen  bar length in pixels at 100 percent.
// fg      pixel value to fill with.
//
// The clamp guarantees that at most maxlen pixels per row are written, so an
// out-of-range reading can never spill into the following rows or past the
// end of the image.
static void drawbar( uint32_t *dst, float val, int maxlen, uint32_t fg )
{
    // Negated comparison so that NaN is rejected before the float-to-int conversion.
    if( !(val > 0.0f) ) return;
    if( val > 100.0f ) val = 100.0f;

    int barlen = (int)(val*maxlen/100.0f);
    if( barlen > maxlen ) barlen = maxlen;

    for( int i = 0; i < 16; i++ )
        memset32( dst+1920*i, fg, barlen );
}

// 20 texts to cover 1:20 (+10 sec intro/outro with other text)
// main() shows one entry at a time and moves to the next every 240 frames,
// wrapping back to the first after the last.
static char *yadayada[] = {
    "-*- Edgehog -*-",
    "Let's party like it's 1987!",
    "This is a tech demo only",
    "Corneliusen & Julin Productions 2023",
    "NVidia Jetson Nano",
    "128 dainty GPU cores, 4 wimpy ARM A57 cores",
    "They render 1080p60 depth 256 fractals",
    "Using a kick-ass ARM Neon edge checking algorithm",
    "Edgehog blocks are shown inverted",
    "GPU uses a custom DFA shader",
    "Source code and article: www.ignorantus.com",
    "Music by Xerxes/Triumph. Remixed by Sjur.",
    "\"Good painters imitate nature, bad ones vomit it.\"",
    "What kind of money is there in idioting?",
    "Python uses 76 times more energy than C.",
    "Burn more coal! Write programs in Python!",
    "How many pythoneers does it take to change a lightbulb?",
    "Traceback (most recent call last):",
    "Source code and article: www.ignorantus.com",
    "Watch out for the Xmas Demo 2023!",
};

// Number of entries in yadayada.
#define YADAS ((int)(sizeof(yadayada)/sizeof(yadayada[0])))

// Workaround for lame clock control.
// Skip calculating fps on the 60 first frames, hoping clock will stabilize.
// You can play around with /etc/nvpmodel.conf and try to lock them. I gave up.
// It still isn't tuned for burst workloads like this, so it might stick to 30 fps.
#define SKIPFRAMES 60

// Weights given to the newest CPU/GPU load reading when main() folds it into
// the displayed value: shown = reading*W + shown*(1-W).
#define CPULOADW 0.0200f
#define GPULOADW 0.0125f

// Uniform locations looked up in main() with glGetUniformLocation() on pr1.
static GLint sxLoc, syLoc, szLoc, frameLoc, alphaLoc;
// 1920x80 texture used for the text/status overlay; created in main().
static Texture *tex;
// CPU-side copy of the block data sent through copyToUniformBuffer().
static gpufrac gf;

// Render txt one character at a time, each in its own gradient colour.
//
// txt   NUL-terminated string to draw.
// dst   pixel position of the first character in an image with a 1920-pixel row stride.
// ctr   colour phase of the first character; each following character is 4 steps
//       further along a 300-step cycle through getRGB().
// bg    background colour passed through to the glyph renderer.
// five  true: draw with txt8x8x5 and advance 40 pixels per character;
//       false: draw with txt8x8x4 and advance 32 pixels per character.
static void coltxt( char *txt, uint32_t *dst, int ctr, uint32_t bg, bool five )
{
    const int count = strlen( txt );
    char glyph[2] = { 0, 0 };       // single character plus terminator
    int phase = ctr;

    for( int n = 0; n < count; n++, phase += 4 ) {
        glyph[0] = txt[n];
        const uint32_t fg = getRGB( (phase%300)/299.0f );

        if( five ) {
            txt8x8x5( dst+40*n, 1920, glyph, fg, bg );
        } else {
            txt8x8x4( dst+32*n, 1920, glyph, fg, bg );
        }
    }
}

// Show a 300-frame title card (5 seconds at 60 fps) with two centred lines of
// colour-cycling text over an animated block pattern.
//
// t0, t1  first and second text line.
// dir     non-zero: the block pattern value grows by 8 per frame; zero: it shrinks by 8.
//
// alpha ramps from 0 to 1 over the first 90 frames, stays at 1, and ramps back
// down over the last 90. The fractal position uniforms are fixed at
// (-1, 0, 3) for the whole card.
static void PlayIntro( char *t0, char *t1, int dir )
{
    setTexCenter();

    // The strings do not change while the card is shown, so measure them once.
    const int len0 = strlen( t0 );
    const int len1 = strlen( t1 );

    for( int frame = 0; frame < 300; frame++ ) {
        XWinProc();

        float alpha = 1.0f;
        if( frame < 90 ) {
            alpha = frame/90.0f;
        } else if( frame > 210 ) {
            alpha = 1.0f-(frame-210)/90.0f;
        }

        // Text colour phase and pattern offset run one step ahead of the fade.
        const int tick = frame+1;

        uint32_t *data = (uint32_t *)tex->img->data;
        memset32( data, 0x00000000, 1920*80 );
        // 40 pixels per character; start so that the line is centred on x == 960.
        coltxt( t0, data+1920* 0 + 960-len0*40/2, tick,    0xff000000, 1 );
        coltxt( t1, data+1920*40 + 960-len1*40/2, tick+60, 0xff000000, 1 );
        UploadTexture(tex);

        glUniform1f( sxLoc, -1.00f );
        glUniform1f( syLoc,  0.00f );
        glUniform1f( szLoc,  3.00f );
        glUniform1f( alphaLoc, alpha );

        const int shift = dir ? tick*8 : -tick*8;
        for( int y = 0; y < BLOCKSY; y++ ) {
            int *row = &gf.edgy[y*BLOCKSX];
            for( int x = 0; x < BLOCKSX; x++ ) {
                row[x] = ((x+20)*(y+30)+shift)&0xff;
            }
        }
        copyToUniformBuffer( &gf, sizeof(gf) );

        Draw();
    }
}

// Program entry point: set up the window, shaders and worker threads, play the
// intro card, run the main animation loop until the camera script wraps or the
// user quits, print a frame-rate summary, and play the outro card.
//
// argv[1], when exactly one argument is given, replaces the default fragment
// shader file "dfahog_demo.glsl".
//
// Threading protocol: the barrier has THREADCNT + 1 parties. After two startup
// waits, every frame is bracketed by two waits: the first releases the workers
// (blockctr, the edgehog position and framestart must be set before it), the
// second returns once all workers have finished that frame. While the workers
// prepare the NEXT position, the main thread draws the CURRENT one.
//
// Returns 0 on a normal run, -1 if the window cannot be created, 1 on shader
// or thread setup failure.
int main(int argc, char* argv[])
{
    int fullscreen = 1;

    if(XWinCreate("X Window")) {
        puts("Failed to create window");
        return -1;
	}

    XWinFullscreen(fullscreen);

    // Overlay texture for text and load bars.
    tex  = NewTexture(1920, 80);

    tx_txt = loadtxt( "tex.glsl" );
    vs_txt = loadtxt( "vs.glsl" );
    fs_txt = loadtxt( argc == 2 ? argv[1] : "dfahog_demo.glsl" );
    if( !vs_txt || !fs_txt || !tx_txt ) {
        printf( "Failed loading shaders!\n" );
        return 1;
    }

    InitGL();

    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, tex->id);

    // Init barrier
    // One extra party for the main thread.
    int rc = pthread_barrier_init( &bar, NULL, THREADCNT + 1 );
    if( rc != 0 ) {
        printf( "pthread_barrier_init: rc=%d\n", rc );
        return 1;
    }

    // Init threads and lock them to cores
    pthread_t tids[THREADCNT];
    pthread_attr_t attrs[THREADCNT];

    for( int i = 0; i < THREADCNT; i++ ) {
        cpu_set_t c;
        CPU_ZERO( &c );
        CPU_SET( i%CORECNT, &c );

        printf( "Starting thread %d on core %d\n", i, i%CORECNT );

        rc = pthread_attr_init( &attrs[i] );
        if( rc != 0 ) {
            printf( "pthread_init_attr: i=%d, rc=%d\n", i, rc );
            return 1;
        }
        rc = pthread_attr_setaffinity_np( &attrs[i], sizeof(c), &c );
        if( rc != 0 ) {
            printf( "pthread_attr_setaffinity_np: i=%d, rc=%d\n", i, rc );
            return 1;
        }
        rc = pthread_create( &tids[i], &attrs[i], thread_func, (void *)(size_t)i );
        if( rc != 0 ) {
            printf( "pthread_create: i=%d, rc=%d\n", i, rc );
            return 1;
        }
    }

    int loops = 0;

    initUniformBuffer( NULL, sizeof(gf) );

    // Must be set before the second startup barrier below: the workers read
    // `frames` right after it.
    int runfr = 200;
    frames = runfr*99999; // fix: have some quit signal

    // The init barrier wait
    pthread_barrier_wait( &bar );

    disable_timer = 0;

    sxLoc    = glGetUniformLocation(pr1, "sx");
    syLoc    = glGetUniformLocation(pr1, "sy");
    szLoc    = glGetUniformLocation(pr1, "sz");
    frameLoc = glGetUniformLocation(pr1, "frame");
    alphaLoc = glGetUniformLocation(pr1, "alpha");

    // target_* hold the latest raw reading, the plain names the smoothed value shown on screen.
    float target_cpuload = 0.0f, cpuload = 0.0f;
    float target_gpuload = 0.0f, gpuload = 0.0f;

    float fps = 60.0f;

    int inverter = 0;

    fix8x8();

    // curr* is the position drawn this frame, next* the one the workers are preparing.
    float currx, curry, currz;
    float nextx, nexty, nextz;

    int edgehog_enabled = 1;

    // Very important printf barrier
    // (the workers print their "Ready" line between the two startup waits)
    pthread_barrier_wait( &bar );

    printf( "Demo: %d cores, %d threads\n", CORECNT, THREADCNT );

    PlayIntro( "NVidia Jetson Nano Technology Demo", "1080p60 Depth 256 Fractals", 1 );

    setTexBottom();

    // Prepare first edgehog buffer
    // One complete worker round (start wait + end wait) with nothing drawn in between.
    getnextpos( &nextx, &nexty, &nextz );
    edgehog_setpos( nextx, nexty, nextz );
    blockctr = 0;
    framestart = get_usec();
    pthread_barrier_wait( &bar );
    pthread_barrier_wait( &bar );
    edgehog_generate( rowbuf, colbuf, gf.edgy, inverter );

    float prevTime = 0.0f;
    float starttime = 0.0f;
    int hogs = 0;
    int yadactr = 0;
    float alpha = 1.0f;

    while(!quit) {

        // Start of the interval used for the final average frame rate.
        if( loops == SKIPFRAMES ) starttime = GetTime();

        // Fade in over the first 60 frames, fade out from frame 4740 on.
        if( loops < 60 )
            alpha = loops/60.0f;
        else if( loops > 4740 )
            alpha = 1.0f-(loops-4740)/60.0f;
        else
            alpha = 1.0f;

        // Mow along, nothing to see here
        currx = nextx; curry = nexty; currz = nextz;
        int wrap = getnextpos( &nextx, &nexty, &nextz );

        // Reset the per-block results before the workers refill them.
        for( int i = 0; i < ROWCOLCNT; i++ ) {
            rowbuf[i] = -1;
            colbuf[i] = -1;
        }

        XWinProc();

        if(keys[_ESC_] || keys[_Q_])
            quit = 1;
        if(keys[_F_]) {
            keys[_F_] = 0;
            XWinFullscreen((fullscreen=!fullscreen));
        }

        // GPU load is sampled every frame, CPU load every 10th; both are then
        // folded into the displayed values with their smoothing weights.
        target_gpuload = get_gpu_load();
        if( loops != 0 && loops%10 == 0 ) {
            target_cpuload = get_cpu_load();
        }
        cpuload = target_cpuload * CPULOADW + cpuload * (1.0f-CPULOADW);
        gpuload = target_gpuload * GPULOADW + gpuload * (1.0f-GPULOADW);

        // Recompute the displayed fps once every 60 frames from the time they took.
        float now = GetTime();
        if( loops == 0 ) prevTime = now;
        if( loops > 0 && loops%60 == 0 ) {
            float time60 = (now-prevTime);
            float frameTime = time60/60.0f;
            prevTime = now;
            fps = 1.0f / frameTime;
            if( fps > 59.5f ) fps = 60.00f;
            // Only log when the rate dropped below 60 after the warm-up frames.
            if( fps < 60.0f && loops > SKIPFRAMES )
                printf( "%3d: MS/frame: %.2f, FPS: %.2f, CPU load: %.2f%%, GPU load: %.2f%%\n", loops, frameTime*1000.0f, 1.0f / frameTime, cpuload, gpuload );
        }

        // Hogs on the wing for next frame
        // Everything the workers read must be in place before this barrier releases them.
        blockctr = 0;
        edgehog_setpos( nextx, nexty, nextz );
        framestart = get_usec();
        pthread_barrier_wait( &bar );

        // Current position for this frame
        glUniform1f( sxLoc, currx );
        glUniform1f( syLoc, curry );
        glUniform1f( szLoc, currz );
        glUniform1f( frameLoc, loops );
        glUniform1f( alphaLoc, alpha );
        copyToUniformBuffer( &gf, sizeof(gf) );

        // Rebuild the overlay from scratch on an opaque black background.
        uint32_t *data = (uint32_t *)tex->img->data;
        memset32( data, 0xff000000, 1920*80 );

        char buf[80];

        // Next scroller text every 240 frames.
        if( loops != 0 && loops%240 == 0 ) {
            yadactr++;
            if( yadactr >= YADAS ) yadactr = 0;
        }
        // 32 pixels per character; start so that the line is centred on x == 960.
        coltxt( yadayada[yadactr], data+1920*0+960-(strlen(yadayada[yadactr])*32)/2, loops, 0xff000000, 0 );

        // 012345678901234567890123456789012345678901234567
        // FPS:xx.yy Invert:Off Edgehog:Off Depth:256 Pos:
        // Status line at row 32; each field starts at its column index times 16 pixels.
        sprintf( buf, "FPS:%5.2f ", fps );
        txt8x8x2(data+1920*32 + 0*16, 1920, buf, fps >= 60.0f ? 0xff00ff00 : 0xff0000ff, 0xff000000 );
        sprintf( buf, "Invert:%s ", inverter ? "On" : "Off" );
        txt8x8x2(data+1920*32 + 10*16, 1920, buf, 0xffffffff, 0xff000000 );
        sprintf( buf, "Edgehog:%s ", edgehog_enabled ? "On" : "Off" );
        txt8x8x2(data+1920*32 + 21*16, 1920, buf, edgehog_enabled ? 0xff00ff00 : 0xff0000ff, 0xff000000 );
        sprintf( buf, "Depth:%d", MAXITER );
        txt8x8x2(data+1920*32 + 33*16, 1920, buf, 0xffffffff, 0xff000000 );
        sprintf( buf, "Pos:%.8f,%.8f,%.8f ", currx, curry, currz );
        txt8x8x2(data+1920*32 + 43*16, 1920, buf, 0xffffffff, 0xff000000 );
        sprintf( buf, "CPU:%6.2f%%", cpuload );
        txt8x8x2(data+(1920*48), 1920, buf, 0xffffffff, 0xff000000 );
        sprintf( buf, "GPU:%6.2f%%", gpuload );
        txt8x8x2(data+(1920*64), 1920, buf, 0xffffffff, 0xff000000 );

        // Load bars start 12 character cells in and may use the rest of the row.
        drawbar( data+1920*48 + 12*16, cpuload, 1920-12*16, 0xffffff00 );
        drawbar( data+1920*64 + 12*16, gpuload, 1920-12*16, 0xff0000ff );

        UploadTexture(tex);

        Draw();

        // Wait for threads done
        pthread_barrier_wait( &bar );

        // NOTE(review): the toggle on the first line is overridden by the
        // unconditional assignment on the second, so inverter is always 1 here.
        if( loops != 0 && loops%60 == 0 ) inverter = !inverter;
        inverter = 1;

        // Generate Edgehog map after all threads done
        hogs += edgehog_generate( rowbuf, colbuf, gf.edgy, inverter );

        loops++;
        // Stop once the camera script has come full circle.
        if( wrap ) break;
    }

    float stoptime = GetTime();

    // Average frame rate over everything after the first 60 frames
    // (starttime was taken at loops == SKIPFRAMES).
    if( loops < 61 ) loops = 61;
    loops -= 60;
    float frameTime = (stoptime-starttime)/loops;
    fps = 1.0f / frameTime;
    if( fps > 59.8 ) fps = 60.0f; // Close enough

    printf( "Frames: %d FPS: %.2f Hogs: %d\n", loops+60+1, fps, hogs );

    // Skip the outro when the user asked to quit.
    if( !quit ) PlayIntro( "Corneliusen & Julin Productions 1987", "www.ignorantus.com", 0 );

    // Nobody gives a waterfowl about this
    // (the workers are never joined; they are still waiting at the barrier when main returns)
#if 0
    // Wrap up threads, get blockcounts as return value
    for( int i = 0; i < THREADCNT; i++ ) {
        void *res;
        rc = pthread_join( tids[i], &res );
        if( rc != 0 ) {
            printf( "pthread_join: i=%d, rc=%d\n", i, rc );
            return 1;
        }
        printf( "Thread %d done, blocks = %d\n", i, (int)(size_t)res );
    }
#endif

    XWinDestroy();

    return 0;
}
