// Test code for the ARM Neon scaler.
// Written by Nils Liaaen Corneliusen 2013.
// License: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication license
// Read the 2023 article here: https://www.ignorantus.com/pages/neon_scaler/
// Read the 2018 article here: https://www.ignorantus.com/pages/image_transformation/

// Compile:
// Xavier/Nano: gcc -O3 -Wall -o scaler scale_neon_intrinsics.c bmp_planar.c coeffs.c main.c -lm
// Intel/AMD:   gcc -O3 -Wall -march=native -msse2 -mbmi -o scaler scale_sse2.c bmp_planar.c coeffs.c main.c -lm
// Pi 2 intr:   gcc -O3 -Wall -march=native -mfpu=neon -o scaler scale_neon_intrinsics.c bmp_planar.c coeffs.c main.c -lm
// Pi 2 ASM:    gcc -O3 -Wall -march=native -mfpu=neon -o scaler scale_neon_asm.c bmp_planar.c coeffs.c main.c -lm

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <sys/time.h>
#include <getopt.h>
#include <assert.h>
#include <math.h>
#include <sys/syscall.h>
#include <unistd.h>

#include "bmp_planar.h"
#include "coeffs.h"
#include "scale.h"

// Returns the current time reported by gettimeofday() folded into a single
// 64-bit microsecond count (seconds * 1000000 + microseconds).
// The return value of gettimeofday() itself is not checked.
static uint64_t get_usec( void )
{
    struct timeval now;
    uint64_t usec;

    gettimeofday( &now, NULL );

    usec  = (uint64_t)now.tv_sec;
    usec *= 1000000;
    usec += (uint64_t)now.tv_usec;

    return usec;
}

// Scales the r, g and b planes of src into dst with scale_plane() and prints
// how long the three calls took, in microseconds as measured by get_usec().
//
// The horizontal and vertical scale factors are derived from the dst/src
// dimensions. coeffs_get() is called once for the horizontal factor; it is
// only called a second time when the vertical factor differs, otherwise the
// same table is passed for both directions.
static void scale_picture( bmp_planar *src, bmp_planar *dst )
{
    uint32_t xtab[COEFFS_PHASES];
    uint32_t ytab[COEFFS_PHASES];

    const float xfac = dst->w/(float)src->w;
    const float yfac = dst->h/(float)src->h;

    coeffs_get( xfac, xtab );

    // Start out sharing the horizontal table; switch only if needed.
    uint32_t *xco = xtab;
    uint32_t *yco = xtab;
    if( xfac != yfac ) {
        coeffs_get( yfac, ytab );
        yco = ytab;
    }

    const uint64_t start = get_usec();

    scale_plane( src->r, src->w, src->h, src->stride, dst->r, dst->w, dst->h, dst->stride, xco, yco );
    scale_plane( src->g, src->w, src->h, src->stride, dst->g, dst->w, dst->h, dst->stride, xco, yco );
    scale_plane( src->b, src->w, src->h, src->stride, dst->b, dst->w, dst->h, dst->stride, xco, yco );

    const uint64_t elapsed = get_usec() - start;

    printf( "Scale time: %d\n", (int)elapsed );
}

// Fills the r, g and b planes of buf with a test pattern, written as 32-bit
// words. One word from rand() is used for the whole first row (same value in
// all three planes) and is rotated left by one bit for every following row.
//
// Each row writes w/4 words starting at word index y*stride/4.
// NOTE(review): this presumably relies on stride being a byte count and the
// planes being 4-byte aligned - confirm against bmp_planar_alloc().
static void fill_buffer( bmp_planar *buf )
{
    uint32_t *planes[3] = {
        (uint32_t *)buf->r,
        (uint32_t *)buf->g,
        (uint32_t *)buf->b
    };
    uint32_t pattern = rand();

    for( int row = 0; row < buf->h; row++ ) {

        int first = row * buf->stride/4;

        for( int col = 0; col < buf->w/4; col++ ) {

            int idx = first + col;

            for( int p = 0; p < 3; p++ ) {
                planes[p][idx] = pattern;
            }

        }

        // 32-bit rotate left by one
        pattern = (pattern<<1)|(pattern>>31);
    }
}

static void perf_test( int srcw, int srch, int dstw, int dsth, int srcbufs, int dstbufs, int frames )
{
    bmp_planar **srclist = alloca( sizeof(bmp_planar *) * srcbufs );
    bmp_planar **dstlist = alloca( sizeof(bmp_planar *) * dstbufs );

    for( int i = 0; i < srcbufs; i++ ) {
        srclist[i] = bmp_planar_alloc( srcw, srch );
        assert( srclist[i] );
        fill_buffer( srclist[i] );
    }

    for( int i = 0; i < dstbufs; i++ ) {
        dstlist[i] = bmp_planar_alloc( dstw, dsth );
        assert( dstlist[i] );
        fill_buffer( dstlist[i] );
    }

    printf( "Scaling %d frames from %d*%d to %d*%d using %d/%d buffers\n", frames, srcw, srch, dstw, dsth, srcbufs, dstbufs );
    printf( "Memory estimate: %.2fMB src, %.2fMB dst\n", (srclist[0]->stride*srch*3*srcbufs)/1048576.0f,
                                                         (dstlist[0]->stride*dsth*3*dstbufs)/1048576.0f );

    float xfac = dstw/(float)srcw;
    float yfac = dsth/(float)srch;

    uint32_t *xco, *yco;
    uint32_t x[COEFFS_PHASES];
    uint32_t y[COEFFS_PHASES];

    coeffs_get( xfac, x );
    xco = x;
    if( xfac != yfac ) {
        coeffs_get( yfac, y );
        yco = y;
    } else {
        yco = x;
    }

    uint64_t t0 = get_usec();

    for( int i = 0; i < frames; i++ ) {
        bmp_planar *src = srclist[i%srcbufs];
        bmp_planar *dst = dstlist[i%dstbufs];
        scale_plane( src->r, src->w, src->h, src->stride, dst->r, dst->w, dst->h, dst->stride, xco, yco );
        scale_plane( src->g, src->w, src->h, src->stride, dst->g, dst->w, dst->h, dst->stride, xco, yco );
        scale_plane( src->b, src->w, src->h, src->stride, dst->b, dst->w, dst->h, dst->stride, xco, yco );
    }

    uint64_t t1 = get_usec();

    uint64_t total = t1-t0;

    printf( "total: %fms, per frame: %fms, fps: %f\n", total/1000.0f, total/(float)frames/1000.0f, 1000000/(total/(float)frames) );
}

// File names used when -i / -o are not given on the command line.
#define DEFAULT_INPUT  "ebu3325.bmp"
#define DEFAULT_OUTPUT "out.bmp"

// Output width and height used when -s is not given on the command line.
#define DEFAULT_WIDTH  1280
#define DEFAULT_HEIGHT  720

// Prints the command-line help text to stdout. exec_name is inserted into
// the "Usage:" line; the defaults shown come from the DEFAULT_* macros.
static void print_usage( const char *exec_name )
{
    printf( "Usage: %s [OPTIONS]\n\n"
            "Options:\n"
            "  -i <file>      Input bmp file, default %s\n"
            "  -o <file>      Output bmp file, default %s\n"
            "  -s resolution  Output resolution, default %dx%d\n"
            "  -m <buffers>   Performance measurement.\n",
            exec_name,
            DEFAULT_INPUT,
            DEFAULT_OUTPUT,
            DEFAULT_WIDTH, DEFAULT_HEIGHT );
}


// Program entry point.
//
// Parses the command line (-h, -i, -o, -m, -s), loads the input bitmap and
// then either runs the performance test (-m <buffers> with buffers > 0) or
// scales the picture once and saves the result.
//
// Returns 0 on success or after -h, and 1 on an unknown option, an
// unparsable/illegal option value, or a failed load/allocation.
int main( int argc, char *argv[] )
{
    char *srcfile = DEFAULT_INPUT;
    char *dstfile = DEFAULT_OUTPUT;
    int dstw = DEFAULT_WIDTH;
    int dsth = DEFAULT_HEIGHT;
    int buffers = 0;

    int opt;

    while( (opt = getopt( argc, argv, "hi:o:m:s:" ) ) != -1 ) {
        switch( opt ) {
        case 'h':
            print_usage( argv[0] );
            return 0;
        case 'i': srcfile     = optarg;         break;
        case 'o': dstfile     = optarg;         break;
        case 'm': {
            // strtol instead of atoi so that garbage is rejected instead of
            // silently becoming 0 (which would skip the performance test).
            char *end;
            errno = 0;
            long val = strtol( optarg, &end, 10 );
            if( end == optarg || *end != '\0' || errno == ERANGE || val < INT_MIN || val > INT_MAX ) {
                printf( "Error: Illegal number of buffers (%s)\n", optarg );
                return 1;
            }
            buffers = (int)val;
            break;
        }
        case 's':
            // Both dimensions must be present; a partial match would
            // otherwise leave one of them at its default without notice.
            if( sscanf( optarg, "%dx%d", &dstw, &dsth ) != 2 ) {
                printf( "Error: Illegal output size (%s)\n", optarg );
                return 1;
            }
            break;
        default:
            // Unknown option or missing argument: show help, report failure.
            print_usage( argv[0] );
            return 1;
        }
    }

    if( dstw < 128 || dsth < 128 ) {
        printf( "Error: Illegal output size (%d*%d)\n", dstw, dsth );
        return 1;
    }

    if( buffers < 0 ) {
        printf( "Error: Illegal number of buffers (%d)\n", buffers );
        return 1;
    }

    bmp_planar *src = bmp_planar_load( srcfile );
    if( src == NULL ) return 1;

    printf( "Input:  %d*%d\n", src->w, src->h );

    // Performance mode: only the input dimensions are used, nothing is saved.
    if( buffers ) {
        perf_test( src->w, src->h, dstw, dsth, buffers, buffers, 1000 );
        return 0;
    }

    bmp_planar *dst = bmp_planar_alloc( dstw, dsth );
    if( dst == NULL ) return 1;

    printf( "Output: %d*%d\n", dst->w, dst->h );

    scale_picture( src, dst );

    bmp_planar_save( dst, dstfile );

    return 0;
}
