SRTP SHA1 Optimization

Nils L. Corneliusen
13 December 2010

Introduction

When using SRTP, a SHA1 hash has to be calculated for each packet. On my target system, which has little CPU/ram/cache, this is quite a significant part of the cpu load. When using OpenSSL and the HMAC_ calls to calculate this, the code will probably end up looking something like this:

    HMAC_Init_ex( ..., ..., ..., NULL, NULL);
    HMAC_Update( ..., msg, msglen );
    HMAC_Final( ..., hmac, NULL );

If you study the Init and Final functions, there's really lots of... nothing going on there. In this case lengths are explicitly given and we know where to put things, so let's start off with trying to find the code that actually does the work.

That code is hidden in crypto/sha/sha_locl.h and md32_common.h. The macros used for calulation are quite brilliant, so no point in reinventing the wheel there. Instead, let's see what can be done with the wrapping code.

Easy does it

This can of course be done in a clean, orderly manner. Extract the macros and inner code from the given files and wrap them so that we get exactly what we need: SHA1 hashes for SRTP packets, and nothing more.

We start off with generating the pads when keys are derived. This probably originated somewhere in OpenSSL, I repeat it here for completeness:

void SHA1_GenPads( const uint8_t *key, int keylen, uint32_t *padi, uint32_t *pado )
{
    int i;
    uint8_t pad[64];

    padi[0] = 0x67452301;
    padi[1] = 0xEFCDAB89;
    padi[2] = 0x98BADCFE;
    padi[3] = 0x10325476;
    padi[4] = 0xC3D2E1F0;
    memcpy( pado, padi, 5*4 );

    memcpy( pad, key, keylen );
    memset( pad+keylen, 0, sizeof(pad) - keylen );

    for( i = 0; i < 64; i++ )
        pad[i] ^= 0x36;

    SHA1_ProcessBlocks( padi, pad, 1 );

    memcpy( pad, key, keylen );
    memset( pad+keylen, 0, sizeof(pad) - keylen );

    for( i = 0; i < 64; i++ )
        pad[i] ^= 0x5c;

    SHA1_ProcessBlocks( pado, pad, 1 );
}

Next we make the ProcessBlocks function which should resemble the OpenSSL inner loop, only adjusted so my current compiler can make an optimal loop. Your mileage may vary:

static void SHA1_ProcessBlocks( const uint32_t *inhash, uint32_t *outhash, const uint8_t *msg, int cnt )
{
    const uint32_t *W;
    uint32_t A,B,C,D,E,T;
    uint32_t h0,h1,h2,h3,h4;
    uint32_t XX0,XX1,XX2,XX3,XX4,XX5,XX6,XX7,XX8,XX9,XX10,XX11,XX12,XX13,XX14,XX15;

    W = (const uint32_t *)msg;

    h0 = inhash[0]; h1 = inhash[1];
    h2 = inhash[2]; h3 = inhash[3];
    h4 = inhash[4];

    while( 1 ) {

        A = h0; B = h1;
        C = h2; D = h3;
        E = h4;

        BODY_00_15( 0,A,B,C,D,E,T,W[ 0]);
        (...)

        BODY_16_19(16,C,D,E,T,A,B,X( 0),W[ 0],W[ 2],W[ 8],W[13]);
        (...)

        BODY_20_31(20,E,T,A,B,C,D,X( 4),W[ 4],W[ 6],W[12],X( 1));
        (...)

        BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
        (...)

        BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
        (...)

        BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
        (...)

        h0 += E; h1 += T;
        h2 += A; h3 += B;
        h4 += C;

        if( !--cnt ) break;

        W += 16;

    }

    outhash[0] = h0; outhash[1] = h1;
    outhash[2] = h2; outhash[3] = h3;
    outhash[4] = h4;

}

Then we make a SHA1_HMAC function with sensible arguments and the Process() function to sum it all up:

void SHA1_HMAC( const uint8_t *msg, int msglen, uint8_t *dst, int dstlen,
                const uint32_t *padi, const uint32_t *pado )
{
    uint8_t digest[SHA1HashSize];

    SHA1_Process( padi, msg,    msglen,       digest, SHA1HashSize );
    SHA1_Process( pado, digest, SHA1HashSize, dst,    dstlen );
}

static void SHA1_Process( const uint32_t *pad, const uint8_t  *msg, unsigned msglen, uint8_t *dst, int dstlen )
{
    int block64, rest;
    uint8_t buf[64];
    uint32_t hash[5];

    hash[0] = pad[0];
    hash[1] = pad[1];
    hash[2] = pad[2];
    hash[3] = pad[3];
    hash[4] = pad[4];

    rest    = msglen&63;
    block64 = msglen>>6;

    if( block64 ) {
        SHA1_ProcessBlocks( hash, msg, block64 );
        msg += msglen&~63;
    }

    if( rest ) {

        memcpy( buf, msg, rest );

        buf[rest++] = 0x80;

        memset( buf+rest, 0, 64-rest );

        if( rest > 56 ) {
            SHA1_ProcessBlocks( hash, buf, 1 );
            memset( buf, 0, 60 );
        }

    } else {
        memset( buf, 0, 60 );
        buf[0] = 0x80;
    }

    *((uint32_t *)(buf+60)) = msglen * 8 + 512;

    SHA1_ProcessBlocks( hash, buf, 1 );

    memcpy( dst, hash, dstlen );
}

Nothing to it

That's all pretty neat, but I'm interested in squeezing out every last cycle of this. There's little to be done in the ProcessBlocks() function, but those static initializations and mem* operations bother me.

With a little creative use of pointers and global variables, most of these can be removed:

static uint32_t obuf32[SHA1BlockSize/4] = { 0,0,0,0,0,0x80000000,0,0,0,0,0,0,0,0,0,SHA1HashSize*8+512 };

void SHA1_HMAC( const uint8_t *msg, int msglen, uint8_t *dst, int dstlen,
                const uint32_t *padi, const uint32_t *pado )
{
    SHA1_Process_I( padi, msg, msglen, obuf32 );
    SHA1_ProcessBlocks( pado, (uint32_t *)dst, (uint8_t *)obuf32, 1 );
}

static uint32_t zerobuf32[SHA1BlockSize/4] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

static inline void SHA1_Process_I( const uint32_t *pad, const uint8_t *msg, unsigned msglen, uint32_t *dst )
{
    int block64, rest;
    const uint32_t *thepad;
    uint8_t *thebuf;
    uint8_t buf[SHA1BlockSize];

    rest    = msglen&63;
    block64 = msglen>>6;

    if( block64 ) {
        SHA1_ProcessBlocks( pad, dst, msg, block64 );
        thepad = dst;
        msg += msglen&~63;
    } else {
        thepad = pad;
    }

    if( rest ) {

        memcpy( buf, msg, rest );
        buf[rest++] = 0x80;
        memset( buf+rest, 0, SHA1BlockSize-rest );

        if( rest > 56 ) {
            SHA1_ProcessBlocks( thepad, dst, buf, 1 );
            zerobuf32[0] = 0x00000000;
            thebuf = (uint8_t *)zerobuf32;
            thepad = dst;
        } else {
            thebuf = buf;
        }

    } else {
        zerobuf32[0] = 0x80000000;
        thebuf = (uint8_t *)zerobuf32;
    }

    *((uint32_t *)(thebuf+60)) = msglen * 8 + 512;

    SHA1_ProcessBlocks( thepad, dst, thebuf, 1 );
}

Summing it up

So how does this method perform? For my low end system, I have the following cpu load figures for a 768k SIP SRTP call:

OpenSSL/HMAC 16.3%
Trivial code 11.5%
Fast version 10.8%

The OpenSSL/HMAC numbers depend a lot on your system allocator. On my system, that sucks donkey balls so it would be considered wise to do a benchmark on your own. The interesting numbers to compare are between the trivial, readable code and my less structured one. A free 0.7% less load seems pretty reasonable. Take your pick.


www.ignorantus.com