From 417cda7061ceb50e9dc1d1e4be702f0026071e53 Mon Sep 17 00:00:00 2001
From: Oliver Jowett <oliver@mutability.co.uk>
Date: Fri, 27 Jan 2017 21:58:12 +0000
Subject: [PATCH] Make SC16Q11-nodc conversions table-based for speed. Add a
 mechanism for converters to initialize tables on demand. Move UC8 table setup
 to the new lazy-setup path. Fix uc8 lookup table allocation size.

---
 convert.c           | 243 ++++++++++++++++++++++++++++++++++----------
 convert_benchmark.c |  76 ++++++++++----
 dump1090.c          |  20 +---
 dump1090.h          |   1 -
 4 files changed, 252 insertions(+), 88 deletions(-)

diff --git a/convert.c b/convert.c
index 06b753a..fc2a7ae 100644
--- a/convert.c
+++ b/convert.c
@@ -26,6 +26,36 @@ struct converter_state {
     float z1_Q;
 };
 
+static uint16_t *uc8_lookup;
+static bool init_uc8_lookup()
+{
+    if (uc8_lookup)
+        return true;
+
+    uc8_lookup = malloc(sizeof(uint16_t) * 256 * 256);
+    if (!uc8_lookup) {
+        fprintf(stderr, "can't allocate UC8 conversion lookup table\n");
+        return false;
+    }
+
+    for (int i = 0; i <= 255; i++) {
+        for (int q = 0; q <= 255; q++) {
+            float fI, fQ, magsq;
+
+            fI = (i - 127.5) / 127.5;
+            fQ = (q - 127.5) / 127.5;
+            magsq = fI * fI + fQ * fQ;
+            if (magsq > 1)
+                magsq = 1;
+            float mag = sqrtf(magsq);
+
+            uc8_lookup[le16toh((i*256)+q)] = (uint16_t) (mag * 65535.0f + 0.5f);
+        }
+    }
+
+    return true;
+}
+
 static void convert_uc8_nodc(void *iq_data,
                              uint16_t *mag_data,
                              unsigned nsamples,
@@ -42,55 +72,33 @@ static void convert_uc8_nodc(void *iq_data,
     MODES_NOTUSED(state);
 
     // unroll this a bit
+
+#define DO_ONE_SAMPLE \
+    do {                                            \
+        mag = uc8_lookup[*in++];                    \
+        *mag_data++ = mag;                          \
+        sum_level += mag;                           \
+        sum_power += (uint32_t)mag * (uint32_t)mag; \
+    } while(0)
+
+    // eight samples per pass; the loop after this one handles the nsamples % 8 remainder
     for (i = 0; i < (nsamples>>3); ++i) {
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
-
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
+        DO_ONE_SAMPLE;
     }
 
     for (i = 0; i < (nsamples&7); ++i) {
-        mag = Modes.maglut[*in++];
-        *mag_data++ = mag;
-        sum_level += mag;
-        sum_power += (uint32_t)mag * (uint32_t)mag;
+        DO_ONE_SAMPLE;
     }
 
+#undef DO_ONE_SAMPLE
+
     if (out_mean_level) {
         *out_mean_level = sum_level / 65536.0 / nsamples;
     }
@@ -204,6 +212,125 @@ static void convert_sc16_generic(void *iq_data,
     }
 }
 
+static void convert_sc16_nodc(void *iq_data,
+                              uint16_t *mag_data,
+                              unsigned nsamples,
+                              struct converter_state *state,
+                              double *out_mean_level,
+                              double *out_mean_power)
+{
+    MODES_NOTUSED(state);
+
+    uint16_t *in = iq_data;
+
+    unsigned i;
+    int16_t I, Q;
+    float fI, fQ, magsq;
+    float sum_level = 0, sum_power = 0;
+
+    for (i = 0; i < nsamples; ++i) {
+        I = (int16_t)le16toh(*in++);
+        Q = (int16_t)le16toh(*in++);
+        fI = I / 32768.0f;
+        fQ = Q / 32768.0f;
+
+        magsq = fI * fI + fQ * fQ;
+        if (magsq > 1)
+            magsq = 1;
+
+        float mag = sqrtf(magsq);
+        sum_power += magsq;
+        sum_level += mag;
+        *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f);
+    }
+
+    if (out_mean_level) {
+        *out_mean_level = sum_level / nsamples;
+    }
+
+    if (out_mean_power) {
+        *out_mean_power = sum_power / nsamples;
+    }
+}
+
+// SC16Q11_TABLE_BITS controls the size of the lookup table
+// for SC16Q11 data. The size of the table is 2 * (1 << (2*BITS))
+// bytes. Reducing the number of bits reduces precision but
+// can run substantially faster by staying in cache.
+// See convert_benchmark.c for some numbers.
+
+// Leaving SC16Q11_TABLE_BITS undefined will disable the table lookup and always use
+// the floating-point path, which may be faster on some systems.
+
+#if defined(SC16Q11_TABLE_BITS)
+
+#define USE_BITS SC16Q11_TABLE_BITS
+#define LOSE_BITS (11 - SC16Q11_TABLE_BITS)
+
+static uint16_t *sc16q11_lookup;
+static bool init_sc16q11_lookup()
+{
+    if (sc16q11_lookup)
+        return true;
+
+    sc16q11_lookup = malloc(sizeof(uint16_t) * (1 << (USE_BITS * 2)));
+    if (!sc16q11_lookup) {
+        fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n");
+        return false;
+    }
+
+    for (int i = 0; i < 2048; i += (1 << LOSE_BITS)) {
+        for (int q = 0; q < 2048; q += (1 << LOSE_BITS)) {
+            float fI = i / 2048.0, fQ = q / 2048.0;
+            float magsq = fI * fI + fQ * fQ;
+            if (magsq > 1)
+                magsq = 1;
+            float mag = sqrtf(magsq);
+
+            unsigned index = ((i >> LOSE_BITS) << USE_BITS) | (q >> LOSE_BITS);
+            sc16q11_lookup[index] = (uint16_t)(mag * 65535.0f + 0.5f);
+        }
+    }
+
+    return true;
+}
+
+static void convert_sc16q11_table(void *iq_data,
+                                  uint16_t *mag_data,
+                                  unsigned nsamples,
+                                  struct converter_state *state,
+                                  double *out_mean_level,
+                                  double *out_mean_power)
+{
+    uint16_t *in = iq_data;
+    unsigned i;
+    uint16_t I, Q;
+    uint64_t sum_level = 0;
+    uint64_t sum_power = 0;
+    uint16_t mag;
+
+    MODES_NOTUSED(state);
+
+    for (i = 0; i < nsamples; ++i) {
+        I = abs((int16_t)le16toh(*in++)) & 2047;
+        Q = abs((int16_t)le16toh(*in++)) & 2047;
+        mag = sc16q11_lookup[((I >> LOSE_BITS) << USE_BITS) | (Q >> LOSE_BITS)];
+        *mag_data++ = mag;
+        sum_level += mag;
+        sum_power += (uint32_t)mag * (uint32_t)mag;
+    }
+
+    if (out_mean_level) {
+        *out_mean_level = sum_level / 65536.0 / nsamples;
+    }
+
+    if (out_mean_power) {
+        *out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples;
+    }
+}
+
+#else /* ! defined(SC16Q11_TABLE_BITS) */
+
 static void convert_sc16q11_nodc(void *iq_data,
                                  uint16_t *mag_data,
                                  unsigned nsamples,
@@ -211,14 +338,15 @@ static void convert_sc16q11_nodc(void *iq_data,
                                  double *out_mean_level,
                                  double *out_mean_power)
 {
+    MODES_NOTUSED(state);
+
     uint16_t *in = iq_data;
+
     unsigned i;
     int16_t I, Q;
     float fI, fQ, magsq;
     float sum_level = 0, sum_power = 0;
 
-    MODES_NOTUSED(state);
-
     for (i = 0; i < nsamples; ++i) {
         I = (int16_t)le16toh(*in++);
         Q = (int16_t)le16toh(*in++);
@@ -244,6 +372,8 @@ static void convert_sc16q11_nodc(void *iq_data,
     }
 }
 
+#endif /* defined(SC16Q11_TABLE_BITS) */
+
 static void convert_sc16q11_generic(void *iq_data,
                                     uint16_t *mag_data,
                                     unsigned nsamples,
@@ -301,14 +431,20 @@ static struct {
     int can_filter_dc;
     iq_convert_fn fn;
     const char *description;
+    bool (*init)();
 } converters_table[] = {
     // In order of preference
-    { INPUT_UC8,     0, convert_uc8_nodc,         "UC8, integer/table path" },
-    { INPUT_UC8,     1, convert_uc8_generic,      "UC8, float path" },
-    { INPUT_SC16,    1, convert_sc16_generic,     "SC16, float path" },
-    { INPUT_SC16Q11, 0, convert_sc16q11_nodc,     "SC16Q11, float path, no DC block" },
-    { INPUT_SC16Q11, 1, convert_sc16q11_generic,  "SC16Q11, float path" },
-    { 0, 0, NULL, NULL }
+    { INPUT_UC8,          0, convert_uc8_nodc,         "UC8, integer/table path", init_uc8_lookup },
+    { INPUT_UC8,          1, convert_uc8_generic,      "UC8, float path", NULL },
+    { INPUT_SC16,         0, convert_sc16_nodc,        "SC16, float path, no DC", NULL },
+    { INPUT_SC16,         1, convert_sc16_generic,     "SC16, float path", NULL },
+#if defined(SC16Q11_TABLE_BITS)
+    { INPUT_SC16Q11,      0, convert_sc16q11_table,    "SC16Q11, integer/table path", init_sc16q11_lookup },
+#else
+    { INPUT_SC16Q11,      0, convert_sc16q11_nodc,     "SC16Q11, float path, no DC", NULL },
+#endif
+    { INPUT_SC16Q11,      1, convert_sc16q11_generic,  "SC16Q11, float path", NULL },
+    { 0, 0, NULL, NULL, NULL }
 };
 
 iq_convert_fn init_converter(input_format_t format,
@@ -332,6 +468,11 @@ iq_convert_fn init_converter(input_format_t format,
         return NULL;
     }
 
+    if (converters_table[i].init) {
+        if (!converters_table[i].init())
+            return NULL;
+    }
+
     *out_state = malloc(sizeof(struct converter_state));
     if (! *out_state) {
         fprintf(stderr, "can't allocate converter state\n");
diff --git a/convert_benchmark.c b/convert_benchmark.c
index 0348b81..7a0c523 100644
--- a/convert_benchmark.c
+++ b/convert_benchmark.c
@@ -1,35 +1,75 @@
 #include "dump1090.h"
 
-static uint8_t *testdata_uc8;
-static uint16_t *testdata_sc16;
-static uint16_t *testdata_sc16q11;
+static void **testdata_uc8;
+static void **testdata_sc16;
+static void **testdata_sc16q11;
 static uint16_t *outdata;
 
+// SC16Q11_TABLE_BITS notes:
+
+// 11 bits (8MB) gives you full precision, but a large table that doesn't fit in cache
+// 9 bits (512kB) will fit in the Pi 2/3's shared L2 cache
+//   (but there will be contention from other cores)
+// 8 bits (128kB) will fit in the Pi 1's L2 cache
+// 7 bits (32kB) will fit in the Pi 1/2/3's L1 cache
+
+// Sample results for "SC16Q11, no DC":
+
+// Core i7-3610QM @ 2300MHz
+// SC16Q11_TABLE_BITS undefined: 152.80M samples/second
+// SC16Q11_TABLE_BITS=11:        101.22M samples/second
+// SC16Q11_TABLE_BITS=9:         243.04M samples/second
+// SC16Q11_TABLE_BITS=8:         316.84M samples/second
+// SC16Q11_TABLE_BITS=7:         375.70M samples/second
+
+// Pi3B @ 1200MHz
+// SC16Q11_TABLE_BITS undefined: 22.19M samples/second
+// SC16Q11_TABLE_BITS=11:         5.86M samples/second
+// SC16Q11_TABLE_BITS=9:         19.33M samples/second
+// SC16Q11_TABLE_BITS=8:         33.50M samples/second
+// SC16Q11_TABLE_BITS=7:         59.78M samples/second
+
+// Pi1B @ 700MHz
+// SC16Q11_TABLE_BITS undefined:  5.24M samples/second
+// SC16Q11_TABLE_BITS=11:         2.53M samples/second
+// SC16Q11_TABLE_BITS=9:          3.23M samples/second
+// SC16Q11_TABLE_BITS=8:          5.77M samples/second
+// SC16Q11_TABLE_BITS=7:         10.23M samples/second
+
 void prepare()
 {
     srand(1);
 
-    testdata_uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2);
-    testdata_sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4);
-    testdata_sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4);
+    testdata_uc8 = calloc(10, sizeof(void*));
+    testdata_sc16 = calloc(10, sizeof(void*));
+    testdata_sc16q11 = calloc(10, sizeof(void*));
     outdata = calloc(MODES_MAG_BUF_SAMPLES, sizeof(uint16_t));
 
-    for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) {
-        double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
-        double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
+    for (int buf = 0; buf < 10; ++buf) {
+        uint8_t *uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2);
+        testdata_uc8[buf] = uc8;;
+        uint16_t *sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4);
+        testdata_sc16[buf] = sc16;
+        uint16_t *sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4);
+        testdata_sc16q11[buf] = sc16q11;
 
-        testdata_uc8[i*2] = (uint8_t) (I * 128 + 128);
-        testdata_uc8[i*2+1] = (uint8_t) (Q * 128 + 128);
+        for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) {
+            double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
+            double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
 
-        testdata_sc16[i*2] = htole16( (int16_t) (I * 32768.0) );
-        testdata_sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) );
+            uc8[i*2] = (uint8_t) (I * 128 + 128);
+            uc8[i*2+1] = (uint8_t) (Q * 128 + 128);
 
-        testdata_sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) );
-        testdata_sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) );
+            sc16[i*2] = htole16( (int16_t) (I * 32768.0) );
+            sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) );
+
+            sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) );
+            sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) );
+        }
     }
 }
 
-void test(const char *what, input_format_t format, void *data, double sample_rate, bool filter_dc) {
+void test(const char *what, input_format_t format, void **data, double sample_rate, bool filter_dc) {
     fprintf(stderr, "Benchmarking: %s ", what);
 
     struct converter_state *state;
@@ -43,7 +83,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat
     int iterations = 0;
 
     // Run it once to force init.
-    converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
+    converter(data[0], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
 
     while (total.tv_sec < 5) {
         fprintf(stderr, ".");
@@ -52,7 +92,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat
         start_cpu_timing(&start);
 
         for (int i = 0; i < 10; ++i) {
-            converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
+            converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
         }
 
         end_cpu_timing(&start, &total);
diff --git a/dump1090.c b/dump1090.c
index 4003c74..d3d6076 100644
--- a/dump1090.c
+++ b/dump1090.c
@@ -129,7 +129,7 @@ void modesInitConfig(void) {
 //=========================================================================
 //
 void modesInit(void) {
-    int i, q;
+    int i;
 
     pthread_mutex_init(&Modes.data_mutex,NULL);
     pthread_cond_init(&Modes.data_cond,NULL);
@@ -139,8 +139,7 @@ void modesInit(void) {
     // Allocate the various buffers used by Modes
     Modes.trailing_samples = (MODES_PREAMBLE_US + MODES_LONG_MSG_BITS + 16) * 1e-6 * Modes.sample_rate;
 
-    if ( ((Modes.maglut     = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256)                                 ) == NULL) ||
-         ((Modes.log10lut   = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256)                                 ) == NULL) )
+    if ( ((Modes.log10lut   = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256)                                 ) == NULL) )
     {
         fprintf(stderr, "Out of memory allocating data buffer.\n");
         exit(1);
@@ -184,21 +183,6 @@ void modesInit(void) {
     if (Modes.net_sndbuf_size > (MODES_NET_SNDBUF_MAX))
       {Modes.net_sndbuf_size = MODES_NET_SNDBUF_MAX;}
 
-    // compute UC8 magnitude lookup table
-    for (i = 0; i <= 255; i++) {
-        for (q = 0; q <= 255; q++) {
-            float fI, fQ, magsq;
-
-            fI = (i - 127.5) / 127.5;
-            fQ = (q - 127.5) / 127.5;
-            magsq = fI * fI + fQ * fQ;
-            if (magsq > 1)
-                magsq = 1;
-
-            Modes.maglut[le16toh((i*256)+q)] = (uint16_t) round(sqrtf(magsq) * 65535.0);
-        }
-    }
-
     // Prepare the log10 lookup table: 100log10(x)
     Modes.log10lut[0] = 0; // poorly defined..
     for (i = 1; i <= 65535; i++) {
diff --git a/dump1090.h b/dump1090.h
index 2b26a3e..07c2247 100644
--- a/dump1090.h
+++ b/dump1090.h
@@ -276,7 +276,6 @@ struct {                             // Internal state
     unsigned        trailing_samples;                     // extra trailing samples in magnitude buffers
     double          sample_rate;                          // actual sample rate in use (in hz)
 
-    uint16_t       *maglut;          // I/Q -> Magnitude lookup table
     uint16_t       *log10lut;        // Magnitude -> log10 lookup table
     int             exit;            // Exit from the main loop when true