From 417cda7061ceb50e9dc1d1e4be702f0026071e53 Mon Sep 17 00:00:00 2001 From: Oliver Jowett Date: Fri, 27 Jan 2017 21:58:12 +0000 Subject: [PATCH] Make SC16Q11-nodc conversions table-based for speed. Add a mechanism for converters to initialize tables on demand. Move UC8 table setup to the new lazy-setup path. Fix uc8 lookup table allocation size. --- convert.c | 243 ++++++++++++++++++++++++++++++++++---------- convert_benchmark.c | 76 ++++++++++---- dump1090.c | 20 +--- dump1090.h | 1 - 4 files changed, 252 insertions(+), 88 deletions(-) diff --git a/convert.c b/convert.c index 06b753a..fc2a7ae 100644 --- a/convert.c +++ b/convert.c @@ -26,6 +26,36 @@ struct converter_state { float z1_Q; }; +static uint16_t *uc8_lookup; +static bool init_uc8_lookup() +{ + if (uc8_lookup) + return true; + + uc8_lookup = malloc(sizeof(uint16_t) * 256 * 256); + if (!uc8_lookup) { + fprintf(stderr, "can't allocate UC8 conversion lookup table\n"); + return false; + } + + for (int i = 0; i <= 255; i++) { + for (int q = 0; q <= 255; q++) { + float fI, fQ, magsq; + + fI = (i - 127.5) / 127.5; + fQ = (q - 127.5) / 127.5; + magsq = fI * fI + fQ * fQ; + if (magsq > 1) + magsq = 1; + float mag = sqrtf(magsq); + + uc8_lookup[le16toh((i*256)+q)] = (uint16_t) (mag * 65535.0f + 0.5f); + } + } + + return true; +} + static void convert_uc8_nodc(void *iq_data, uint16_t *mag_data, unsigned nsamples, @@ -42,55 +72,33 @@ static void convert_uc8_nodc(void *iq_data, MODES_NOTUSED(state); // unroll this a bit + +#define DO_ONE_SAMPLE \ + do { \ + mag = uc8_lookup[*in++]; \ + *mag_data++ = mag; \ + sum_level += mag; \ + sum_power += (uint32_t)mag * (uint32_t)mag; \ + } while(0) + + // unroll this a bit for (i = 0; i < (nsamples>>3); ++i) { - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - 
*mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; + DO_ONE_SAMPLE; } for (i = 0; i < (nsamples&7); ++i) { - mag = Modes.maglut[*in++]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; + DO_ONE_SAMPLE; } +#undef DO_ONE_SAMPLE + if (out_mean_level) { *out_mean_level = sum_level / 65536.0 / nsamples; } @@ -204,6 +212,125 @@ static void convert_sc16_generic(void *iq_data, } } +static void convert_sc16_nodc(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) +{ + MODES_NOTUSED(state); + + uint16_t *in = iq_data; + + unsigned i; + int16_t I, Q; + float fI, fQ, magsq; + float sum_level = 0, sum_power = 0; + + for (i = 0; i < nsamples; ++i) { + I = (int16_t)le16toh(*in++); + Q = (int16_t)le16toh(*in++); + fI = I / 32768.0f; + fQ = Q / 32768.0f; + + magsq = fI * fI + fQ * fQ; + if (magsq > 1) + magsq = 1; + + float mag = sqrtf(magsq); + sum_power += magsq; + sum_level += mag; + *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); + } + + if (out_mean_level) { + *out_mean_level = sum_level / nsamples; + } + + if (out_mean_power) { + *out_mean_power = sum_power / nsamples; + } +} + +// SC16Q11_TABLE_BITS controls the size 
of the lookup table +// for SC16Q11 data. The size of the table is 2 * (1 << (2*BITS)) +// bytes. Reducing the number of bits reduces precision but +// can run substantially faster by staying in cache. +// See convert_benchmark.c for some numbers. + +// Leaving SC16Q11_TABLE_BITS undefined will disable the table lookup and always use +// the floating-point path, which may be faster on some systems + +#if defined(SC16Q11_TABLE_BITS) + +#define USE_BITS SC16Q11_TABLE_BITS +#define LOSE_BITS (11 - SC16Q11_TABLE_BITS) + +static uint16_t *sc16q11_lookup; +static bool init_sc16q11_lookup() +{ + if (sc16q11_lookup) + return true; + + sc16q11_lookup = malloc(sizeof(uint16_t) * (1 << (USE_BITS * 2))); + if (!sc16q11_lookup) { + fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); + return false; + } + + for (int i = 0; i < 2048; i += (1 << LOSE_BITS)) { + for (int q = 0; q < 2048; q += (1 << LOSE_BITS)) { + float fI = i / 2048.0, fQ = q / 2048.0; + float magsq = fI * fI + fQ * fQ; + if (magsq > 1) + magsq = 1; + float mag = sqrtf(magsq); + + unsigned index = ((i >> LOSE_BITS) << USE_BITS) | (q >> LOSE_BITS); + sc16q11_lookup[index] = (uint16_t)(mag * 65535.0f + 0.5f); + } + } + + return true; +} + +static void convert_sc16q11_table(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) +{ + uint16_t *in = iq_data; + unsigned i; + uint16_t I, Q; + uint64_t sum_level = 0; + uint64_t sum_power = 0; + uint16_t mag; + + MODES_NOTUSED(state); + + for (i = 0; i < nsamples; ++i) { + I = abs((int16_t)le16toh(*in++)) & 2047; + Q = abs((int16_t)le16toh(*in++)) & 2047; + mag = sc16q11_lookup[((I >> LOSE_BITS) << USE_BITS) | (Q >> LOSE_BITS)]; + *mag_data++ = mag; + sum_level += mag; + sum_power += (uint32_t)mag * (uint32_t)mag; + } + + if (out_mean_level) { + *out_mean_level = sum_level / 65536.0 / nsamples; + } + + if (out_mean_power) { + *out_mean_power = sum_power / 65535.0 / 
65535.0 / nsamples; + } +} + +#else /* ! defined(SC16Q11_TABLE_BITS) */ + static void convert_sc16q11_nodc(void *iq_data, uint16_t *mag_data, unsigned nsamples, @@ -211,14 +338,15 @@ static void convert_sc16q11_nodc(void *iq_data, double *out_mean_level, double *out_mean_power) { + MODES_NOTUSED(state); + uint16_t *in = iq_data; + unsigned i; int16_t I, Q; float fI, fQ, magsq; float sum_level = 0, sum_power = 0; - MODES_NOTUSED(state); - for (i = 0; i < nsamples; ++i) { I = (int16_t)le16toh(*in++); Q = (int16_t)le16toh(*in++); @@ -244,6 +372,8 @@ static void convert_sc16q11_nodc(void *iq_data, } } +#endif /* defined(SC16Q11_TABLE_BITS) */ + static void convert_sc16q11_generic(void *iq_data, uint16_t *mag_data, unsigned nsamples, @@ -301,14 +431,20 @@ static struct { int can_filter_dc; iq_convert_fn fn; const char *description; + bool (*init)(); } converters_table[] = { // In order of preference - { INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path" }, - { INPUT_UC8, 1, convert_uc8_generic, "UC8, float path" }, - { INPUT_SC16, 1, convert_sc16_generic, "SC16, float path" }, - { INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC block" }, - { INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path" }, - { 0, 0, NULL, NULL } + { INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path", init_uc8_lookup }, + { INPUT_UC8, 1, convert_uc8_generic, "UC8, float path", NULL }, + { INPUT_SC16, 0, convert_sc16_nodc, "SC16, float path, no DC", NULL }, + { INPUT_SC16, 1, convert_sc16_generic, "SC16, float path", NULL }, +#if defined(SC16Q11_TABLE_BITS) + { INPUT_SC16Q11, 0, convert_sc16q11_table, "SC16Q11, integer/table path", init_sc16q11_lookup }, +#else + { INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC", NULL }, +#endif + { INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path", NULL }, + { 0, 0, NULL, NULL, NULL } }; iq_convert_fn init_converter(input_format_t format, @@ -332,6 +468,11 @@ iq_convert_fn 
init_converter(input_format_t format, return NULL; } + if (converters_table[i].init) { + if (!converters_table[i].init()) + return NULL; + } + *out_state = malloc(sizeof(struct converter_state)); if (! *out_state) { fprintf(stderr, "can't allocate converter state\n"); diff --git a/convert_benchmark.c b/convert_benchmark.c index 0348b81..7a0c523 100644 --- a/convert_benchmark.c +++ b/convert_benchmark.c @@ -1,35 +1,75 @@ #include "dump1090.h" -static uint8_t *testdata_uc8; -static uint16_t *testdata_sc16; -static uint16_t *testdata_sc16q11; +static void **testdata_uc8; +static void **testdata_sc16; +static void **testdata_sc16q11; static uint16_t *outdata; +// SC16Q11_TABLE_BITS notes: + +// 11 bits (8MB) gives you full precision, but a large table that doesn't fit in cache +// 9 bits (512kB) will fit in the Pi 2/3's shared L2 cache +// (but there will be contention from other cores) +// 8 bits (128kB) will fit in the Pi 1's L2 cache +// 7 bits (32kB) will fit in the Pi 1/2/3's L1 cache + +// Sample results for "SC16Q11, no DC": + +// Core i7-3610QM @ 2300MHz +// SC16Q11_TABLE_BITS undefined: 152.80M samples/second +// SC16Q11_TABLE_BITS=11: 101.22M samples/second +// SC16Q11_TABLE_BITS=9: 243.04M samples/second +// SC16Q11_TABLE_BITS=8: 316.84M samples/second +// SC16Q11_TABLE_BITS=7: 375.70M samples/second + +// Pi3B @ 1200MHz +// SC16Q11_TABLE_BITS undefined: 22.19M samples/second +// SC16Q11_TABLE_BITS=11: 5.86M samples/second +// SC16Q11_TABLE_BITS=9: 19.33M samples/second +// SC16Q11_TABLE_BITS=8: 33.50M samples/second +// SC16Q11_TABLE_BITS=7: 59.78M samples/second + +// Pi1B @ 700MHz +// SC16Q11_TABLE_BITS undefined: 5.24M samples/second +// SC16Q11_TABLE_BITS=11: 2.53M samples/second +// SC16Q11_TABLE_BITS=9: 3.23M samples/second +// SC16Q11_TABLE_BITS=8: 5.77M samples/second +// SC16Q11_TABLE_BITS=7: 10.23M samples/second + void prepare() { srand(1); - testdata_uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2); - testdata_sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4); - 
testdata_sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4); + testdata_uc8 = calloc(10, sizeof(void*)); + testdata_sc16 = calloc(10, sizeof(void*)); + testdata_sc16q11 = calloc(10, sizeof(void*)); outdata = calloc(MODES_MAG_BUF_SAMPLES, sizeof(uint16_t)); - for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) { - double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0; - double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0; + for (int buf = 0; buf < 10; ++buf) { + uint8_t *uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2); + testdata_uc8[buf] = uc8;; + uint16_t *sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4); + testdata_sc16[buf] = sc16; + uint16_t *sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4); + testdata_sc16q11[buf] = sc16q11; - testdata_uc8[i*2] = (uint8_t) (I * 128 + 128); - testdata_uc8[i*2+1] = (uint8_t) (Q * 128 + 128); + for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) { + double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0; + double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0; - testdata_sc16[i*2] = htole16( (int16_t) (I * 32768.0) ); - testdata_sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) ); + uc8[i*2] = (uint8_t) (I * 128 + 128); + uc8[i*2+1] = (uint8_t) (Q * 128 + 128); - testdata_sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) ); - testdata_sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) ); + sc16[i*2] = htole16( (int16_t) (I * 32768.0) ); + sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) ); + + sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) ); + sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) ); + } } } -void test(const char *what, input_format_t format, void *data, double sample_rate, bool filter_dc) { +void test(const char *what, input_format_t format, void **data, double sample_rate, bool filter_dc) { fprintf(stderr, "Benchmarking: %s ", what); struct converter_state *state; @@ -43,7 +83,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat int iterations = 0; // Run it once to force init. 
- converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + converter(data[0], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); while (total.tv_sec < 5) { fprintf(stderr, "."); @@ -52,7 +92,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat start_cpu_timing(&start); for (int i = 0; i < 10; ++i) { - converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); } end_cpu_timing(&start, &total); diff --git a/dump1090.c b/dump1090.c index 4003c74..d3d6076 100644 --- a/dump1090.c +++ b/dump1090.c @@ -129,7 +129,7 @@ void modesInitConfig(void) { //========================================================================= // void modesInit(void) { - int i, q; + int i; pthread_mutex_init(&Modes.data_mutex,NULL); pthread_cond_init(&Modes.data_cond,NULL); @@ -139,8 +139,7 @@ void modesInit(void) { // Allocate the various buffers used by Modes Modes.trailing_samples = (MODES_PREAMBLE_US + MODES_LONG_MSG_BITS + 16) * 1e-6 * Modes.sample_rate; - if ( ((Modes.maglut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) || - ((Modes.log10lut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) ) + if ( ((Modes.log10lut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) ) { fprintf(stderr, "Out of memory allocating data buffer.\n"); exit(1); @@ -184,21 +183,6 @@ void modesInit(void) { if (Modes.net_sndbuf_size > (MODES_NET_SNDBUF_MAX)) {Modes.net_sndbuf_size = MODES_NET_SNDBUF_MAX;} - // compute UC8 magnitude lookup table - for (i = 0; i <= 255; i++) { - for (q = 0; q <= 255; q++) { - float fI, fQ, magsq; - - fI = (i - 127.5) / 127.5; - fQ = (q - 127.5) / 127.5; - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - Modes.maglut[le16toh((i*256)+q)] = (uint16_t) round(sqrtf(magsq) * 65535.0); - } - } - // Prepare the log10 lookup table: 100log10(x) Modes.log10lut[0] = 0; // poorly defined.. 
for (i = 1; i <= 65535; i++) { diff --git a/dump1090.h b/dump1090.h index 2b26a3e..07c2247 100644 --- a/dump1090.h +++ b/dump1090.h @@ -276,7 +276,6 @@ struct { // Internal state unsigned trailing_samples; // extra trailing samples in magnitude buffers double sample_rate; // actual sample rate in use (in hz) - uint16_t *maglut; // I/Q -> Magnitude lookup table uint16_t *log10lut; // Magnitude -> log10 lookup table int exit; // Exit from the main loop when true