Make SC16Q11-nodc conversions table-based for speed.
Add a mechanism for converters to initialize tables on demand. Move UC8 table setup to the new lazy-setup path. Fix uc8 lookup table allocation size.
This commit is contained in:
parent
8a41bcb730
commit
417cda7061
243
convert.c
243
convert.c
|
@ -26,6 +26,36 @@ struct converter_state {
|
|||
float z1_Q;
|
||||
};
|
||||
|
||||
static uint16_t *uc8_lookup;
|
||||
static bool init_uc8_lookup()
|
||||
{
|
||||
if (uc8_lookup)
|
||||
return true;
|
||||
|
||||
uc8_lookup = malloc(sizeof(uint16_t) * 256 * 256);
|
||||
if (!uc8_lookup) {
|
||||
fprintf(stderr, "can't allocate UC8 conversion lookup table\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i <= 255; i++) {
|
||||
for (int q = 0; q <= 255; q++) {
|
||||
float fI, fQ, magsq;
|
||||
|
||||
fI = (i - 127.5) / 127.5;
|
||||
fQ = (q - 127.5) / 127.5;
|
||||
magsq = fI * fI + fQ * fQ;
|
||||
if (magsq > 1)
|
||||
magsq = 1;
|
||||
float mag = sqrtf(magsq);
|
||||
|
||||
uc8_lookup[le16toh((i*256)+q)] = (uint16_t) (mag * 65535.0f + 0.5f);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void convert_uc8_nodc(void *iq_data,
|
||||
uint16_t *mag_data,
|
||||
unsigned nsamples,
|
||||
|
@ -42,55 +72,33 @@ static void convert_uc8_nodc(void *iq_data,
|
|||
MODES_NOTUSED(state);
|
||||
|
||||
// unroll this a bit
|
||||
|
||||
#define DO_ONE_SAMPLE \
|
||||
do { \
|
||||
mag = uc8_lookup[*in++]; \
|
||||
*mag_data++ = mag; \
|
||||
sum_level += mag; \
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag; \
|
||||
} while(0)
|
||||
|
||||
// unroll this a bit
|
||||
for (i = 0; i < (nsamples>>3); ++i) {
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
DO_ONE_SAMPLE;
|
||||
}
|
||||
|
||||
for (i = 0; i < (nsamples&7); ++i) {
|
||||
mag = Modes.maglut[*in++];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
DO_ONE_SAMPLE;
|
||||
}
|
||||
|
||||
#undef DO_ONE_SAMPLE
|
||||
|
||||
if (out_mean_level) {
|
||||
*out_mean_level = sum_level / 65536.0 / nsamples;
|
||||
}
|
||||
|
@ -204,6 +212,125 @@ static void convert_sc16_generic(void *iq_data,
|
|||
}
|
||||
}
|
||||
|
||||
// Convert SC16 I/Q samples to magnitudes on the float path, with no DC filtering.
//
//   iq_data         nsamples pairs of 16-bit values (I then Q), each read
//                   through le16toh and interpreted as int16_t
//   mag_data        receives nsamples magnitudes scaled to 0..65535
//   nsamples        number of I/Q pairs to convert
//   state           unused by this converter
//   out_mean_level  if non-NULL, receives the mean magnitude (1.0 = full scale)
//   out_mean_power  if non-NULL, receives the mean squared magnitude
//
// When nsamples is 0 both means are reported as 0.0 rather than the NaN that
// a 0/0 division would produce.
static void convert_sc16_nodc(void *iq_data,
                              uint16_t *mag_data,
                              unsigned nsamples,
                              struct converter_state *state,
                              double *out_mean_level,
                              double *out_mean_power)
{
    MODES_NOTUSED(state);

    uint16_t *in = iq_data;

    // Accumulate in double: a float running sum over a large buffer loses
    // precision once the sum is much larger than each per-sample addend,
    // and the results are reported as double anyway.
    double sum_level = 0, sum_power = 0;

    for (unsigned i = 0; i < nsamples; ++i) {
        int16_t I = (int16_t)le16toh(*in++);
        int16_t Q = (int16_t)le16toh(*in++);
        float fI = I / 32768.0f;
        float fQ = Q / 32768.0f;

        float magsq = fI * fI + fQ * fQ;
        if (magsq > 1)
            magsq = 1;  // saturate so the scaled magnitude fits in uint16_t

        float mag = sqrtf(magsq);
        sum_power += magsq;
        sum_level += mag;
        *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f);
    }

    if (out_mean_level) {
        *out_mean_level = nsamples ? sum_level / nsamples : 0.0;
    }

    if (out_mean_power) {
        *out_mean_power = nsamples ? sum_power / nsamples : 0.0;
    }
}
|
||||
|
||||
// SC16Q11_TABLE_BITS controls the size of the lookup table
|
||||
// for SC16Q11 data. The size of the table is 2 * (1 << (2*BITS))
|
||||
// bytes. Reducing the number of bits reduces precision but
|
||||
// can run substantially faster by staying in cache.
|
||||
// See convert_benchmark.c for some numbers.
|
||||
|
||||
// Leaving SC16Q11_TABLE_BITS undefined will disable the table lookup and always use
|
||||
// the floating-point path, which may be faster on some systems
|
||||
|
||||
#if defined(SC16Q11_TABLE_BITS)
|
||||
|
||||
#define USE_BITS SC16Q11_TABLE_BITS
|
||||
#define LOSE_BITS (11 - SC16Q11_TABLE_BITS)
|
||||
|
||||
static uint16_t *sc16q11_lookup;
|
||||
static bool init_sc16q11_lookup()
|
||||
{
|
||||
if (sc16q11_lookup)
|
||||
return true;
|
||||
|
||||
sc16q11_lookup = malloc(sizeof(uint16_t) * (1 << (USE_BITS * 2)));
|
||||
if (!sc16q11_lookup) {
|
||||
fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2048; i += (1 << LOSE_BITS)) {
|
||||
for (int q = 0; q < 2048; q += (1 << LOSE_BITS)) {
|
||||
float fI = i / 2048.0, fQ = q / 2048.0;
|
||||
float magsq = fI * fI + fQ * fQ;
|
||||
if (magsq > 1)
|
||||
magsq = 1;
|
||||
float mag = sqrtf(magsq);
|
||||
|
||||
unsigned index = ((i >> LOSE_BITS) << USE_BITS) | (q >> LOSE_BITS);
|
||||
sc16q11_lookup[index] = (uint16_t)(mag * 65535.0f + 0.5f);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void convert_sc16q11_table(void *iq_data,
|
||||
uint16_t *mag_data,
|
||||
unsigned nsamples,
|
||||
struct converter_state *state,
|
||||
double *out_mean_level,
|
||||
double *out_mean_power)
|
||||
{
|
||||
uint16_t *in = iq_data;
|
||||
unsigned i;
|
||||
uint16_t I, Q;
|
||||
uint64_t sum_level = 0;
|
||||
uint64_t sum_power = 0;
|
||||
uint16_t mag;
|
||||
|
||||
MODES_NOTUSED(state);
|
||||
|
||||
for (i = 0; i < nsamples; ++i) {
|
||||
I = abs((int16_t)le16toh(*in++)) & 2047;
|
||||
Q = abs((int16_t)le16toh(*in++)) & 2047;
|
||||
mag = sc16q11_lookup[((I >> LOSE_BITS) << USE_BITS) | (Q >> LOSE_BITS)];
|
||||
*mag_data++ = mag;
|
||||
sum_level += mag;
|
||||
sum_power += (uint32_t)mag * (uint32_t)mag;
|
||||
}
|
||||
|
||||
if (out_mean_level) {
|
||||
*out_mean_level = sum_level / 65536.0 / nsamples;
|
||||
}
|
||||
|
||||
if (out_mean_power) {
|
||||
*out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples;
|
||||
}
|
||||
}
|
||||
|
||||
#else /* ! defined(SC16Q11_TABLE_BITS) */
|
||||
|
||||
static void convert_sc16q11_nodc(void *iq_data,
|
||||
uint16_t *mag_data,
|
||||
unsigned nsamples,
|
||||
|
@ -211,14 +338,15 @@ static void convert_sc16q11_nodc(void *iq_data,
|
|||
double *out_mean_level,
|
||||
double *out_mean_power)
|
||||
{
|
||||
MODES_NOTUSED(state);
|
||||
|
||||
uint16_t *in = iq_data;
|
||||
|
||||
unsigned i;
|
||||
int16_t I, Q;
|
||||
float fI, fQ, magsq;
|
||||
float sum_level = 0, sum_power = 0;
|
||||
|
||||
MODES_NOTUSED(state);
|
||||
|
||||
for (i = 0; i < nsamples; ++i) {
|
||||
I = (int16_t)le16toh(*in++);
|
||||
Q = (int16_t)le16toh(*in++);
|
||||
|
@ -244,6 +372,8 @@ static void convert_sc16q11_nodc(void *iq_data,
|
|||
}
|
||||
}
|
||||
|
||||
#endif /* defined(SC16Q11_TABLE_BITS) */
|
||||
|
||||
static void convert_sc16q11_generic(void *iq_data,
|
||||
uint16_t *mag_data,
|
||||
unsigned nsamples,
|
||||
|
@ -301,14 +431,20 @@ static struct {
|
|||
int can_filter_dc;
|
||||
iq_convert_fn fn;
|
||||
const char *description;
|
||||
bool (*init)();
|
||||
} converters_table[] = {
|
||||
// In order of preference
|
||||
{ INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path" },
|
||||
{ INPUT_UC8, 1, convert_uc8_generic, "UC8, float path" },
|
||||
{ INPUT_SC16, 1, convert_sc16_generic, "SC16, float path" },
|
||||
{ INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC block" },
|
||||
{ INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path" },
|
||||
{ 0, 0, NULL, NULL }
|
||||
{ INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path", init_uc8_lookup },
|
||||
{ INPUT_UC8, 1, convert_uc8_generic, "UC8, float path", NULL },
|
||||
{ INPUT_SC16, 0, convert_sc16_nodc, "SC16, float path, no DC", NULL },
|
||||
{ INPUT_SC16, 1, convert_sc16_generic, "SC16, float path", NULL },
|
||||
#if defined(SC16Q11_TABLE_BITS)
|
||||
{ INPUT_SC16Q11, 0, convert_sc16q11_table, "SC16Q11, integer/table path", init_sc16q11_lookup },
|
||||
#else
|
||||
{ INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC", NULL },
|
||||
#endif
|
||||
{ INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path", NULL },
|
||||
{ 0, 0, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
iq_convert_fn init_converter(input_format_t format,
|
||||
|
@ -332,6 +468,11 @@ iq_convert_fn init_converter(input_format_t format,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (converters_table[i].init) {
|
||||
if (!converters_table[i].init())
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*out_state = malloc(sizeof(struct converter_state));
|
||||
if (! *out_state) {
|
||||
fprintf(stderr, "can't allocate converter state\n");
|
||||
|
|
|
@ -1,35 +1,75 @@
|
|||
#include "dump1090.h"
|
||||
|
||||
static uint8_t *testdata_uc8;
|
||||
static uint16_t *testdata_sc16;
|
||||
static uint16_t *testdata_sc16q11;
|
||||
static void **testdata_uc8;
|
||||
static void **testdata_sc16;
|
||||
static void **testdata_sc16q11;
|
||||
static uint16_t *outdata;
|
||||
|
||||
// SC16Q11_TABLE_BITS notes:
|
||||
|
||||
// 11 bits (8MB) gives you full precision, but a large table that doesn't fit in cache
|
||||
// 9 bits (512kB) will fit in the Pi 2/3's shared L2 cache
|
||||
// (but there will be contention from other cores)
|
||||
// 8 bits (128kB) will fit in the Pi 1's L2 cache
|
||||
// 7 bits (32kB) will fit in the Pi 1/2/3's L1 cache
|
||||
|
||||
// Sample results for "SC16Q11, no DC":
|
||||
|
||||
// Core i7-3610QM @ 2300MHz
|
||||
// SC16Q11_TABLE_BITS undefined: 152.80M samples/second
|
||||
// SC16Q11_TABLE_BITS=11: 101.22M samples/second
|
||||
// SC16Q11_TABLE_BITS=9: 243.04M samples/second
|
||||
// SC16Q11_TABLE_BITS=8: 316.84M samples/second
|
||||
// SC16Q11_TABLE_BITS=7: 375.70M samples/second
|
||||
|
||||
// Pi3B @ 1200MHz
|
||||
// SC16Q11_TABLE_BITS undefined: 22.19M samples/second
|
||||
// SC16Q11_TABLE_BITS=11: 5.86M samples/second
|
||||
// SC16Q11_TABLE_BITS=9: 19.33M samples/second
|
||||
// SC16Q11_TABLE_BITS=8: 33.50M samples/second
|
||||
// SC16Q11_TABLE_BITS=7: 59.78M samples/second
|
||||
|
||||
// Pi1B @ 700MHz
|
||||
// SC16Q11_TABLE_BITS undefined: 5.24M samples/second
|
||||
// SC16Q11_TABLE_BITS=11: 2.53M samples/second
|
||||
// SC16Q11_TABLE_BITS=9: 3.23M samples/second
|
||||
// SC16Q11_TABLE_BITS=8: 5.77M samples/second
|
||||
// SC16Q11_TABLE_BITS=7: 10.23M samples/second
|
||||
|
||||
void prepare()
|
||||
{
|
||||
srand(1);
|
||||
|
||||
testdata_uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2);
|
||||
testdata_sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4);
|
||||
testdata_sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4);
|
||||
testdata_uc8 = calloc(10, sizeof(void*));
|
||||
testdata_sc16 = calloc(10, sizeof(void*));
|
||||
testdata_sc16q11 = calloc(10, sizeof(void*));
|
||||
outdata = calloc(MODES_MAG_BUF_SAMPLES, sizeof(uint16_t));
|
||||
|
||||
for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) {
|
||||
double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
|
||||
double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
|
||||
for (int buf = 0; buf < 10; ++buf) {
|
||||
uint8_t *uc8 = calloc(MODES_MAG_BUF_SAMPLES, 2);
|
||||
testdata_uc8[buf] = uc8;;
|
||||
uint16_t *sc16 = calloc(MODES_MAG_BUF_SAMPLES, 4);
|
||||
testdata_sc16[buf] = sc16;
|
||||
uint16_t *sc16q11 = calloc(MODES_MAG_BUF_SAMPLES, 4);
|
||||
testdata_sc16q11[buf] = sc16q11;
|
||||
|
||||
testdata_uc8[i*2] = (uint8_t) (I * 128 + 128);
|
||||
testdata_uc8[i*2+1] = (uint8_t) (Q * 128 + 128);
|
||||
for (unsigned i = 0; i < MODES_MAG_BUF_SAMPLES; ++i) {
|
||||
double I = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
|
||||
double Q = 2.0 * rand() / (RAND_MAX + 1.0) - 1.0;
|
||||
|
||||
testdata_sc16[i*2] = htole16( (int16_t) (I * 32768.0) );
|
||||
testdata_sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) );
|
||||
uc8[i*2] = (uint8_t) (I * 128 + 128);
|
||||
uc8[i*2+1] = (uint8_t) (Q * 128 + 128);
|
||||
|
||||
testdata_sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) );
|
||||
testdata_sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) );
|
||||
sc16[i*2] = htole16( (int16_t) (I * 32768.0) );
|
||||
sc16[i*2+1] = htole16( (int16_t) (Q * 32768.0) );
|
||||
|
||||
sc16q11[i*2] = htole16( (int16_t) (I * 2048.0) );
|
||||
sc16q11[i*2+1] = htole16( (int16_t) (Q * 2048.0) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test(const char *what, input_format_t format, void *data, double sample_rate, bool filter_dc) {
|
||||
void test(const char *what, input_format_t format, void **data, double sample_rate, bool filter_dc) {
|
||||
fprintf(stderr, "Benchmarking: %s ", what);
|
||||
|
||||
struct converter_state *state;
|
||||
|
@ -43,7 +83,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat
|
|||
int iterations = 0;
|
||||
|
||||
// Run it once to force init.
|
||||
converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
|
||||
converter(data[0], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
|
||||
|
||||
while (total.tv_sec < 5) {
|
||||
fprintf(stderr, ".");
|
||||
|
@ -52,7 +92,7 @@ void test(const char *what, input_format_t format, void *data, double sample_rat
|
|||
start_cpu_timing(&start);
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
converter(data, outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
|
||||
converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL);
|
||||
}
|
||||
|
||||
end_cpu_timing(&start, &total);
|
||||
|
|
20
dump1090.c
20
dump1090.c
|
@ -129,7 +129,7 @@ void modesInitConfig(void) {
|
|||
//=========================================================================
|
||||
//
|
||||
void modesInit(void) {
|
||||
int i, q;
|
||||
int i;
|
||||
|
||||
pthread_mutex_init(&Modes.data_mutex,NULL);
|
||||
pthread_cond_init(&Modes.data_cond,NULL);
|
||||
|
@ -139,8 +139,7 @@ void modesInit(void) {
|
|||
// Allocate the various buffers used by Modes
|
||||
Modes.trailing_samples = (MODES_PREAMBLE_US + MODES_LONG_MSG_BITS + 16) * 1e-6 * Modes.sample_rate;
|
||||
|
||||
if ( ((Modes.maglut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) ||
|
||||
((Modes.log10lut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) )
|
||||
if ( ((Modes.log10lut = (uint16_t *) malloc(sizeof(uint16_t) * 256 * 256) ) == NULL) )
|
||||
{
|
||||
fprintf(stderr, "Out of memory allocating data buffer.\n");
|
||||
exit(1);
|
||||
|
@ -184,21 +183,6 @@ void modesInit(void) {
|
|||
if (Modes.net_sndbuf_size > (MODES_NET_SNDBUF_MAX))
|
||||
{Modes.net_sndbuf_size = MODES_NET_SNDBUF_MAX;}
|
||||
|
||||
// compute UC8 magnitude lookup table
|
||||
for (i = 0; i <= 255; i++) {
|
||||
for (q = 0; q <= 255; q++) {
|
||||
float fI, fQ, magsq;
|
||||
|
||||
fI = (i - 127.5) / 127.5;
|
||||
fQ = (q - 127.5) / 127.5;
|
||||
magsq = fI * fI + fQ * fQ;
|
||||
if (magsq > 1)
|
||||
magsq = 1;
|
||||
|
||||
Modes.maglut[le16toh((i*256)+q)] = (uint16_t) round(sqrtf(magsq) * 65535.0);
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the log10 lookup table: 100log10(x)
|
||||
Modes.log10lut[0] = 0; // poorly defined..
|
||||
for (i = 1; i <= 65535; i++) {
|
||||
|
|
|
@ -276,7 +276,6 @@ struct { // Internal state
|
|||
unsigned trailing_samples; // extra trailing samples in magnitude buffers
|
||||
double sample_rate; // actual sample rate in use (in hz)
|
||||
|
||||
uint16_t *maglut; // I/Q -> Magnitude lookup table
|
||||
uint16_t *log10lut; // Magnitude -> log10 lookup table
|
||||
int exit; // Exit from the main loop when true
|
||||
|
||||
|
|
Loading…
Reference in a new issue