From 9b9744ba152264654bd12c9a5924c5185d50e678 Mon Sep 17 00:00:00 2001 From: wb2osz Date: Mon, 22 Nov 2021 21:15:17 -0500 Subject: [PATCH] Speed up 9600 demodulator. --- src/demod.c | 85 +++++------------ src/demod_9600.c | 206 ++++++++++++++++++++++++++++++------------ src/demod_9600.h | 4 +- src/fsk_demod_state.h | 13 ++- 4 files changed, 180 insertions(+), 128 deletions(-) diff --git a/src/demod.c b/src/demod.c index 59173a5..982d08a 100644 --- a/src/demod.c +++ b/src/demod.c @@ -1,7 +1,7 @@ // // This file is part of Dire Wolf, an amateur radio packet TNC. // -// Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2019 John Langner, WB2OSZ +// Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2019, 2021 John Langner, WB2OSZ // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -60,10 +60,6 @@ static struct audio_s *save_audio_config_p; -// TODO: temp experiment. - - -static int zerostuff = 1; // temp experiment. // Current state of all the decoders. @@ -676,12 +672,6 @@ int demod_init (struct audio_s *pa) strlcpy (save_audio_config_p->achan[chan].profiles, "+", sizeof(save_audio_config_p->achan[chan].profiles)); } - -#ifdef TUNE_ZEROSTUFF - zerostuff = TUNE_ZEROSTUFF; -#endif - - /* * We need a minimum number of audio samples per bit time for good performance. * Easier to check here because demod_9600_init might have an adjusted sample rate. @@ -696,26 +686,32 @@ int demod_init (struct audio_s *pa) if (save_audio_config_p->achan[chan].upsample == 0) { - if (ratio < 5) { + if (ratio < 4) { - // example: 44100 / 9600 is 4.59 - // Big improvement with x2. - // x4 seems to work the best. - // The other parameters are not as touchy. - // Might reduce on ARM if it takes too much CPU power. + // This is extreme. + // No one should be using a sample rate this low but + // amazingly a recording with 22050 rate can be decoded. + // 3 and 4 are the same. Need more tests. save_audio_config_p->achan[chan].upsample = 4; } + else if (ratio < 5) { + + // example: 44100 / 9600 is 4.59 + // 3 is slightly better than 2 or 4. + + save_audio_config_p->achan[chan].upsample = 3; + } else if (ratio < 10) { - // 48000 / 9600 is 5.00 - // Need more research. Treat like above for now. + // example: 48000 / 9600 = 5 + // 3 is slightly better than 2 or 4. - save_audio_config_p->achan[chan].upsample = 4; + save_audio_config_p->achan[chan].upsample = 3; } else if (ratio < 15) { - // ... + // ... guessing save_audio_config_p->achan[chan].upsample = 2; } @@ -786,7 +782,8 @@ int demod_init (struct audio_s *pa) } demod_9600_init (save_audio_config_p->achan[chan].modem_type, - save_audio_config_p->achan[chan].upsample * save_audio_config_p->adev[ACHAN2ADEV(chan)].samples_per_sec, + save_audio_config_p->adev[ACHAN2ADEV(chan)].samples_per_sec, + save_audio_config_p->achan[chan].upsample, save_audio_config_p->achan[chan].baud, D); if (strchr(save_audio_config_p->achan[chan].profiles, '+') != NULL) { @@ -924,7 +921,7 @@ __attribute__((hot)) void demod_process_sample (int chan, int subchan, int sam) { float fsam; - int k; + //int k; struct demodulator_state_s *D; @@ -1016,47 +1013,7 @@ void demod_process_sample (int chan, int subchan, int sam) case MODEM_AIS: default: - if (zerostuff) { - /* Literature says this is better if followed */ - /* by appropriate low pass filter. */ - /* So far, both are same in tests with different */ - /* optimal low pass filter parameters. */ - - for (k=1; kachan[chan].upsample; k++) { - demod_9600_process_sample (chan, 0, D); - } - demod_9600_process_sample (chan, sam * save_audio_config_p->achan[chan].upsample, D); - } - else { - - /* Linear interpolation. */ - static int prev_sam; - - switch (save_audio_config_p->achan[chan].upsample) { - case 1: - demod_9600_process_sample (chan, sam, D); - break; - case 2: - demod_9600_process_sample (chan, (prev_sam + sam) / 2, D); - demod_9600_process_sample (chan, sam, D); - break; - case 3: - demod_9600_process_sample (chan, (2 * prev_sam + sam) / 3, D); - demod_9600_process_sample (chan, (prev_sam + 2 * sam) / 3, D); - demod_9600_process_sample (chan, sam, D); - break; - case 4: - demod_9600_process_sample (chan, (3 * prev_sam + sam) / 4, D); - demod_9600_process_sample (chan, (prev_sam + sam) / 2, D); - demod_9600_process_sample (chan, (prev_sam + 3 * sam) / 4, D); - demod_9600_process_sample (chan, sam, D); - break; - default: - assert (0); - break; - } - prev_sam = sam; - } + demod_9600_process_sample (chan, sam, save_audio_config_p->achan[chan].upsample, D); break; } /* switch modem_type */ diff --git a/src/demod_9600.c b/src/demod_9600.c index a909848..1df6006 100644 --- a/src/demod_9600.c +++ b/src/demod_9600.c @@ -1,7 +1,7 @@ // // This file is part of Dire Wolf, an amateur radio packet TNC. // -// Copyright (C) 2011, 2012, 2013, 2015, 2019 John Langner, WB2OSZ +// Copyright (C) 2011, 2012, 2013, 2015, 2019, 2021 John Langner, WB2OSZ // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -25,7 +25,8 @@ * * Module: demod_9600.c * - * Purpose: Demodulator for scrambled baseband encoding. + * Purpose: Demodulator for baseband signal. + * This is used for AX.25 (with scrambling) and IL2P without. * * Input: Audio samples from either a file or the "sound card." * @@ -45,12 +46,14 @@ #include // Fine tuning for different demodulator types. +// Don't remove this section. It is here for a reason. + +#define DCD_THRESH_ON 32 // Hysteresis: Can miss 0 out of 32 for detecting lock. + // This is best for actual on-the-air signals. + // Still too many brief false matches. +#define DCD_THRESH_OFF 8 // Might want a little more fine tuning. +#define DCD_GOOD_WIDTH 1024 // No more than 1024!!! -#define DCD_THRESH_ON 32 // Hysteresis: Can miss 0 out of 32 for detecting lock. - // This is best for actual on-the-air signals. - // Still too many brief false matches. -#define DCD_THRESH_OFF 8 // Might want a little more fine tuning. -#define DCD_GOOD_WIDTH 1024 // No more than 1024!!! #include "fsk_demod_state.h" // Values above override defaults. #include "tune.h" @@ -125,9 +128,12 @@ static inline float agc (float in, float fast_attack, float slow_decay, float *p * * Inputs: modem_type - Determines whether scrambling is used. * - * samples_per_sec - Number of samples per second. - * Might be upsampled in hopes of - * reducing the PLL jitter. + * samples_per_sec - Number of samples per second for audio. + * + * upsample - Factor to upsample the incoming stream. + * After a lot of experimentation, I discovered that + * it works better if the data is upsampled. + * This reduces the jitter for PLL syncronization. * * baud - Data rate in bits per second. * @@ -137,10 +143,13 @@ static inline float agc (float in, float fast_attack, float slow_decay, float *p * *----------------------------------------------------------------*/ -void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, struct demodulator_state_s *D) +void demod_9600_init (enum modem_t modem_type, int original_sample_rate, int upsample, int baud, struct demodulator_state_s *D) { float fc; int j; + if (upsample < 1) upsample = 1; + if (upsample > 4) upsample = 4; + memset (D, 0, sizeof(struct demodulator_state_s)); D->modem_type = modem_type; @@ -155,12 +164,13 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st // case 'L': // upsample x4 with filtering. - D->lp_filter_len_bits = 1.0; + D->lp_filter_len_bits = 1.0; // -U4 = 61 4.59 samples/symbol // Works best with odd number in some tests. Even is better in others. - //D->lp_filter_size = ((int) (0.5f * ( D->lp_filter_len_bits * (float)samples_per_sec / (float)baud ))) * 2 + 1; + //D->lp_filter_size = ((int) (0.5f * ( D->lp_filter_len_bits * (float)original_sample_rate / (float)baud ))) * 2 + 1; - D->lp_filter_size = (int) (( D->lp_filter_len_bits * (float)samples_per_sec / baud) + 0.5f); + // Just round to nearest integer. + D->lp_filter_size = (int) (( D->lp_filter_len_bits * (float)original_sample_rate / baud) + 0.5f); D->lp_window = BP_WINDOW_COSINE; @@ -185,8 +195,11 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st dw_printf ("samples per bit = %.1f\n", (double)samples_per_sec / baud); #endif + + // PLL needs to use the upsampled rate. + D->pll_step_per_sample = - (int) round(TICKS_PER_PLL_CYCLE * (double) baud / (double)samples_per_sec); + (int) round(TICKS_PER_PLL_CYCLE * (double) baud / (double)(original_sample_rate * upsample)); #ifdef TUNE_LP_WINDOW @@ -217,13 +230,87 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st D->pll_searching_inertia = TUNE_PLL_SEARCHING; #endif - fc = (float)baud * D->lpf_baud / (float)samples_per_sec; + // Initial filter (before scattering) is based on upsampled rate. + + fc = (float)baud * D->lpf_baud / (float)(original_sample_rate * upsample); //dw_printf ("demod_9600_init: call gen_lowpass(fc=%.2f, , size=%d, )\n", fc, D->lp_filter_size); - gen_lowpass (fc, D->lp_filter, D->lp_filter_size, D->lp_window); + gen_lowpass (fc, D->u.bb.lp_filter, D->lp_filter_size * upsample, D->lp_window); + +// New in 1.7 - +// Use a polyphase filter to reduce the CPU load. +// Originally I used zero stuffing to upsample. +// Here is the general idea. +// +// Suppose the input samples are 1 2 3 4 5 6 7 8 9 ... +// Filter coefficents are a b c d e f g h i ... +// +// With original sampling rate, the filtering would involve multiplying and adding: +// +// 1a 2b 3c 4d 5e 6f ... +// +// When upsampling by 3, each of these would need to be evaluated +// for each audio sample: +// +// 1a 0b 0c 2d 0e 0f 3g 0h 0i ... +// 0a 1b 0c 0d 2e 0f 0g 3h 0i ... +// 0a 0b 1c 0d 0e 2f 0g 0h 3i ... +// +// 2/3 of the multiplies are always by a stuffed zero. +// We can do this more efficiently by removing them. +// +// 1a 2d 3g ... +// 1b 2e 3h ... +// 1c 2f 3i ... +// +// We scatter the original filter across multiple shorter filters. +// Each input sample cycles around them to produce the upsampled rate. +// +// a d g ... +// b e h ... +// c f i ... +// +// There are countless sources of information DSP but this one is unique +// in that it is a college course that mentions APRS. +// https://www2.eecs.berkeley.edu/Courses/EE123 +// +// Was the effort worthwhile? Times on an RPi 3. +// +// command: atest -B9600 ~/walkabout9600[abc]-compressed*.wav +// +// These are 3 recordings of a portable system being carried out of +// range and back in again. It is a real world test for weak signals. +// +// options num decoded seconds x realtime +// 1.6 1.7 1.6 1.7 1.6 1.7 +// --- --- --- --- --- --- +// -P- 171 172 23.928 17.967 14.9 19.9 +// -P+ 180 180 54.688 48.772 6.5 7.3 +// -P- -F1 177 178 32.686 26.517 10.9 13.5 +// +// So, it turns out that -P+ doesn't have a dramatic improvement, only +// around 4%, for drastically increased CPU requirements. +// Maybe we should turn that off by default, especially for ARM. +// + + int k = 0; + for (int i = 0; i < D->lp_filter_size; i++) { + D->u.bb.lp_polyphase_1[i] = D->u.bb.lp_filter[k++]; + if (upsample >= 2) { + D->u.bb.lp_polyphase_2[i] = D->u.bb.lp_filter[k++]; + if (upsample >= 3) { + D->u.bb.lp_polyphase_3[i] = D->u.bb.lp_filter[k++]; + if (upsample >= 4) { + D->u.bb.lp_polyphase_4[i] = D->u.bb.lp_filter[k++]; + } + } + } + } + /* Version 1.2: Experiment with different slicing levels. */ + // Really didn't help that much because we should have a symmetrical signal. for (j = 0; j < MAX_SUBCHANS; j++) { slice_point[j] = 0.02f * (j - 0.5f * (MAX_SUBCHANS-1)); @@ -259,7 +346,7 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st * been distorted by going thru voice transceivers not * intended to pass this sort of "audio" signal. * - * Data is "scrambled" to reduce the amount of DC bias. + * For G3RUH mode, data is "scrambled" to reduce the amount of DC bias. * The data stream must be unscrambled at the receiving end. * * We also have a digital phase locked loop (PLL) @@ -276,6 +363,9 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st * of the function to be called for each bit recovered * from the demodulator. For now, it's simply hard-coded. * + * After experimentation, I found that this works better if + * the original signal is upsampled by 2x or even 4x. + * * References: 9600 Baud Packet Radio Modem Design * http://www.amsat.org/amsat/articles/g3ruh/109.html * @@ -290,63 +380,57 @@ void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, st inline static void nudge_pll (int chan, int subchan, int slice, float demod_out, struct demodulator_state_s *D); -__attribute__((hot)) -void demod_9600_process_sample (int chan, int sam, struct demodulator_state_s *D) -{ +static void process_filtered_sample (int chan, float fsam, struct demodulator_state_s *D); + +__attribute__((hot)) +void demod_9600_process_sample (int chan, int sam, int upsample, struct demodulator_state_s *D) +{ float fsam; - float amp; - float demod_out; #if DEBUG4 static FILE *demod_log_fp = NULL; static int log_file_seq = 0; /* Part of log file name */ #endif - int subchan = 0; - int demod_data; /* Still scrambled. */ - assert (chan >= 0 && chan < MAX_CHANS); assert (subchan >= 0 && subchan < MAX_SUBCHANS); - -/* - * Filters use last 'filter_size' samples. - * - * First push the older samples down. - * - * Finally, put the most recent at the beginning. - * - * Future project? Rather than shifting the samples, - * it might be faster to add another variable to keep - * track of the most recent sample and change the - * indexing in the later loops that multiply and add. - */ - /* Scale to nice number for convenience. */ /* Consistent with the AFSK demodulator, we'd like to use */ /* only half of the dynamic range to have some headroom. */ /* i.e. input range +-16k becomes +-1 here and is */ /* displayed in the heard line as audio level 100. */ - fsam = sam / 16384.0; + fsam = (float)sam / 16384.0f; -#if defined(TUNE_ZEROSTUFF) && TUNE_ZEROSTUFF == 0 -// experiment - no filtering. + // Low pass filter + push_sample (fsam, D->u.bb.audio_in, D->lp_filter_size); - amp = fsam; + fsam = convolve (D->u.bb.audio_in, D->u.bb.lp_polyphase_1, D->lp_filter_size); + process_filtered_sample (chan, fsam, D); + if (upsample >= 2) { + fsam = convolve (D->u.bb.audio_in, D->u.bb.lp_polyphase_2, D->lp_filter_size); + process_filtered_sample (chan, fsam, D); + if (upsample >= 3) { + fsam = convolve (D->u.bb.audio_in, D->u.bb.lp_polyphase_3, D->lp_filter_size); + process_filtered_sample (chan, fsam, D); + if (upsample >= 4) { + fsam = convolve (D->u.bb.audio_in, D->u.bb.lp_polyphase_4, D->lp_filter_size); + process_filtered_sample (chan, fsam, D); + } + } + } +} -#else - push_sample (fsam, D->raw_cb, D->lp_filter_size); -/* - * Low pass filter to reduce noise yet pass the data. - */ +__attribute__((hot)) +static void process_filtered_sample (int chan, float fsam, struct demodulator_state_s *D) +{ - amp = convolve (D->raw_cb, D->lp_filter, D->lp_filter_size); -#endif + int subchan = 0; /* * Version 1.2: Capture the post-filtering amplitude for display. @@ -359,18 +443,18 @@ void demod_9600_process_sample (int chan, int sam, struct demodulator_state_s *D // TODO: probably no need for this. Just use D->m_peak, D->m_valley - if (amp >= D->alevel_mark_peak) { - D->alevel_mark_peak = amp * D->quick_attack + D->alevel_mark_peak * (1.0f - D->quick_attack); + if (fsam >= D->alevel_mark_peak) { + D->alevel_mark_peak = fsam * D->quick_attack + D->alevel_mark_peak * (1.0f - D->quick_attack); } else { - D->alevel_mark_peak = amp * D->sluggish_decay + D->alevel_mark_peak * (1.0f - D->sluggish_decay); + D->alevel_mark_peak = fsam * D->sluggish_decay + D->alevel_mark_peak * (1.0f - D->sluggish_decay); } - if (amp <= D->alevel_space_peak) { - D->alevel_space_peak = amp * D->quick_attack + D->alevel_space_peak * (1.0f - D->quick_attack); + if (fsam <= D->alevel_space_peak) { + D->alevel_space_peak = fsam * D->quick_attack + D->alevel_space_peak * (1.0f - D->quick_attack); } else { - D->alevel_space_peak = amp * D->sluggish_decay + D->alevel_space_peak * (1.0f - D->sluggish_decay); + D->alevel_space_peak = fsam * D->sluggish_decay + D->alevel_space_peak * (1.0f - D->sluggish_decay); } /* @@ -381,12 +465,14 @@ void demod_9600_process_sample (int chan, int sam, struct demodulator_state_s *D * This works by looking at the minimum and maximum signal peaks * and scaling the results to be roughly in the -1.0 to +1.0 range. */ + float demod_out; + int demod_data; /* Still scrambled. */ - demod_out = agc (amp, D->agc_fast_attack, D->agc_slow_decay, &(D->m_peak), &(D->m_valley)); + demod_out = agc (fsam, D->agc_fast_attack, D->agc_slow_decay, &(D->m_peak), &(D->m_valley)); // TODO: There is potential for multiple decoders with one filter. -//dw_printf ("peak=%.2f valley=%.2f amp=%.2f norm=%.2f\n", D->m_peak, D->m_valley, amp, norm); +//dw_printf ("peak=%.2f valley=%.2f fsam=%.2f norm=%.2f\n", D->m_peak, D->m_valley, fsam, norm); if (D->num_slicers <= 1) { @@ -435,7 +521,7 @@ void demod_9600_process_sample (int chan, int sam, struct demodulator_state_s *D fprintf (demod_log_fp, "%.3f, %.3f, %.3f, %.3f, %.3f, %d, %.2f\n", fsam + 6, - amp + 4, + fsam + 4, D->m_peak + 4, D->m_valley + 4, demod_out + 2, diff --git a/src/demod_9600.h b/src/demod_9600.h index ac3e747..51fc15e 100644 --- a/src/demod_9600.h +++ b/src/demod_9600.h @@ -6,9 +6,9 @@ #include "fsk_demod_state.h" -void demod_9600_init (enum modem_t modem_type, int samples_per_sec, int baud, struct demodulator_state_s *D); +void demod_9600_init (enum modem_t modem_type, int original_sample_rate, int upsample, int baud, struct demodulator_state_s *D); -void demod_9600_process_sample (int chan, int sam, struct demodulator_state_s *D); +void demod_9600_process_sample (int chan, int sam, int upsample, struct demodulator_state_s *D); diff --git a/src/fsk_demod_state.h b/src/fsk_demod_state.h index 33f7901..bf8d23b 100644 --- a/src/fsk_demod_state.h +++ b/src/fsk_demod_state.h @@ -301,6 +301,8 @@ struct demodulator_state_s // // ////////////////////////////////////////////////////////////////////////////////// +// TODO: Continue experiments with root raised cosine filter. +// Either switch to that or take out all the related stuff. struct bb_only_s { @@ -314,8 +316,15 @@ struct demodulator_state_s float audio_in[MAX_FILTER_SIZE] __attribute__((aligned(16))); // Audio samples in. -// FIXME: use lp_filter - float rrc_filter[MAX_FILTER_SIZE] __attribute__((aligned(16))); // RRC Low pass filter. + + float lp_filter[MAX_FILTER_SIZE] __attribute__((aligned(16))); // Low pass filter. + + // New in 1.7 - Polyphase filter to reduce CPU requirements. + + float lp_polyphase_1[MAX_FILTER_SIZE] __attribute__((aligned(16))); + float lp_polyphase_2[MAX_FILTER_SIZE] __attribute__((aligned(16))); + float lp_polyphase_3[MAX_FILTER_SIZE] __attribute__((aligned(16))); + float lp_polyphase_4[MAX_FILTER_SIZE] __attribute__((aligned(16))); float lp_1_iir_param; // very low pass filters to get DC offset. float lp_1_out;