1+ namespace audio_tools {
2+
3+ #pragma once
4+
5+ #include < algorithm>
6+ #include < cmath>
7+
8+ #include " AudioOutput.h"
9+ #include " Vector.h"
10+
11+ namespace audio_tools {
12+
/**
 * @brief Frame holding the top N dominant frequency entries of one FFT window.
 *
 * Used as a compact representation of the dominant frequency content in a
 * frame of audio.
 *
 * @tparam N Number of dominant frequencies stored per frame
 */
template <size_t N>
struct FrequencyFrame {
  // A zero-length array is ill-formed in standard C++ and a frame without any
  // entries could never match anything, so reject it at compile time.
  static_assert(N > 0, "FrequencyFrame requires at least one frequency");

  /// Top N frequency entries of the frame; WakeWordDetector::fftResult fills
  /// slot j from the `frequency` field of the j-th FFT result.
  uint16_t top_freqs[N];
};
23+
/**
 * @class WakeWordDetector
 * @brief Template-based wake word detector for microcontrollers using dominant
 * frequency patterns.
 *
 * This class detects wake words by comparing the sequence of the top N dominant
 * frequencies in each audio frame to stored templates for each wake word. When
 * the percentage of matching frames reaches a configurable threshold, the
 * corresponding wake word is considered detected.
 *
 * @tparam N Number of dominant frequencies to track per frame (default: 3)
 *
 * Usage:
 * - Record each wake word and extract the top N frequencies for each frame to
 *   build templates.
 * - Instantiate WakeWordDetector<N> and add templates for each wake word.
 * - Register a callback to handle detection events using setWakeWordCallback().
 *
 * Example:
 * @code
 * audio_tools::WakeWordDetector<3> detector(fft, fft_size, frame_size);
 * detector.addTemplate(my_template_frames, 80.0f, "hello");
 * detector.setWakeWordCallback([](const char* name) { Serial.println(name); });
 * @endcode
 */
49+ template <size_t N = 3 >
50+ class WakeWordDetector : public AudioOutput {
51+ public:
52+ struct Template {
53+ Vector<FrequencyFrame<N>>
54+ frames; // /< Sequence of frequency frames for the wake word
55+ float threshold_percent; // /< Minimum percent of matching frames required
56+ // /< for detection (0-100)
57+ const char * name; // /< Name/label of the wake word
58+ float last_match_percent =
59+ 0 .0f ; // /< Last computed match percent for this template
60+ };
61+
62+ using WakeWordCallback = void (*)(const char * name);
63+
64+ WakeWordDetector (AudioFFTBase& fft, size_t fft_size, size_t frame_size)
65+ : _fft_size(fft_size), _frame_size(frame_size), p_fft(&fft) {
66+ _buffer.resize (_frame_size, 0 );
67+ _frame_pos = 0 ;
68+ fft.config ().ref = this ;
69+ fft.callback = fftResult;
70+ }
71+
72+ void startRecording () {
73+ _recent_frames.clear ();
74+ _is_recording = true ;
75+ }
76+
77+ Vector<FrequencyFrame<N>> stopRecording () {
78+ _is_recording = false ;
79+ return _recent_frames;
80+ }
81+
82+ bool isRecording () const { return _is_recording; }
83+
84+ void addTemplate (const Vector<FrequencyFrame<N>>& frames,
85+ float threshold_percent, const char * name) {
86+ Template t;
87+ t.frames = frames;
88+ t.threshold_percent = threshold_percent;
89+ t.name = name;
90+ t.last_match_percent = 0 .0f ;
91+ _templates.push_back (t);
92+ if (frames.size () > _max_template_len) _max_template_len = frames.size ();
93+ }
94+
95+ void setWakeWordCallback (WakeWordCallback cb) { _callback = cb; }
96+
97+ size_t write (const void * buf, size_t count) override {
98+ return p_fft->write ((const uint8_t *)buf, count);
99+ }
100+
101+ static void fftResult (AudioFFTBase& fft) {
102+ // This static method must access instance data via fft.config().ref
103+ auto * self = static_cast <WakeWordDetector<N>*>(fft.config ().ref );
104+ if (!self) return ;
105+ FrequencyFrame<N> frame;
106+ AudioFFTResult result[N];
107+ self->p_fft ->resultArray (result, N);
108+ for (size_t j = 0 ; j < N; j++) {
109+ frame.top_freqs [j] = result[j].frequency ;
110+ }
111+ self->_recent_frames .push_back (frame);
112+
113+ if (self->_is_recording ) {
114+ return ;
115+ }
116+
117+ if (self->_recent_frames .size () > self->_max_template_len )
118+ self->_recent_frames .erase (self->_recent_frames .begin ());
119+ for (size_t i = 0 ; i < self->_templates .size (); ++i) {
120+ Template& tmpl = self->_templates [i];
121+ if (self->_recent_frames .size () >= tmpl.frames .size ()) {
122+ float percent = self->matchTemplate (tmpl);
123+ if (percent >= tmpl.threshold_percent ) {
124+ if (self->_callback ) self->_callback (tmpl.name );
125+ }
126+ }
127+ }
128+ }
129+
130+ protected:
131+ Vector<Template> _templates; // /< List of wake word templates
132+ Vector<FrequencyFrame<N>> _recent_frames; // /< Recent frames for comparison
133+ Vector<int16_t > _buffer; // /< Buffer for incoming PCM samples
134+ AudioFFTBase* p_fft = nullptr ;
135+ bool _is_recording = false ; // /< True if currently recording a template
136+ size_t _fft_size; // /< FFT size per frame
137+ size_t _frame_size; // /< Number of PCM samples per frame
138+ size_t _frame_pos; // /< Current position in frame buffer
139+ size_t _max_template_len = 0 ; // /< Length of the longest template
140+ WakeWordCallback _callback = nullptr ;
141+
142+ float matchTemplate (Template& tmpl) {
143+ size_t matches = 0 ;
144+ size_t offset = _recent_frames.size () - tmpl.frames .size ();
145+ for (size_t i = 0 ; i < tmpl.frames .size (); ++i) {
146+ size_t frame_matches = 0 ;
147+ for (size_t j = 0 ; j < N; ++j) {
148+ if (tmpl.frames [i].top_freqs [j] ==
149+ _recent_frames[offset + i].top_freqs [j])
150+ frame_matches++;
151+ }
152+ if (frame_matches >= (N >= 2 ? N - 1 : 1 )) // at least N-1 out of N match
153+ matches++;
154+ }
155+ float percent = (tmpl.frames .size () > 0 )
156+ ? (100 .0f * matches / tmpl.frames .size ())
157+ : 0 .0f ;
158+ tmpl.last_match_percent = percent;
159+ return percent;
160+ }
161+ };
162+
163+ } // namespace audio_tools
0 commit comments