Skip to content

Commit 1b13c36

Browse files
committed
added new SpeakerStream to simplify working with speaker_labels - not currently compatible with timing-stream
1 parent a2d78d8 commit 1b13c36

11 files changed

+2829
-728
lines changed

dist/watson-speech.js

Lines changed: 1999 additions & 683 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

examples/static/index.html

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,36 @@
11
<!DOCTYPE html>
22
<html lang="en">
33
<head>
4-
<meta charset="UTF-8">
5-
<title>IBM Watson Speech JavaScript SDK Example</title>
4+
<meta charset="UTF-8">
5+
<title>IBM Watson Speech JavaScript SDK Example</title>
66
</head>
77
<body>
88
<h1>IBM Watson Speech JavaScript SDK Examples</h1>
99
<h2>Speech to Text</h2>
1010
<ul>
11-
<li><a href="microphone-streaming.html">Transcribe from Microphone, Streaming</a></li>
12-
<li><a href="microphone-streaming-auto-stop.html">Transcribe from Microphone, Streaming, automatically stop at first pause</a></li>
13-
<li><a href="microphone-alternatives.html">Transcribe from Microphone, with Alternatives</a></li>
14-
<li><a href="microphone-word-confidence.html">Transcribe from Microphone, with Word Confidence</a></li>
15-
<li><a href="microphone-streaming-text-to-console.html">Transcribe from Microphone, send text to console</a></li>
16-
<li><a href="microphone-streaming-object-to-console.html">Transcribe from Microphone, send JSON to console (includes text and metadata; v0.22+ format)</a></li>
17-
<li><a href="microphone-streaming-object-extracted-to-console.html">Transcribe from Microphone, send JSON to console with results extracted (pre-v0.22 format)</a></li>
18-
<li><a href="microphone-streaming-model.html">Transcribe from Microphone, Streaming with chosen model</a></li>
19-
<li><a href="file-streaming.html">Transcribe from file, Streaming</a></li>
20-
<li><a href="multi-speaker-file-console.html">Transcribe from file, multiple speakers</a></li>
21-
<li><a href="file-realtime-vs-no-realtime.html">Transcribe from file, Comparing <code>{realtime: true}</code> to <code>{realtime: false}</code></a></li>
22-
<li><a href="file-promise.html">Transcribe from file, Promise</a></li>
23-
<li><a href="file-ajax.html">Transcribe from file loaded over AJAX</a></li>
24-
<li><a href="browserify.html">Example bundled with browserify</a> <b>(Node.js server only)</b></li>
25-
<li><strike><a href="audio-video-deprecated/">Deprecated: Transcribe from HTML5 &lt;audio&gt; or &lt;video&gt; element</a></strike> <b>(Node.js server only)</b></li>
11+
<li><a href="microphone-streaming.html">Transcribe from Microphone, Streaming</a></li>
12+
<li><a href="microphone-streaming-auto-stop.html">Transcribe from Microphone, Streaming, automatically stop at first pause</a></li>
13+
<li><a href="microphone-alternatives.html">Transcribe from Microphone, with Alternatives</a></li>
14+
<li><a href="microphone-word-confidence.html">Transcribe from Microphone, with Word Confidence</a></li>
15+
<li><a href="microphone-streaming-text-to-console.html">Transcribe from Microphone, send text to console</a></li>
16+
<li><a href="microphone-streaming-object-to-console.html">Transcribe from Microphone, send JSON to console (includes text and metadata; v0.22+ format)</a></li>
17+
<li><a href="microphone-streaming-object-extracted-to-console.html">Transcribe from Microphone, send JSON to console with results extracted (pre-v0.22 format)</a></li>
18+
<li><a href="microphone-streaming-model.html">Transcribe from Microphone, Streaming with chosen model</a></li>
19+
<li><a href="file-streaming.html">Transcribe from file, Streaming</a></li>
20+
<li><a href="speaker-labels-file-console.html">Transcribe from file with <code>{speaker_labels: true}</code>, output to console</a></li>
21+
<li><a href="speaker-stream-file-console.html">Transcribe from file with <code>{resultsBySpeaker: true}</code>, output to console</a></li>
22+
<li><a href="speaker-stream-file-html.html">Transcribe from file with <code>{resultsBySpeaker: true}</code>, output HTML</a></li>
23+
<li><a href="file-realtime-vs-no-realtime.html">Transcribe from file, Comparing <code>{realtime: true}</code> to <code>{realtime: false}</code></a></li>
24+
<li><a href="file-promise.html">Transcribe from file, Promise</a></li>
25+
<li><a href="file-ajax.html">Transcribe from file loaded over AJAX</a></li>
26+
<li><a href="browserify.html">Example bundled with browserify</a> <b>(Node.js server only)</b></li>
27+
<li><strike><a href="audio-video-deprecated/">Deprecated: Transcribe from HTML5 &lt;audio&gt; or &lt;video&gt; element</a></strike> <b>(Node.js server only)</b></li>
2628
</ul>
2729

2830
<h2>Text to Speech</h2>
2931
<ul>
30-
<li><a href="text-to-speech.html">Synthesize text</a></li>
31-
<li><a href="text-to-speech-custom-voice.html">Synthesize text w/ custom voice</a></li>
32+
<li><a href="text-to-speech.html">Synthesize text</a></li>
33+
<li><a href="text-to-speech-custom-voice.html">Synthesize text w/ custom voice</a></li>
3234
</ul>
3335
</body>
3436
</html>

examples/static/multi-speaker-file-console.html renamed to examples/static/speaker-labels-file-console.html

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<body>
88

99
<section>
10-
<h2>Transcribe from Microphone</h2>
10+
<h2>Transcribe from file with <code>{speaker_labels: true}</code>, output to console</h2>
1111
<button id="button">Transcribe File</button>
1212
<button id="stop">Stop</button>
1313

@@ -47,15 +47,12 @@ <h2>Code for this demo:</h2>
4747
model: 'en-US_NarrowbandModel',
4848
objectMode: true, // send objects instead of text
4949
realtime: true, // slow the results down to match the audio playback (set to false to receive results as soon as they are available)
50-
format: false,
50+
format: false, // enable resultsBySpeaker when formatting for multiple speakers
5151
play: true
5252
});
5353

54-
window.allResults = [];
55-
5654
stream.on('data', function(data) {
5755
console.log(data);
58-
allResults.push(data);
5956
});
6057

6158
stream.on('error', function(err) {
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Watson Speech to Text client example</title>
6+
</head>
7+
<body>
8+
9+
<section>
10+
<h2>Transcribe from file with <code>{resultsBySpeaker: true}</code>, output to console</h2>
11+
<button id="button">Transcribe File</button>
12+
<button id="stop">Stop</button>
13+
14+
<h2>Output:</h2>
15+
<div id="output">Open your browser's console to view the output. Note: it will take some time before results begin to appear.</div>
16+
</section>
17+
18+
<script src="watson-speech.js"></script>
19+
<!-- window.fetch polyfill for IE/Edge & older Chrome/Firefox -->
20+
<script src="bower_components/fetch/fetch.js"></script>
21+
22+
<h2>Code for this demo:</h2>
23+
24+
<pre><code><script style="display: block;">
25+
26+
// preloading the data for a smoother experience
27+
var preloadTokenAndAudio = Promise.all([
28+
fetch('/api/speech-to-text/token').then(function(response) {
29+
return response.text();
30+
}),
31+
fetch('/en-us-multi-speaker-narrowband.wav').then(function(response) {
32+
return response.blob();
33+
})
34+
]);
35+
36+
document.querySelector('#button').onclick = function () {
37+
preloadTokenAndAudio.then(function (values) {
38+
var token = values[0];
39+
var file = values[1];
40+
41+
var stream = WatsonSpeech.SpeechToText.recognizeFile({
42+
token: token,
43+
data: file,
44+
// only certain models support speaker labels currently,
45+
// see http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#speaker_labels
46+
model: 'en-US_NarrowbandModel',
47+
resultsBySpeaker: true, // pipes results through a SpeakerStream, and also enables speaker_labels and objectMode
48+
realtime: false, // don't slow down the results if transcription occurs faster than playback
49+
play: true
50+
});
51+
52+
stream.on('data', function(data) {
53+
// SpeakerStream's data events are different in that most include multiple result objects, and currently, they
54+
// are all interim until the last data event.
55+
56+
// The result objects look similar to those returned by the RecognizeStream, except that they each have a
57+
// `speaker` key with a numeric value. Additionally, extra features, such as alternatives and word alternatives
58+
// will be lost in the SpeakerStream results.
59+
60+
console.log(data);
61+
});
62+
63+
stream.on('error', function(err) {
64+
console.log(err);
65+
});
66+
67+
document.querySelector('#stop').onclick = stream.stop.bind(stream);
68+
69+
}).catch(function(error) {
70+
console.log(error);
71+
});
72+
};
73+
74+
</script></code></pre>
75+
76+
</body>
77+
</html>
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Watson Speech to Text client example</title>
6+
</head>
7+
<body>
8+
9+
<section>
10+
<h2>Transcribe from file with <code>{resultsBySpeaker: true}</code>, output HTML</h2>
11+
<button id="button">Transcribe File</button>
12+
<button id="stop">Stop</button>
13+
14+
<h2>Output:</h2>
15+
<div id="output"></div>
16+
</section>
17+
18+
<script src="watson-speech.js"></script>
19+
<!-- window.fetch polyfill for IE/Edge & older Chrome/Firefox -->
20+
<script src="bower_components/fetch/fetch.js"></script>
21+
22+
<h2>Code for this demo:</h2>
23+
24+
<pre><code><script style="display: block;">
25+
26+
// preloading the data for a smoother experience
27+
var preloadTokenAndAudio = Promise.all([
28+
fetch('/api/speech-to-text/token').then(function(response) {
29+
return response.text();
30+
}),
31+
fetch('/en-us-multi-speaker-narrowband.wav').then(function(response) {
32+
return response.blob();
33+
})
34+
]);
35+
36+
document.querySelector('#button').onclick = function () {
37+
preloadTokenAndAudio.then(function (values) {
38+
var token = values[0];
39+
var file = values[1];
40+
41+
document.querySelector('#output').innerHTML = 'Processing. Note: it will take some time for the first results to appear.';
42+
43+
var stream = WatsonSpeech.SpeechToText.recognizeFile({
44+
token: token,
45+
data: file,
46+
speaker_labels: true,
47+
// only certain models support speaker labels currently,
48+
// see http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#speaker_labels
49+
model: 'en-US_NarrowbandModel',
50+
resultsBySpeaker: true, // pipes results through a SpeakerStream, and also enables speaker_labels and objectMode
51+
play: true
52+
});
53+
54+
stream.on('data', function(data) {
55+
// With resultsBySpeaker, the data events are different in that most include multiple result objects, and
56+
// currently, they are all interim until the last data event.
57+
58+
// The result objects look similar to normal ones, except that they each have a `speaker` key with a numeric
59+
// value. Additionally, extra features, such as alternatives and word alternatives will be lost.
60+
61+
var lines = data.results.map(function(result) {
62+
return '<div class="line speaker-' + result.speaker + '">' +
63+
'<b class="speaker-label">Speaker ' + result.speaker + ':</b> ' +
64+
result.alternatives[0].transcript +
65+
'</div>';
66+
});
67+
68+
document.querySelector('#output').innerHTML = lines.join('\n');
69+
});
70+
71+
stream.on('error', function(err) {
72+
console.log(err);
73+
});
74+
75+
document.querySelector('#stop').onclick = stream.stop.bind(stream);
76+
77+
}).catch(function(error) {
78+
console.log(error);
79+
});
80+
};
81+
82+
</script></code></pre>
83+
84+
</body>
85+
</html>

speech-to-text/index.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ module.exports = {
6262
*/
6363
ResultStream: require('./result-stream'),
6464

65+
/**
66+
* @see SpeakerStream
67+
*/
68+
SpeakerStream: require('./speaker-stream'),
69+
6570
/**
6671
* @see WritableElementStream
6772
*/

speech-to-text/recognize-file.js

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ var TimingStream = require('./timing-stream.js');
2323
var assign = require('object.assign/polyfill')();
2424
var WritableElementStream = require('./writable-element-stream');
2525
var ResultStream = require('./result-stream');
26+
var SpeakerStream = require('./speaker-stream');
2627

2728
/**
2829
* @module watson-speech/speech-to-text/recognize-file
@@ -37,11 +38,12 @@ var ResultStream = require('./result-stream');
3738
* @param {Blob|File} options.data - the raw audio data as a Blob or File instance
3839
* @param {Boolean} [options.play=false] - If a file is set, play it locally as it's being uploaded
3940
* @param {Boolean} [options.format=true] - pipe the text through a {FormatStream} which performs light formatting. Also controls smart_formatting option unless explicitly set.
40-
* @param {Boolena} [options.realtime=options.play] - pipe the text through a {TimingStream} which slows the output down to real-time to match the audio playback.
41+
* @param {Boolean} [options.realtime=options.play] - pipe the text through a {TimingStream} which slows the output down to real-time to match the audio playback. Not currently compatible with resultsBySpeaker option.
4142
* @param {String|DOMElement} [options.outputElement] pipe the text to a WritableElementStream targeting the specified element. Also defaults objectMode to true to enable interim results.
42-
* @param {Boolean} [options.extractResults=false] pipe results through a ResultExtractor stream to simplify the objects. (Default behavior before v0.22) Requires objectMode.
43+
* @param {Boolean} [options.extractResults=false] pipe results through a ResultExtractor stream to simplify the objects. (Default behavior before v0.22) Automatically enables objectMode.
44+
* @param {Boolean} [options.resultsBySpeaker=false] pipe results through a SpeakerStream. Causes each data event to include multiple results, each with a speaker field. Automatically enables objectMode and speaker_labels. Automatically disables the realtime option due to incompatibilities. Adds some delay to processing.
4345
*
44-
* @returns {RecognizeStream|FormatStream|TimingStream}
46+
* @returns {RecognizeStream|SpeakerStream|FormatStream|ResultStream|TimingStream}
4547
*/
4648
module.exports = function recognizeFile(options) { // eslint-disable-line complexity
4749
if (!options || !options.token) {
@@ -53,8 +55,14 @@ module.exports = function recognizeFile(options) { // eslint-disable-line comple
5355
options.objectMode = true;
5456
}
5557
// the ResultExtractor only works in objectMode
56-
if (options.extractResults && options.objectMode !== false) {
58+
if (options.extractResults) {
59+
options.objectMode = true;
60+
}
61+
// SpeakerStream requires objectMode and speaker_labels
62+
if (options.resultsBySpeaker) {
5763
options.objectMode = true;
64+
options.speaker_labels = true;
65+
options.realtime = false;
5866
}
5967

6068
// default format to true (capitals and periods)
@@ -78,19 +86,34 @@ module.exports = function recognizeFile(options) { // eslint-disable-line comple
7886
delete rsOpts.objectMode;
7987

8088

89+
90+
var stream = new BlobStream(options.data);
8191
var recognizeStream = new RecognizeStream(rsOpts);
82-
var stream = new BlobStream(options.data).pipe(recognizeStream);
92+
var streams = [stream, recognizeStream]; // collect all of the streams so that we can bundle up errors and send them to the last one
93+
stream = stream.pipe(recognizeStream);
8394

84-
if (options.format) {
85-
stream = stream.pipe(new FormatStream(options));
86-
}
95+
// note: the TimingStream cannot currently handle results as regrouped by the SpeakerStream
96+
// so it must come first
97+
var timingStream;
8798
if (realtime) {
88-
stream = stream.pipe(new TimingStream(options));
99+
timingStream = new TimingStream(options);
100+
stream = stream.pipe(timingStream);
101+
streams.push(stream);
89102
stream.on('stop', recognizeStream.stop.bind(recognizeStream));
90103
} else {
91104
stream.stop = recognizeStream.stop.bind(recognizeStream);
92105
}
93106

107+
if (options.resultsBySpeaker) {
108+
stream = stream.pipe(new SpeakerStream(options));
109+
streams.push(stream);
110+
}
111+
112+
if (options.format) {
113+
stream = stream.pipe(new FormatStream(options));
114+
streams.push(stream);
115+
}
116+
94117
if (options.play) {
95118
FilePlayer.playFile(options.data).then(function(player) {
96119
recognizeStream.on('stop', player.stop.bind(player));
@@ -101,18 +124,30 @@ module.exports = function recognizeFile(options) { // eslint-disable-line comple
101124
}
102125

103126
if (options.outputElement) {
104-
stream.pipe(new WritableElementStream(options));
127+
// we don't want to return the WES, just send data to it
128+
streams.push(stream.pipe(new WritableElementStream(options)));
105129
}
106130

107131
if (options.extractResults) {
108-
var stop = stream.stop.bind(stream);
132+
var stop = stream.stop ? stream.stop.bind(stream) : recognizeStream.stop.bind(recognizeStream);
109133
stream = stream.pipe(new ResultStream());
110134
stream.stop = stop;
135+
streams.push(stream);
111136
}
112137

113-
// Capture error from original RecognizeStream
114-
if (stream !== recognizeStream) {
115-
recognizeStream.on('error', stream.emit.bind(stream, 'error'));
138+
// Capture errors from any stream except the last one and emit them on the last one
139+
streams.forEach(function(prevStream) {
140+
if (prevStream !== stream) {
141+
prevStream.on('error', stream.emit.bind(stream, 'error'));
142+
}
143+
});
144+
145+
if (!stream.stop) {
146+
if (timingStream) {
147+
stream.stop = timingStream.stop.bind(timingStream);
148+
} else {
149+
stream.stop = recognizeStream.stop.bind(recognizeStream);
150+
}
116151
}
117152

118153
return stream;

0 commit comments

Comments
 (0)