21 Understanding Chrome Extensions Text to Speech

The chrome.tts API converts text into audible speech using a text-to-speech (TTS) engine. It takes advantage of the TTS capabilities that are already built into the operating system.

The official Chrome Extensions documentation on TTS is here: https://developer.chrome.com/docs/extensions/reference/api/tts

How to declare in the Manifest.json

{
...
"permissions":[
"tts"
],
...
}

How to generate Speech from text

Run a simple command in an extension page, the side panel page, or the background script of your extension. Like many other extension APIs, chrome.tts does not work in content scripts.

// Speak the given string; no options object is passed.
chrome.tts.speak('Hello, world.');

Customization to speech

There are quite a number of options that chrome.tts provides for generating different types of speech. The official documentation has a comprehensive list of the options that can be applied: https://developer.chrome.com/docs/extensions/reference/api/tts#type-TtsOptions

// Speak an utterance with every available option spelled out.
chrome.tts.speak('Hello, world.', {
  // All of these options are optional.

  volume: 0.5, // 0.0 - 1.0
  rate: 2.0, // 0.1 - 10.0
  pitch: 1.0, // 0.0 - 2.0
  lang: 'en-US', // in the form language-region. Examples: 'en', 'en-US', 'en-GB', 'zh-CN'.

  // true: queue this utterance behind any speech already in progress.
  // false (the default): interrupt the current speech and flush the queue
  // before speaking this utterance.
  enqueue: true,

  // If empty, uses any available voice.
  voiceName: "",

  // Event types the voice should ideally support (desired) or must
  // support (required). Useful together with the onEvent handler.
  // https://developer.chrome.com/docs/extensions/reference/api/tts#type-EventType
  desiredEventTypes: [],
  requiredEventTypes: [],

  // Event handler, called with a TtsEvent as the utterance progresses.
  onEvent: (ttsEvent) => {
    // charIndex:    index of the current character in the utterance.
    // errorMessage: description of the error, when type is 'error'.
    // length:       length of the next part of the utterance.
    // type:         event type, e.g. start, end, word, sentence,
    //               interrupted, cancelled.
    const { charIndex, errorMessage, length, type } = ttsEvent;
  },
});

Get Voices

// Log the name, language, providing extension and supported event types
// of every voice returned by the TTS system.
chrome.tts.getVoices((voices) => {
  voices.forEach((voice, index) => {
    const { voiceName, lang, extensionId, eventTypes } = voice;
    console.log('Voice ' + index + ':');
    console.log(' name: ' + voiceName);
    console.log(' lang: ' + lang);
    console.log(' extension id: ' + extensionId);
    console.log(' event types: ' + eventTypes);
  });
});

Control Speech

Here are the different functions you can call from the chrome.tts API to control the system speech.

// Ask the TTS system whether speech is currently in progress.
chrome.tts.isSpeaking((speaking) => {
  if (!speaking) {
    return;
  }
  // Todo Code
});

// Pause Speech
chrome.tts.pause();

// Resume Speech
chrome.tts.resume();

// Stop Speech
chrome.tts.stop();

Sample Project

We are going to make a Chrome extension that reads out the text a user submits. In addition, we will use the chrome.contextMenus API so that a user can right-click on selected text and have the system read it out. If you are not familiar with the chrome.contextMenus API, you can read our article on it: Understand Chrome Extensions Context Menus.

Text to speech chrome extension sample project

Alright, we will need 4 files for this project: background.js, manifest.json, popup.html and popup.js.

You can find the source for this project here: https://github.com/BuildChromeExtensions/textToSpeech

manifest.json

{
"name": "Text to Speech",
"version": "1.0.0.0",
"manifest_version": 3,
"permissions": [
"tts",
"contextMenus"
],
"action": {
"default_popup": "popup.html"
},
"background": {
"service_worker": "background.js"
}
}

popup.html

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <!-- Use the extension's name instead of the boilerplate "Document". -->
  <title>Text to Speech</title>
  <style>
    /* Fixed popup size with a vertical gradient background. */
    body {
      width: 300px;
      height: 400px;
      background: linear-gradient(#a2a3a2, #FFAA44);
    }

    /* Stack the form controls in a single column. */
    form {
      justify-content: center;
      display: flex;
      flex-direction: column;
      padding: 10px 20px;
      color: white;
    }

    /* Each label lays out its caption and control on one row. */
    label {
      display: flex;
      width: 100%;
      margin: 4px 0px;
    }

    span {
      margin: 0px 10px;
    }

    input {
      width: 100%;
      padding: 4px 8px;
      border-radius: 30px;
    }

    button {
      background: #764d00;
      color: #ffffff;
      padding: 10px 20px;
      border: none;
      cursor: pointer;
      box-shadow: rgba(100, 100, 111, 0.2) 0px 7px 29px 0px;
      transition: all 0.4s;
      border-radius: 30px;
      font-weight: 700;
      margin: 4px;
    }
  </style>
</head>

<body>
  <!-- Speech form: popup.js reads these fields by name on submit. -->
  <form>
    <h1>Text to Speech</h1>
    <input required placeholder="Text to Read" name="text" maxlength="200" />
    <label><span>Volume</span><input name="volume" type="range" step="0.05" value="0.5" min="0" max="1" /></label>
    <!-- chrome.tts accepts a pitch from 0 to 2 (1 is the default), so expose the full range. -->
    <label><span>Pitch</span><input name="pitch" type="range" step="0.05" value="1" min="0" max="2" /></label>
    <!-- chrome.tts disallows rates below 0.1, so the slider must not reach 0. -->
    <label><span>Rate</span><input name="rate" type="range" step="0.05" value="1" min="0.1" max="10" /></label>
    <label><span>Enqueue</span><input name="enqueue" type="checkbox" /></label>
    <!-- Options are added by popup.js from chrome.tts.getVoices(). -->
    <select name="voiceName"></select>
    <button type="submit">Speak</button>
  </form>
  <!-- Playback controls, wired up by id in popup.js. -->
  <button id="resume">Resume</button>
  <button id="pause">Pause</button>
  <button id="stop">Stop</button>

  <script type="text/javascript" src="/popup.js"></script>
</body>

</html>

popup.js


// On load, add one <option> per available TTS voice to the voice <select>.
chrome.tts.getVoices((voices) => {
  const select = document.querySelector('select');
  for (const { voiceName, lang } of voices) {
    const option = document.createElement('option');
    // Show the voice name once, followed by its language when the voice
    // reports one (lang is optional on a voice).
    option.textContent = lang ? `${voiceName} (${lang})` : voiceName;
    // The value is passed straight through as the voiceName speak option.
    option.value = voiceName;
    select.appendChild(option);
  }
});

// Speak the submitted text using the settings chosen in the form.
document.querySelector('form').onsubmit = (e) => {
  // Keep the popup open: do not let the form navigate/reload.
  e.preventDefault();

  const form = e.target;
  const text = form.text.value;

  // Range inputs report strings; chrome.tts expects numbers.
  const volume = parseFloat(form.volume.value);
  const pitch = parseFloat(form.pitch.value);
  const rate = parseFloat(form.rate.value);

  const enqueue = form.enqueue.checked;
  const voiceName = form.voiceName.value;

  // Every option below is optional.
  const options = {
    volume, // 0.0 - 1.0
    rate, // 0.1 - 10.0
    pitch, // 0.0 - 2.0
    lang: 'en-US', // in the form language-region. Examples: 'en', 'en-US', 'en-GB', 'zh-CN'.

    // When checked, queue behind speech already in progress;
    // otherwise the current speech is interrupted.
    enqueue,

    // If empty, uses any available voice.
    voiceName,

    // Log every TTS event for this utterance.
    onEvent: (ttsEvent) => {
      console.log(ttsEvent)
    },
  };

  chrome.tts.speak(text, options);
}

// Wire each playback button (looked up by id) to its chrome.tts action.
for (const [buttonId, action] of [
  ['pause', () => { chrome.tts.pause(); }],
  ['resume', () => { chrome.tts.resume(); }],
  ['stop', () => { chrome.tts.stop(); }],
]) {
  document.getElementById(buttonId).onclick = action;
}

background.js

// When the extension is installed, create the "Read" context-menu entry
// for the "selection" context.
chrome.runtime.onInstalled.addListener(() => {
  const readMenuItem = {
    id: "speak",
    title: "Read",
    contexts: ["selection"],
  };
  chrome.contextMenus.create(readMenuItem);
});

// Read the selected text aloud when the "speak" context-menu entry is clicked.
chrome.contextMenus.onClicked.addListener((info) => {
  // Ignore clicks on any other menu item.
  if (info.menuItemId !== 'speak') {
    return;
  }

  // get text
  const text = info.selectionText;

  // Nothing to read if the click carried no selected text.
  if (!text) {
    return;
  }

  // read out text
  chrome.tts.speak(text);
});

If you’re interested you can implement a text-to-speech engine using the ttsEngine API. The documentation for that is here.

Leave a Reply

Your email address will not be published. Required fields are marked *

More Articles & Posts