我正在开展一个需要视频配音的项目。我正在寻找一个免费/便宜的选项来获得更自然的语音合成器选项,并遇到了一篇建议使用 Azure TTS 服务的文章。截至 2024 年 1 月 23 日,Azure 认知服务文本转语音服务仍然免费,最多 50 万个字符。非常适合我正在做的事情。
我在 Azure 中注册并创建了 TTS 服务。我选择 en-US-NancyNeural 作为我的主要声音,因为她的“耳语”风格听起来比其他人更好。
我想让窃窃私语的声音比默认情况下更柔和。我认为使用 SSML 是更改 TTS 结果的正确方法。我想知道是否有人可以分享使用这些 SSML 选项让低语更慢、更柔和、更安静(更自然)的经验。虽然默认南希的耳语比其他声音好,但“她”仍然耳语得很快,很大声,哈哈。
什么效果好?什么不起作用?请分享你的经验
这是我的 TTS NodeJS 函数的示例
/**
 * Synthesizes `text` into a WAV file via Azure TTS using the NancyNeural
 * voice in whispering style.
 *
 * @param {string} name - Section label, used for logging and the file name.
 * @param {string} text - Text to synthesize (interpolated into the SSML).
 * @param {string} tempDirectory - Directory the .wav file is written into.
 * @returns {Promise<string>} Resolves with the audio file path; rejects with
 *   the SDK's error details on failure.
 */
async function generateSpeechFromText(name, text, tempDirectory) {
  console.log(`Generating speech from text for section: ${name}`)
  const audioFile = `${tempDirectory}/${name}.wav`

  // Credentials come from the environment; never hard-code keys.
  const speechConfig = TTSSdk.SpeechConfig.fromSubscription(
    process.env.AZURE_TTS_KEY,
    process.env.AZURE_TTS_REGION
  )
  speechConfig.speechSynthesisVoiceName = "en-US-NancyNeural"

  const audioConfig = TTSSdk.AudioConfig.fromAudioFileOutput(audioFile)
  const synthesizer = new TTSSdk.SpeechSynthesizer(speechConfig, audioConfig)

  // mstts:express-as selects the voice-specific "whispering" speaking style.
  const ssml = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="${speechConfig.speechSynthesisVoiceName}">
<mstts:express-as style="whispering">
${text}
</mstts:express-as>
</voice>
</speak>`

  return new Promise((resolve, reject) => {
    const handleResult = (result) => {
      if (result.reason === TTSSdk.ResultReason.SynthesizingAudioCompleted) {
        console.log("Synthesis finished for: " + name)
        resolve(audioFile)
      } else {
        console.error(
          "Speech synthesis failed for: " + name,
          result.errorDetails
        )
        reject(result.errorDetails)
      }
      // Release the synthesizer (and the output file handle) once settled.
      synthesizer.close()
    }

    const handleError = (err) => {
      console.error("Error during synthesis for: " + name, err)
      synthesizer.close()
      reject(err)
    }

    synthesizer.speakSsmlAsync(ssml, handleResult, handleError)
  })
}
另外,下面附上介绍 SSML 结构和事件的官方文档页面链接。
我想让窃窃私语的声音比默认情况下更柔和。我认为使用 SSML 是更改 TTS 结果的正确方法。
SSML(语音合成标记语言)可以利用各种属性来控制合成语音的速度、音量和音调。
<prosody rate="slow">这是缓慢的耳语声。</prosody>
<prosody volume="soft">这是轻柔的耳语声。</prosody>
<prosody pitch="-50%">这是一种低音调的耳语声。</prosody>
完整配置如下:
// Full SSML payload: wraps the whispering style in a prosody element that
// slows the rate, softens the volume, and drops the pitch by 50%.
// NOTE(review): assumes `speechConfig` (with speechSynthesisVoiceName set)
// and `text` are in scope from the surrounding function — confirm placement.
const ssml = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="${speechConfig.speechSynthesisVoiceName}">
<mstts:express-as style="whispering">
<prosody rate="slow" volume="soft" pitch="-50%">
${text}
</prosody>
</mstts:express-as>
</voice>
</speak>`;
const TTSSdk = require("microsoft-cognitiveservices-speech-sdk");
/**
 * Generates a WAV file from `text` using Azure TTS (NancyNeural, whispering
 * style), with prosody controls to make the whisper slower, softer and lower.
 *
 * @param {string} name - Section label, used for logging and the file name.
 * @param {string} text - Text to synthesize (interpolated into the SSML).
 * @param {string} tempDirectory - Directory the .wav file is written into.
 * @returns {Promise<string>} Resolves with the audio file path; rejects with
 *   the SDK's error details on failure.
 */
async function generateSpeechFromText(name, text, tempDirectory) {
  console.log(`Generating speech from text for section: ${name}`);
  const audioFile = `${tempDirectory}/${name}.wav`;
  // Read credentials from the environment rather than hard-coding them:
  // keeps secrets out of source and matches the question's original code.
  const speechConfig = TTSSdk.SpeechConfig.fromSubscription(
    process.env.AZURE_TTS_KEY,
    process.env.AZURE_TTS_REGION
  );
  const audioConfig = TTSSdk.AudioConfig.fromAudioFileOutput(audioFile);
  speechConfig.speechSynthesisVoiceName = "en-US-NancyNeural";
  let synthesizer = new TTSSdk.SpeechSynthesizer(speechConfig, audioConfig);
  // BUG FIX: "//" is NOT a comment inside a template literal. The original
  // `// <prosody ...>` line injected a literal "// " text node into the SSML
  // while </prosody> stayed active, so the payload was malformed. The opening
  // <prosody> tag must actually be emitted to pair with its closing tag.
  const ssml = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="${speechConfig.speechSynthesisVoiceName}">
<mstts:express-as style="whispering">
<prosody rate="slow" volume="soft" pitch="-50%">
${text}
</prosody>
</mstts:express-as>
</voice>
</speak>`;
  return new Promise((resolve, reject) => {
    synthesizer.speakSsmlAsync(
      ssml,
      (result) => {
        if (result.reason === TTSSdk.ResultReason.SynthesizingAudioCompleted) {
          console.log("Synthesis finished for: " + name);
          resolve(audioFile);
        } else {
          console.error(
            "Speech synthesis failed for: " + name,
            result.errorDetails
          );
          reject(result.errorDetails);
        }
        // Close after settling so the output file handle is released.
        synthesizer.close();
      },
      (err) => {
        console.error("Error during synthesis for: " + name, err);
        synthesizer.close();
        reject(err);
      }
    );
  });
}
// Example usage: synthesize one section and report the result.
const tempDirectory = "./output";
const sectionName = "example-modified";
const textToSynthesize = "Hello, this is a test whispering voice.";
generateSpeechFromText(sectionName, textToSynthesize, tempDirectory)
  .then((audioFile) => {
    console.log(`Audio file generated: ${audioFile}`);
  })
  .catch((error) => {
    // BUG FIX: the handler's parameter is `error`; the original referenced
    // the undefined identifiers `name` and `err` here, which would throw a
    // ReferenceError inside the catch and mask the real failure.
    console.error("Error generating speech:", error);
    console.error("Error during synthesis for: " + sectionName, error);
    // `error` may be a plain string (the SDK's errorDetails), so guard the
    // stack access instead of assuming an Error instance.
    console.error("Error stack trace:", error?.stack);
  });
输出:
生成的音频文件: ./output/example-modified.wav