docs: update Chinese documentation and add Vue components

- Update AI capability dictionary by removing redundant mention of Baidu's model
- Add new Vue components for context engineering visualization (IntroProblemReasonSolution, MemoryPalaceDemo, MemoryPalaceActionDemo, KVCacheDemo, LostInMiddleDemo)
- Register new components in theme index.js
- Enhance audio introduction with new interactive demos (AudioQuickStartDemo, MelSpectrogramDemo, TTSPipelineDemo, VoiceCloningDemo, ASRvsTTSDemo, AudioTokenizationDemo, EmotionControlDemo)
- Improve existing context engineering demos with Chinese localization and better tokenization
- Fix Japanese documentation layout by properly closing NavGrid components
This commit is contained in:
sanbuphy
2026-02-03 19:41:14 +08:00
parent e5b1c6cc88
commit 084ebed417
30 changed files with 11563 additions and 2126 deletions
@@ -0,0 +1,789 @@
<!--
ASRvsTTSDemo.vue
ASR TTS 双向转换演示组件
用途
展示语音识别(ASR)和语音合成(TTS)的互逆过程
-->
<template>
<div class="asr-tts-demo">
<div class="header">
<div class="title">🔄 ASR TTS语音的双向转换</div>
<div class="subtitle">
探索语音识别和语音合成的互逆过程
</div>
</div>
<div class="conversion-flow">
<!-- ASR 区域 -->
<div class="flow-section">
<div class="section-header">
<span class="section-icon">🎙</span>
<div>
<div class="section-name">ASR 语音识别</div>
<div class="section-desc">音频 文本</div>
</div>
</div>
<div class="demo-box">
<div class="input-area">
<button
class="record-btn"
:class="{ recording: isRecording }"
@click="toggleRecording"
>
<span class="record-icon">{{ isRecording ? '⏹' : '🎤' }}</span>
<span>{{ isRecording ? '停止录音' : '开始录音' }}</span>
</button>
<div class="or-text"></div>
<button class="upload-audio-btn" @click="uploadAudio">
📁 上传音频
</button>
</div>
<div v-if="recordedAudio" class="audio-preview">
<canvas ref="inputWaveform" width="300" height="60"></canvas>
</div>
<button
class="process-btn"
:disabled="!recordedAudio || isProcessingASR"
@click="processASR"
>
<span v-if="isProcessingASR" class="spinner"></span>
<span v-else>🔍 识别语音</span>
</button>
<div v-if="asrResult" class="result-box">
<div class="result-label">识别结果</div>
<div class="result-text">{{ asrResult }}</div>
<div class="result-meta">
<span>置信度: {{ asrConfidence }}%</span>
<span>耗时: {{ asrTime }}ms</span>
</div>
</div>
</div>
</div>
<!-- 中间转换 -->
<div class="flow-arrow">
<div class="arrow-line"></div>
<div class="arrow-btns">
<button
class="arrow-btn"
:class="{ active: direction === 'asr' }"
@click="direction = 'asr'"
>
ASR
</button>
<button
class="arrow-btn"
:class="{ active: direction === 'tts' }"
@click="direction = 'tts'"
>
TTS
</button>
</div>
</div>
<!-- TTS 区域 -->
<div class="flow-section">
<div class="section-header">
<span class="section-icon">🔊</span>
<div>
<div class="section-name">TTS 语音合成</div>
<div class="section-desc">文本 音频</div>
</div>
</div>
<div class="demo-box">
<div class="input-area">
<textarea
v-model="ttsInput"
placeholder="输入要合成的文本..."
rows="3"
></textarea>
</div>
<div class="voice-select">
<label>选择声音:</label>
<div class="voice-options">
<button
v-for="voice in voices"
:key="voice.id"
class="voice-btn"
:class="{ active: selectedVoice === voice.id }"
@click="selectedVoice = voice.id"
>
{{ voice.icon }} {{ voice.name }}
</button>
</div>
</div>
<button
class="process-btn tts"
:disabled="!ttsInput.trim() || isProcessingTTS"
@click="processTTS"
>
<span v-if="isProcessingTTS" class="spinner"></span>
<span v-else>🗣 合成语音</span>
</button>
<div v-if="ttsResult" class="result-box audio-result">
<div class="result-label">合成结果</div>
<canvas ref="outputWaveform" width="300" height="60"></canvas>
<div class="audio-controls">
<!-- Play/pause toggle. The button is icon-only, so its accessible name comes from aria-label. -->
<button
  class="play-btn"
  type="button"
  :aria-label="playing ? '暂停' : '播放'"
  @click="playResult"
>
  {{ playing ? '⏸' : '▶' }}
</button>
<div class="progress-bar">
<div class="progress" :style="{ width: playProgress + '%' }"></div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="comparison-section">
<div class="comp-title">📊 ASR vs TTS 对比</div>
<div class="comp-grid">
<div class="comp-card">
<div class="comp-icon">🎙</div>
<div class="comp-name">ASR</div>
<div class="comp-items">
<div class="comp-item">
<span class="label">输入:</span>
<span>音频波形</span>
</div>
<div class="comp-item">
<span class="label">输出:</span>
<span>文本序列</span>
</div>
<div class="comp-item">
<span class="label">难点:</span>
<span>噪声口音同音词</span>
</div>
</div>
</div>
<div class="comp-card">
<div class="comp-icon">🔊</div>
<div class="comp-name">TTS</div>
<div class="comp-items">
<div class="comp-item">
<span class="label">输入:</span>
<span>文本序列</span>
</div>
<div class="comp-item">
<span class="label">输出:</span>
<span>音频波形</span>
</div>
<div class="comp-item">
<span class="label">难点:</span>
<span>韵律情感自然度</span>
</div>
</div>
</div>
</div>
</div>
<div class="pipeline-comparison">
<div class="pipe-title">🔀 架构对比</div>
<!-- Stage-by-stage diagrams of both pipelines. The arrows between stages are
     purely visual separators, so they are hidden from assistive technology;
     the reading order already follows the DOM order of the steps. -->
<div class="pipeline-diagram">
  <div class="pipeline asr-pipe">
    <div class="pipe-label">ASR Pipeline</div>
    <div class="pipe-flow">
      <div class="pipe-step">音频</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">特征</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">Encoder</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">Decoder</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step output">文本</div>
    </div>
  </div>
  <div class="pipeline tts-pipe">
    <div class="pipe-label">TTS Pipeline</div>
    <div class="pipe-flow">
      <div class="pipe-step">文本</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">Encoder</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">Decoder</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step">声码器</div>
      <span aria-hidden="true">→</span>
      <div class="pipe-step output">音频</div>
    </div>
  </div>
</div>
</div>
<div class="info-box">
<span class="icon">💡</span>
<p>
<strong>互逆关系</strong>
ASR TTS 是语音技术的两个核心方向互为逆过程
ASR 将连续的音频信号转换为离散的文本TTS 则将离散的文本转换为连续的音频信号
两者都依赖于声学模型和语言模型
</p>
</div>
</div>
</template>
<script setup>
import { ref, onMounted, watch } from 'vue'
// Direction highlighted by the ASR/TTS toggle buttons in the middle column.
const direction = ref('asr')

// --- ASR panel state ---
// True while the record button is in its "recording" state.
const isRecording = ref(false)
// Boolean flag, not audio data: set once a recording or upload has been
// simulated. Gates the preview canvas and enables the recognize button.
const recordedAudio = ref(false)
// True while the simulated recognition is pending (shows the spinner).
const isProcessingASR = ref(false)
// Recognized text; an empty string hides the result box.
const asrResult = ref('')
// Confidence shown in the result box, rendered with a "%" suffix.
const asrConfidence = ref(0)
// Elapsed time shown in the result box, rendered with an "ms" suffix.
const asrTime = ref(0)

// --- TTS panel state ---
// Text bound to the synthesis textarea.
const ttsInput = ref('')
// id of the currently selected entry in `voices`.
const selectedVoice = ref('default')
// True while the simulated synthesis is pending (shows the spinner).
const isProcessingTTS = ref(false)
// Boolean flag: true once a synthesis result is available to display.
const ttsResult = ref(false)
// True while the simulated playback is running.
const playing = ref(false)
// Playback progress, used as the percentage width of the progress bar.
const playProgress = ref(0)
// Voice options rendered as selectable buttons.
const voices = [
{ id: 'default', name: '默认', icon: '🎙️' },
{ id: 'male', name: '男声', icon: '👨' },
{ id: 'female', name: '女声', icon: '👩' },
{ id: 'child', name: '童声', icon: '🧒' }
]
// Template refs to the ASR input and TTS output waveform canvases. Both
// canvases are rendered conditionally, so these are null until they exist.
const inputWaveform = ref(null)
const outputWaveform = ref(null)
// Toggles the simulated recording state. When a recording is stopped, flags
// audio as available and draws the placeholder input waveform.
const toggleRecording = () => {
  isRecording.value = !isRecording.value
  if (!isRecording.value) {
    recordedAudio.value = true
    // The preview <canvas> is rendered behind v-if="recordedAudio", so it is
    // not in the DOM (and the template ref is still null) until Vue has
    // re-rendered. Drawing synchronously here would silently do nothing on
    // the first recording, so defer the draw with the same delay that
    // uploadAudio uses.
    setTimeout(() => drawWaveform(inputWaveform.value), 100)
  }
}
// Simulates an audio upload: flags audio as available, then paints the
// placeholder waveform 100 ms later, once the preview canvas can exist.
const uploadAudio = () => {
  const paintInputPreview = () => drawWaveform(inputWaveform.value)

  recordedAudio.value = true
  setTimeout(paintInputPreview, 100)
}
// Paints a placeholder waveform onto `canvas`: a sine curve with random
// jitter, sampled every 2 pixels across the canvas width.
// Does nothing when `canvas` is null/undefined (e.g. the ref is not mounted).
const drawWaveform = (canvas) => {
  if (canvas) {
    const pen = canvas.getContext('2d')
    const { width, height } = canvas

    pen.clearRect(0, 0, width, height)
    pen.strokeStyle = '#409eff'
    pen.lineWidth = 2
    pen.beginPath()

    let col = 0
    while (col < width) {
      const jitter = (Math.random() - 0.5) * 10
      const row = height / 2 + Math.sin(col * 0.1) * 20 + jitter
      // The first sample starts the path; every later one extends it.
      const plot = col === 0 ? 'moveTo' : 'lineTo'
      pen[plot](col, row)
      col += 2
    }

    pen.stroke()
  }
}
// Simulates speech recognition: shows the spinner, clears the previous
// result, and after 1.5 s publishes a canned transcript with its metadata.
// The transcript is also copied into the TTS input.
const processASR = () => {
  const finishRecognition = () => {
    isProcessingASR.value = false
    asrResult.value = '这是一段示例语音识别结果,展示了 ASR 的工作效果。'
    asrConfidence.value = 94
    asrTime.value = 320
    ttsInput.value = asrResult.value
  }

  isProcessingASR.value = true
  asrResult.value = ''
  setTimeout(finishRecognition, 1500)
}
// Simulates speech synthesis: shows the spinner, hides the previous result,
// and after 1.5 s reveals the result box. The output waveform is painted a
// further 100 ms later, once the result canvas has had time to render.
const processTTS = () => {
  const paintOutputPreview = () => drawWaveform(outputWaveform.value)

  const finishSynthesis = () => {
    isProcessingTTS.value = false
    ttsResult.value = true
    setTimeout(paintOutputPreview, 100)
  }

  isProcessingTTS.value = true
  ttsResult.value = false
  setTimeout(finishSynthesis, 1500)
}
// Handle of the running progress timer, or null when nothing is playing.
let playbackTimer = null

// Stops the progress timer if one is running.
const stopPlaybackTimer = () => {
  if (playbackTimer !== null) {
    clearInterval(playbackTimer)
    playbackTimer = null
  }
}

// Toggles the simulated playback of the synthesized audio.
// Starting playback resets the progress bar and advances it by 2% every
// 100 ms until it reaches 100%, at which point playback ends and the bar
// resets. Pausing stops the timer and leaves the bar where it is.
const playResult = () => {
  playing.value = !playing.value

  // Always drop the previous timer first. Without this, pausing left the
  // interval running (the bar kept moving while "paused"), and a
  // pause-then-play started a second interval alongside the first.
  stopPlaybackTimer()
  if (!playing.value) return

  playProgress.value = 0
  playbackTimer = setInterval(() => {
    playProgress.value += 2
    if (playProgress.value >= 100) {
      playing.value = false
      playProgress.value = 0
      stopPlaybackTimer()
    }
  }, 100)
}
// On mount, paint the input waveform only if audio is already flagged as
// available.
onMounted(() => {
  if (!recordedAudio.value) return
  drawWaveform(inputWaveform.value)
})
</script>
<style scoped>
.asr-tts-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 24px 0;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.header {
text-align: center;
margin-bottom: 24px;
}
/* Gradient headline: the gradient background is clipped to the glyph shapes
   and the text fill is made transparent so the gradient shows through. */
.title {
  font-size: 18px;
  font-weight: 700;
  margin-bottom: 8px;
  background: linear-gradient(120deg, #409eff, #67c23a);
  -webkit-background-clip: text;
  /* Standard property, declared after the prefixed one so it wins where supported. */
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
.subtitle {
font-size: 14px;
color: var(--vp-c-text-2);
}
.conversion-flow {
display: grid;
grid-template-columns: 1fr auto 1fr;
gap: 20px;
margin-bottom: 24px;
}
.flow-section {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
}
.section-header {
display: flex;
align-items: center;
gap: 12px;
margin-bottom: 16px;
}
.section-icon {
font-size: 32px;
}
.section-name {
font-weight: 600;
}
.section-desc {
font-size: 12px;
color: var(--vp-c-text-3);
}
.demo-box {
display: flex;
flex-direction: column;
gap: 12px;
}
.input-area {
display: flex;
flex-direction: column;
gap: 8px;
}
.record-btn {
padding: 16px;
background: var(--vp-c-bg-soft);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
font-size: 14px;
transition: all 0.2s;
}
.record-btn:hover {
border-color: #f56c6c;
}
.record-btn.recording {
background: #f56c6c;
color: white;
border-color: #f56c6c;
animation: pulse 1.5s infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.7; }
}
.record-icon {
font-size: 20px;
}
.or-text {
text-align: center;
font-size: 12px;
color: var(--vp-c-text-3);
}
.upload-audio-btn {
padding: 12px;
background: var(--vp-c-bg-soft);
border: 1px dashed var(--vp-c-divider);
border-radius: 8px;
cursor: pointer;
color: var(--vp-c-text-2);
}
.audio-preview {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 12px;
}
.audio-preview canvas {
width: 100%;
height: auto;
}
.process-btn {
padding: 12px;
background: var(--vp-c-brand);
color: white;
border: none;
border-radius: 8px;
cursor: pointer;
font-weight: 500;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
.process-btn.tts {
background: #67c23a;
}
.process-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.spinner {
width: 16px;
height: 16px;
border: 2px solid rgba(255,255,255,0.3);
border-top-color: white;
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.result-box {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 16px;
border: 1px solid var(--vp-c-divider);
}
.result-label {
font-size: 12px;
color: var(--vp-c-text-3);
margin-bottom: 8px;
}
.result-text {
font-size: 14px;
line-height: 1.5;
}
.result-meta {
display: flex;
gap: 16px;
margin-top: 12px;
font-size: 12px;
color: var(--vp-c-text-3);
}
textarea {
width: 100%;
padding: 12px;
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg-soft);
font-size: 14px;
resize: vertical;
}
.voice-select {
display: flex;
flex-direction: column;
gap: 8px;
}
.voice-select label {
font-size: 12px;
color: var(--vp-c-text-3);
}
.voice-options {
display: flex;
gap: 8px;
flex-wrap: wrap;
}
.voice-btn {
padding: 8px 12px;
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 6px;
cursor: pointer;
font-size: 13px;
}
.voice-btn.active {
background: #67c23a;
color: white;
border-color: #67c23a;
}
.audio-result canvas {
width: 100%;
height: auto;
margin-bottom: 12px;
}
.audio-controls {
display: flex;
align-items: center;
gap: 12px;
}
.play-btn {
width: 36px;
height: 36px;
border-radius: 50%;
border: none;
background: #67c23a;
color: white;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
}
.progress-bar {
flex: 1;
height: 6px;
background: var(--vp-c-bg);
border-radius: 3px;
overflow: hidden;
}
.progress {
height: 100%;
background: #67c23a;
transition: width 0.1s;
}
.flow-arrow {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
gap: 12px;
}
.arrow-line {
width: 2px;
height: 100px;
background: var(--vp-c-divider);
}
.arrow-btns {
display: flex;
flex-direction: column;
gap: 8px;
}
.arrow-btn {
padding: 8px 16px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 20px;
cursor: pointer;
font-size: 12px;
}
.arrow-btn.active {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
.comparison-section {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.comp-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
.comp-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 16px;
}
.comp-card {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
text-align: center;
}
.comp-icon {
font-size: 32px;
margin-bottom: 8px;
}
.comp-name {
font-weight: 600;
margin-bottom: 12px;
}
.comp-items {
display: flex;
flex-direction: column;
gap: 8px;
text-align: left;
}
.comp-item {
font-size: 13px;
}
.comp-item .label {
color: var(--vp-c-text-3);
}
.pipeline-comparison {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.pipe-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
.pipeline-diagram {
display: flex;
flex-direction: column;
gap: 20px;
}
.pipeline {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 16px;
}
.pipe-label {
font-size: 12px;
color: var(--vp-c-text-3);
margin-bottom: 12px;
}
.pipe-flow {
display: flex;
align-items: center;
gap: 8px;
flex-wrap: wrap;
justify-content: center;
}
.pipe-step {
padding: 8px 12px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 6px;
font-size: 12px;
}
.pipe-step.output {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
.info-box {
display: flex;
gap: 12px;
padding: 16px;
background: var(--vp-c-bg-mute);
border-radius: 8px;
font-size: 13px;
line-height: 1.6;
}
.info-box .icon {
font-size: 18px;
flex-shrink: 0;
}
@media (max-width: 768px) {
.conversion-flow {
grid-template-columns: 1fr;
}
.flow-arrow {
flex-direction: row;
}
.arrow-line {
width: 100px;
height: 2px;
}
.arrow-btns {
flex-direction: row;
}
}
</style>
File diff suppressed because it is too large Load Diff
@@ -1,318 +1,687 @@
<!--
AudioTokenizationDemo.vue
音频 Tokenization 演示组件
用途
展示音频如何通过神经编解码器( EnCodecSoundStream)被压缩成离散的 Token
交互功能
- 音频压缩/解压流程
- 不同码率对比
- Token 可视化
- 重建质量评估
-->
<template>
<div class="tokenization-demo">
<div class="audio-tokenization-demo">
<el-card shadow="never">
<div class="controls">
<el-button type="primary" @click="playDemo" :loading="isPlaying">
<el-icon><VideoPlay /></el-icon> 演示处理流程
</el-button>
</div>
<el-steps
:active="activeStep"
align-center
finish-status="success"
class="steps"
>
<el-step title="音频信号" description="连续波形" />
<el-step title="切片 (Chunking)" description="20ms/帧" />
<el-step title="量化 (Quantization)" description="查字典" />
<el-step title="Token 序列" description="离散数字" />
</el-steps>
<div class="stage-display">
<!-- Stage 0: Audio -->
<div v-if="activeStep === 0" class="stage-content audio-stage">
<div class="waveform-viz">
<div
class="wave-bar"
v-for="n in 20"
:key="n"
:style="{
height: 30 + Math.random() * 50 + '%',
animationDelay: n * 0.1 + 's'
}"
></div>
</div>
<div class="stage-desc">原始的连续模拟信号或高采样率数字信号</div>
<template #header>
<div class="header-title">
<el-icon><Grid /></el-icon>
<span>🎵 音频 Tokenization神经编解码器</span>
</div>
</template>
<!-- Stage 1: Chunks -->
<div v-if="activeStep === 1" class="stage-content chunks-stage">
<div class="chunks-container">
<div class="chunk-item" v-for="n in 5" :key="n">
<span class="chunk-label">Frame {{ n }}</span>
</div>
</div>
<div class="stage-desc">
将音频切分为固定长度的小片段例如 20ms
</div>
</div>
<!-- Stage 2: Codebook -->
<div v-if="activeStep === 2" class="stage-content codebook-stage">
<div class="codebook-grid">
<div
class="codebook-entry"
v-for="n in 9"
:key="n"
:class="{ highlight: n === currentMatch }"
>
{{ 1024 + n * 50 }}
</div>
</div>
<div class="stage-desc">
在预训练的"声音字典"中寻找最接近的特征向量
</div>
</div>
<!-- Stage 3: Tokens -->
<div v-if="activeStep === 3" class="stage-content token-stage">
<div class="token-list">
<el-tag
v-for="(token, index) in tokens"
:key="index"
effect="dark"
size="large"
class="token-tag"
>
{{ token }}
</el-tag>
</div>
<div class="stage-desc">最终转换为 GPT 可以理解的数字序列</div>
</div>
</div>
<el-divider />
<div class="comparison-box">
<el-row :gutter="20">
<el-col :span="12">
<div class="compare-card">
<div class="compare-title">文本 GPT</div>
<div class="compare-content">
<el-tag type="info"></el-tag>
<el-tag type="info"></el-tag>
<el-tag type="info"></el-tag>
<el-tag type="info"></el-tag>
<div class="demo-content">
<!-- 流程图 -->
<div class="codec-flow">
<div class="flow-section encode">
<div class="section-title">🔽 编码器 (Encoder)</div>
<div class="flow-steps">
<div class="codec-step">
<div class="step-visual">
<canvas ref="originalWaveformCanvas" width="150" height="60" />
</div>
<div class="step-label">原始波形</div>
<div class="step-meta">24kHz, 16-bit</div>
</div>
<el-icon class="flow-arrow"><ArrowRight /></el-icon>
<div class="codec-step">
<div class="step-visual">
<div class="cnn-layers">
<div class="cnn-layer" v-for="i in 4" :key="i" :style="{ opacity: 0.3 + i * 0.2 }">
Conv {{ i }}
</div>
</div>
</div>
<div class="step-label">CNN 下采样</div>
<div class="step-meta">降维 320x</div>
</div>
<el-icon class="flow-arrow"><ArrowRight /></el-icon>
<div class="codec-step">
<div class="step-visual">
<div class="vq-codebook">
<div class="codebook-grid">
<div
v-for="i in 16"
:key="i"
class="codebook-cell"
:class="{ active: i <= 4 }"
/>
</div>
</div>
</div>
<div class="step-label">VQ 量化</div>
<div class="step-meta">离散 Token</div>
</div>
</div>
</el-col>
<el-col :span="12">
<div class="compare-card highlight-border">
<div class="compare-title">音频 GPT</div>
<div class="compare-content">
<el-tag type="warning">1024</el-tag>
<el-tag type="warning">5678</el-tag>
<el-tag type="warning">2340</el-tag>
<el-tag type="warning">8901</el-tag>
</div>
<div class="flow-divider">
<div class="divider-line"></div>
<div class="divider-label">压缩后: ~1.5 kbps</div>
<div class="divider-line"></div>
</div>
<div class="flow-section decode">
<div class="section-title">🔼 解码器 (Decoder)</div>
<div class="flow-steps reverse">
<div class="codec-step">
<div class="step-visual">
<div class="token-sequence">
<span
v-for="(token, i) in [42, 128, 7, 255, 33, 91]"
:key="i"
class="token"
:style="{ background: `hsl(${token}, 70%, 50%)` }"
>
{{ token }}
</span>
</div>
</div>
<div class="step-label">离散 Token</div>
<div class="step-meta">Codebook 索引</div>
</div>
<el-icon class="flow-arrow"><ArrowRight /></el-icon>
<div class="codec-step">
<div class="step-visual">
<div class="cnn-layers">
<div class="cnn-layer" v-for="i in 4" :key="i" :style="{ opacity: 1 - i * 0.15 }">
ConvT {{ 5 - i }}
</div>
</div>
</div>
<div class="step-label">转置卷积</div>
<div class="step-meta">上采样</div>
</div>
<el-icon class="flow-arrow"><ArrowRight /></el-icon>
<div class="codec-step">
<div class="step-visual">
<canvas ref="reconstructedWaveformCanvas" width="150" height="60" />
</div>
<div class="step-label">重建波形</div>
<div class="step-meta">24kHz</div>
</div>
</div>
</el-col>
</el-row>
</div>
</div>
<!-- 码率对比 -->
<div class="bitrate-comparison">
<div class="comparison-title">📊 不同码率对比</div>
<div class="bitrate-cards">
<div
v-for="config in bitrateConfigs"
:key="config.name"
class="bitrate-card"
:class="{ active: selectedBitrate === config.name }"
@click="selectedBitrate = config.name"
>
<div class="bitrate-value">{{ config.bitrate }}</div>
<div class="bitrate-name">{{ config.name }}</div>
<div class="bitrate-detail">
<div class="detail-item">
<span class="label">采样率:</span>
<span>{{ config.sampleRate }}</span>
</div>
<div class="detail-item">
<span class="label">帧率:</span>
<span>{{ config.frameRate }}</span>
</div>
<div class="detail-item">
<span class="label">码本大小:</span>
<span>{{ config.codebookSize }}</span>
</div>
</div>
<el-rate
v-model="config.quality"
disabled
show-score
text-color="#ff9900"
/>
</div>
</div>
</div>
<!-- Token 可视化 -->
<div class="token-visualization">
<div class="viz-title">🔢 Token 序列可视化</div>
<div class="token-display">
<div class="token-ruler">
<!-- Time ruler labels in 0.1 s steps. toFixed(1) keeps binary floating-point
     noise (e.g. 3 * 0.1 = 0.30000000000000004) out of the rendered label. -->
<span v-for="i in 20" :key="i" class="ruler-mark">{{ (i * 0.1).toFixed(1) }}s</span>
</div>
<div class="token-stream">
<div
v-for="(token, i) in tokenSequence"
:key="i"
class="token-block"
:style="{
background: `hsl(${token % 360}, 70%, ${50 + (token % 20)}%)`,
height: `${20 + (token % 30)}px`
}"
:title="`Token: ${token}`"
/>
</div>
</div>
<div class="token-legend">
<span class="legend-item">
<span class="legend-color" style="background: #409eff"></span>
低频成分
</span>
<span class="legend-item">
<span class="legend-color" style="background: #67c23a"></span>
中频成分
</span>
<span class="legend-item">
<span class="legend-color" style="background: #e6a23c"></span>
高频成分
</span>
</div>
</div>
<!-- 应用场景 -->
<div class="applications">
<div class="apps-title">🎯 为什么需要音频 Tokenization</div>
<div class="apps-grid">
<div class="app-card">
<div class="app-icon">🚀</div>
<div class="app-title">高效传输</div>
<div class="app-desc">
将音频压缩到 ~1.5 kbps比原始音频小 256 适合网络传输
</div>
</div>
<div class="app-card">
<div class="app-icon">🧠</div>
<div class="app-title">语言模型友好</div>
<div class="app-desc">
离散 Token 可以被 LLM 直接处理实现文本到音频的统一建模
</div>
</div>
<div class="app-card">
<div class="app-icon">🎵</div>
<div class="app-title">音乐生成</div>
<div class="app-desc">
MusicGenAudioLDM 等模型使用音频 Token 生成音乐和音效
</div>
</div>
<div class="app-card">
<div class="app-icon">🗣</div>
<div class="app-title">语音合成</div>
<div class="app-desc">
VALL-ESoundStorm TTS 模型直接生成音频 Token
</div>
</div>
</div>
</div>
</div>
<el-alert
title="为什么要做 Tokenization?"
type="warning"
:closable="false"
description="因为 GPT 本质上是一个'预测下一个数字'的机器。只有把连续的声音变成离散的数字,才能用 GPT 来生成音频。"
show-icon
/>
<div class="info-box">
<p>
<span class="icon">💡</span>
<strong>神经音频编解码器</strong>
EnCodec (Meta)SoundStream (Google)SNAC 等模型使用 VQ-VAE 架构将音频压缩成离散 Token这些 Token 可以被语言模型处理实现高质量的音频生成和压缩
</p>
</div>
</el-card>
</div>
</template>
<script setup>
import { ref } from 'vue'
import { VideoPlay } from '@element-plus/icons-vue'
import { ref, onMounted } from 'vue'
import { Grid, ArrowRight } from '@element-plus/icons-vue'
const activeStep = ref(0)
const isPlaying = ref(false)
const currentMatch = ref(0)
const tokens = [1024, 5678, 2340, 8901, 3342]
const selectedBitrate = ref('EnCodec-24k')
const originalWaveformCanvas = ref(null)
const reconstructedWaveformCanvas = ref(null)
const playDemo = async () => {
if (isPlaying.value) return
isPlaying.value = true
activeStep.value = 0
// Step 0 -> 1
await wait(1000)
activeStep.value = 1
// Step 1 -> 2
await wait(1500)
activeStep.value = 2
// Simulate codebook matching
for (let i = 0; i < 5; i++) {
currentMatch.value = Math.floor(Math.random() * 9) + 1
await wait(200)
const bitrateConfigs = [
{
name: 'EnCodec-24k',
bitrate: '1.5 kbps',
sampleRate: '24 kHz',
frameRate: '75 Hz',
codebookSize: '1024',
quality: 4
},
{
name: 'EnCodec-48k',
bitrate: '3.0 kbps',
sampleRate: '48 kHz',
frameRate: '75 Hz',
codebookSize: '1024',
quality: 5
},
{
name: 'SoundStream',
bitrate: '6.0 kbps',
sampleRate: '16 kHz',
frameRate: '50 Hz',
codebookSize: '1024',
quality: 4.5
},
{
name: 'SNAC',
bitrate: '0.98 kbps',
sampleRate: '24 kHz',
frameRate: '43 Hz',
codebookSize: '4096',
quality: 4
}
currentMatch.value = 0
]
// Step 2 -> 3
activeStep.value = 3
// Simulated token sequence: 50 random integers in the range 0..1023,
// generated once when this script runs (not reactive, not re-rolled).
const tokenSequence = Array.from({ length: 50 }, () => Math.floor(Math.random() * 1024))
isPlaying.value = false
// Paints a synthetic waveform (the sum of two sine components) onto `canvas`,
// then overlays a light horizontal center line.
//   canvas  - target <canvas> element; null/undefined is ignored.
//   isNoisy - when true, adds random jitter to every sample (used for the
//             "reconstructed" preview).
const drawWaveform = (canvas, isNoisy = false) => {
  if (canvas) {
    const pen = canvas.getContext('2d')
    const { width, height } = canvas
    pen.clearRect(0, 0, width, height)

    // Waveform trace, one sample per pixel column.
    pen.strokeStyle = '#409eff'
    pen.lineWidth = 1.5
    pen.beginPath()
    let col = 0
    while (col < width) {
      const phase = col / width
      const clean =
        height / 2 +
        Math.sin(phase * Math.PI * 8) * 15 +
        Math.sin(phase * Math.PI * 16) * 10
      const row = isNoisy ? clean + (Math.random() - 0.5) * 8 : clean
      // The first sample starts the path; every later one extends it.
      const plot = col === 0 ? 'moveTo' : 'lineTo'
      pen[plot](col, row)
      col += 1
    }
    pen.stroke()

    // Center line.
    pen.strokeStyle = '#e0e0e0'
    pen.lineWidth = 1
    pen.beginPath()
    pen.moveTo(0, height / 2)
    pen.lineTo(width, height / 2)
    pen.stroke()
  }
}
const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
// Once mounted, paint both previews: the clean original first, then the
// jittered reconstruction.
onMounted(() => {
  const previews = [
    [originalWaveformCanvas, false],
    [reconstructedWaveformCanvas, true]
  ]
  for (const [canvasRef, noisy] of previews) {
    drawWaveform(canvasRef.value, noisy)
  }
})
</script>
<style scoped>
.tokenization-demo {
margin: 20px 0;
.audio-tokenization-demo {
margin: 1rem 0;
}
.controls {
text-align: center;
margin-bottom: 20px;
}
.steps {
margin-bottom: 30px;
}
.stage-display {
background: var(--el-fill-color-light);
border-radius: 8px;
padding: 30px;
min-height: 200px;
.header-title {
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
font-weight: 600;
}
.demo-content {
display: flex;
flex-direction: column;
gap: 24px;
}
.stage-content {
.codec-flow {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.flow-section {
margin-bottom: 16px;
}
.section-title {
font-weight: 500;
margin-bottom: 16px;
color: var(--vp-c-brand);
}
.flow-steps {
display: flex;
align-items: center;
justify-content: center;
gap: 16px;
flex-wrap: wrap;
}
.flow-steps.reverse {
flex-direction: row-reverse;
}
.codec-step {
text-align: center;
min-width: 120px;
}
.step-visual {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 12px;
margin-bottom: 8px;
min-height: 80px;
display: flex;
align-items: center;
justify-content: center;
}
.step-visual canvas {
width: 100%;
height: auto;
}
.stage-desc {
margin-top: 15px;
color: var(--el-text-color-secondary);
font-size: 0.9em;
.step-label {
font-weight: 500;
font-size: 0.875rem;
}
/* Audio Stage */
.waveform-viz {
height: 80px;
.step-meta {
font-size: 0.75rem;
color: var(--vp-c-text-3);
}
.flow-arrow {
color: var(--vp-c-text-3);
}
.cnn-layers {
display: flex;
align-items: center;
justify-content: center;
gap: 3px;
flex-direction: column;
gap: 4px;
}
.wave-bar {
width: 6px;
background: var(--el-color-primary);
border-radius: 3px;
animation: wave 1s ease-in-out infinite;
}
@keyframes wave {
0%,
100% {
height: 30%;
opacity: 0.5;
}
50% {
height: 100%;
opacity: 1;
}
}
/* Chunks Stage */
.chunks-container {
display: flex;
gap: 5px;
justify-content: center;
}
.chunk-item {
width: 60px;
height: 60px;
background: var(--el-color-primary-light-8);
border: 1px solid var(--el-color-primary);
.cnn-layer {
background: #409eff;
color: white;
padding: 4px 8px;
border-radius: 4px;
display: flex;
align-items: center;
justify-content: center;
font-size: 0.7rem;
}
.chunk-label {
font-size: 10px;
color: var(--el-color-primary);
.vq-codebook {
padding: 8px;
}
/* Codebook Stage */
.codebook-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 10px;
max-width: 300px;
margin: 0 auto;
grid-template-columns: repeat(4, 1fr);
gap: 4px;
}
.codebook-entry {
padding: 10px;
background: var(--el-bg-color);
border: 1px solid var(--el-border-color);
.codebook-cell {
width: 16px;
height: 16px;
background: #e0e0e0;
border-radius: 2px;
}
.codebook-cell.active {
background: #67c23a;
}
.token-sequence {
display: flex;
gap: 4px;
flex-wrap: wrap;
max-width: 120px;
}
.token {
padding: 2px 6px;
border-radius: 4px;
font-family: monospace;
transition: all 0.3s;
}
.codebook-entry.highlight {
background: var(--el-color-warning);
font-size: 0.7rem;
color: white;
transform: scale(1.1);
border-color: var(--el-color-warning);
}
/* Token Stage */
.token-list {
display: flex;
gap: 10px;
justify-content: center;
flex-wrap: wrap;
}
.token-tag {
font-family: monospace;
font-weight: bold;
}
.comparison-box {
margin-top: 20px;
margin-bottom: 20px;
}
.compare-card {
background: var(--el-bg-color-page);
padding: 15px;
border-radius: 8px;
text-align: center;
border: 1px solid transparent;
}
.highlight-border {
border-color: var(--el-color-warning);
background: var(--el-color-warning-light-9);
}
.compare-title {
font-weight: bold;
margin-bottom: 10px;
font-size: 0.9em;
}
.compare-content {
.flow-divider {
display: flex;
align-items: center;
gap: 16px;
margin: 16px 0;
}
.divider-line {
flex: 1;
height: 1px;
background: var(--vp-c-divider);
}
.divider-label {
font-size: 0.875rem;
color: var(--vp-c-text-3);
white-space: nowrap;
}
.bitrate-comparison {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.comparison-title {
font-weight: 500;
margin-bottom: 16px;
text-align: center;
}
.bitrate-cards {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 16px;
}
.bitrate-card {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 16px;
text-align: center;
cursor: pointer;
transition: all 0.2s;
border: 2px solid transparent;
}
.bitrate-card:hover {
border-color: var(--vp-c-brand);
}
.bitrate-card.active {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.bitrate-value {
font-size: 1.5rem;
font-weight: 600;
color: var(--vp-c-brand);
margin-bottom: 4px;
}
.bitrate-name {
font-weight: 500;
margin-bottom: 12px;
}
.bitrate-detail {
font-size: 0.75rem;
color: var(--vp-c-text-3);
margin-bottom: 12px;
}
.detail-item {
display: flex;
justify-content: space-between;
padding: 2px 0;
}
.detail-item .label {
color: var(--vp-c-text-2);
}
.token-visualization {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.viz-title {
font-weight: 500;
margin-bottom: 16px;
text-align: center;
}
.token-display {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 16px;
overflow-x: auto;
}
.token-ruler {
display: flex;
gap: 8px;
margin-bottom: 8px;
font-size: 0.7rem;
color: var(--vp-c-text-3);
}
.ruler-mark {
min-width: 30px;
}
.token-stream {
display: flex;
gap: 2px;
align-items: flex-end;
height: 60px;
}
.token-block {
flex: 1;
min-width: 8px;
border-radius: 2px;
transition: all 0.2s;
}
.token-block:hover {
transform: scaleY(1.2);
z-index: 1;
}
.token-legend {
display: flex;
gap: 5px;
justify-content: center;
flex-wrap: wrap;
gap: 24px;
margin-top: 16px;
}
.legend-item {
display: flex;
align-items: center;
gap: 8px;
font-size: 0.875rem;
}
.legend-color {
width: 16px;
height: 16px;
border-radius: 4px;
}
.applications {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.apps-title {
font-weight: 500;
margin-bottom: 16px;
text-align: center;
}
.apps-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 16px;
}
.app-card {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 16px;
text-align: center;
}
.app-icon {
font-size: 2rem;
margin-bottom: 8px;
}
.app-title {
font-weight: 600;
margin-bottom: 8px;
}
.app-desc {
font-size: 0.8rem;
color: var(--vp-c-text-3);
line-height: 1.5;
}
.info-box {
margin-top: 16px;
padding: 12px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
font-size: 0.9rem;
line-height: 1.6;
}
.icon {
font-size: 1.2em;
}
@media (max-width: 640px) {
.flow-steps {
flex-direction: column;
}
.flow-steps.reverse {
flex-direction: column;
}
.flow-arrow {
transform: rotate(90deg);
}
}
</style>
@@ -0,0 +1,533 @@
<!--
EmotionControlDemo.vue
情感控制演示组件
用途
展示如何在 TTS 中控制情感语速语调等风格特征
交互功能
- 情感选择器
- 语速和音调滑块
- 实时预览
- 情感向量可视化
-->
<template>
<div class="emotion-control-demo">
<el-card shadow="never">
<template #header>
<div class="header-title">
<el-icon><MagicStick /></el-icon>
<span>🎭 情感与风格控制</span>
</div>
</template>
<div class="demo-content">
<!-- 情感选择 -->
<div class="emotion-selector">
<div class="selector-title">选择情感风格</div>
<div class="emotion-grid">
<!-- Each card behaves as a toggle button: it is focusable, exposes its pressed state,
     and can be activated with Enter or Space as well as the mouse. -->
<div
  v-for="emotion in emotions"
  :key="emotion.id"
  class="emotion-card"
  :class="{ active: selectedEmotion === emotion.id }"
  role="button"
  tabindex="0"
  :aria-pressed="selectedEmotion === emotion.id"
  @click="selectEmotion(emotion.id)"
  @keydown.enter.prevent="selectEmotion(emotion.id)"
  @keydown.space.prevent="selectEmotion(emotion.id)"
>
  <div class="emotion-emoji" aria-hidden="true">{{ emotion.emoji }}</div>
  <div class="emotion-name">{{ emotion.name }}</div>
  <div class="emotion-desc">{{ emotion.description }}</div>
</div>
</div>
</div>
<!-- 情感向量可视化 -->
<div class="emotion-embedding">
<div class="embedding-title">情感向量空间 (Emotion Embedding)</div>
<canvas
ref="emotionCanvas"
width="400"
height="200"
class="emotion-canvas"
/>
<div class="embedding-legend">
<span
v-for="emotion in emotions"
:key="emotion.id"
class="legend-item"
>
<span
class="legend-dot"
:style="{ background: emotion.color }"
/>
{{ emotion.name }}
</span>
</div>
</div>
<!-- 参数控制 -->
<div class="parameter-controls">
<div class="control-title">🎚 细粒度控制</div>
<div class="controls-grid">
<div class="control-item">
  <div class="control-label">
    <span>语速</span>
    <el-tag size="small">{{ speed }}x</el-tag>
  </div>
  <el-slider v-model="speed" :min="0.5" :max="2" :step="0.1" />
  <!-- Both ends of the scale are labelled, matching the other controls. -->
  <div class="control-hint">
    <span>慢</span>
    <span>正常</span>
    <span>快</span>
  </div>
</div>
<div class="control-item">
  <div class="control-label">
    <span>音调</span>
    <el-tag size="small">{{ pitch > 0 ? '+' : '' }}{{ pitch }}</el-tag>
  </div>
  <el-slider v-model="pitch" :min="-10" :max="10" :step="1" />
  <!-- Both ends of the scale are labelled, matching the other controls. -->
  <div class="control-hint">
    <span>低</span>
    <span>正常</span>
    <span>高</span>
  </div>
</div>
<div class="control-item">
<div class="control-label">
<span>音量动态</span>
<el-tag size="small">{{ energy }}%</el-tag>
</div>
<el-slider v-model="energy" :min="50" :max="150" :step="5" />
<div class="control-hint">
<span>柔和</span>
<span>适中</span>
<span>激昂</span>
</div>
</div>
<div class="control-item">
<div class="control-label">
<span>停顿控制</span>
<el-tag size="small">{{ pause }}ms</el-tag>
</div>
<el-slider v-model="pause" :min="0" :max="500" :step="50" />
<div class="control-hint">
<span>紧凑</span>
<span>自然</span>
<span>舒缓</span>
</div>
</div>
</div>
</div>
<!-- 文本输入和预览 -->
<div class="preview-section">
<div class="preview-title">🎙 预览合成</div>
<el-input
v-model="previewText"
type="textarea"
:rows="2"
placeholder="输入要合成的文本..."
class="preview-input"
/>
<div class="preview-actions">
<el-button type="primary" @click="synthesize">
<el-icon><VideoPlay /></el-icon>
合成预览
</el-button>
<el-button @click="resetParameters">
<el-icon><RefreshRight /></el-icon>
重置参数
</el-button>
</div>
</div>
<!-- 技术说明 -->
<div class="tech-explanation">
<el-collapse>
<el-collapse-item title="🔬 情感控制原理">
<div class="tech-content">
<h4>全局风格 Token (Global Style Token)</h4>
<p>
GST (Global Style Token) 是一种从参考音频中提取风格特征的方法。模型学习将情感、语速、语调等风格信息编码成一组 Token,
在推理时可以通过选择或插值这些 Token 来控制合成风格。
</p>
<h4>参考音频编码</h4>
<p>
用户提供一段带有目标情感的参考音频,编码器提取其风格特征向量。这个向量作为条件输入到 TTS 模型,
指导生成相似风格的语音。
</p>
<h4>细粒度控制</h4>
<p>
现代 TTS 模型(如 CosyVoice、F5-TTS)支持细粒度的风格控制,包括:
</p>
<ul>
<li><strong>速度控制:</strong>调整音频播放速度而不改变音调</li>
<li><strong>音调控制:</strong>改变基频 (F0) 曲线</li>
<li><strong>能量控制:</strong>调整音量包络</li>
<li><strong>停顿控制:</strong>调整句间和短语间的停顿长度</li>
</ul>
</div>
</el-collapse-item>
</el-collapse>
</div>
</div>
<div class="info-box">
<p>
<span class="icon">💡</span>
<strong>情感控制:</strong>
现代 TTS 系统不仅能合成自然的语音,还能精确控制情感、语速、语调等风格特征。这使得 AI 配音可以适应不同的应用场景,从平静的客服对话到激昂的演讲。
</p>
</div>
</el-card>
</div>
</template>
<script setup>
import { ref, onMounted, watch } from 'vue'
import { MagicStick, VideoPlay, RefreshRight } from '@element-plus/icons-vue'
// Selectable emotion presets. `color` paints both the legend dot in the template and the
// point drawn for that emotion on the canvas.
const emotions = [
{ id: 'neutral', name: '中性', emoji: '😐', description: '平稳自然', color: '#909399' },
{ id: 'happy', name: '开心', emoji: '😊', description: '轻快愉悦', color: '#67c23a' },
{ id: 'sad', name: '悲伤', emoji: '😢', description: '低沉缓慢', color: '#409eff' },
{ id: 'angry', name: '愤怒', emoji: '😠', description: '激昂有力', color: '#f56c6c' },
{ id: 'excited', name: '兴奋', emoji: '🤩', description: '热情高涨', color: '#e6a23c' },
{ id: 'calm', name: '平静', emoji: '😌', description: '舒缓放松', color: '#13c2c2' }
]
// Id of the currently highlighted emotion (one of the `id` values above).
const selectedEmotion = ref('neutral')
// Slider-backed parameters; the template renders them as "x", a signed number, "%" and "ms".
const speed = ref(1.0)
const pitch = ref(0)
const energy = ref(100)
const pause = ref(150)
// Text bound to the preview textarea.
const previewText = ref('这是一段带有情感控制的语音合成演示。')
// Template ref to the <canvas> that drawEmotionEmbedding paints on.
const emotionCanvas = ref(null)
// Select an emotion preset by id.
// No explicit repaint is needed: this script registers a watcher on `selectedEmotion`
// that redraws the canvas, so drawing here as well painted every selection twice.
const selectEmotion = (id) => {
  selectedEmotion.value = id
}
// Restore every slider and the selected emotion to the initial values.
// The canvas is repainted by the `selectedEmotion` watcher when the selection actually
// changes; if it was already 'neutral' the picture is unchanged, so no explicit draw.
const resetParameters = () => {
  speed.value = 1.0
  pitch.value = 0
  energy.value = 100
  pause.value = 150
  selectedEmotion.value = 'neutral'
}
// Simulated synthesis: this demo has no TTS backend, so the request is only logged.
// The logged request now carries the text itself, and blank input is ignored.
const synthesize = () => {
  const text = String(previewText.value ?? '').trim()
  // Nothing to synthesize for empty or whitespace-only input.
  if (!text) return
  console.log('Synthesizing with:', {
    text,
    emotion: selectedEmotion.value,
    speed: speed.value,
    pitch: pitch.value,
    energy: energy.value,
    pause: pause.value
  })
}
// Draw the emotion space on the canvas: a valence (x) / arousal (y) plane with one
// labelled dot per emotion. The selected emotion gets a larger dot, an outer ring and
// a bold label.
const drawEmotionEmbedding = () => {
  const canvas = emotionCanvas.value
  if (!canvas) return // canvas is not mounted yet
  const ctx = canvas.getContext('2d')
  const { width, height } = canvas
  ctx.clearRect(0, 0, width, height)

  // Plot area that normalized [0, 1] coordinates are mapped into. The margins leave
  // room for the axis captions and the point labels.
  const plotLeft = 50
  const plotBottom = height - 40
  const plotWidth = width - 80
  const plotHeight = height - 60
  const toX = (valence) => plotLeft + valence * plotWidth
  const toY = (arousal) => plotBottom - arousal * plotHeight // canvas y grows downwards

  // The axes cross where (0.5, 0.5) is plotted, so the neutral emotion sits exactly
  // on the origin instead of being offset from lines drawn at the canvas centre.
  const originX = toX(0.5)
  const originY = toY(0.5)

  ctx.strokeStyle = '#e0e0e0'
  ctx.lineWidth = 1
  // X axis (valence: negative -> positive)
  ctx.beginPath()
  ctx.moveTo(40, originY)
  ctx.lineTo(width - 20, originY)
  ctx.stroke()
  // Y axis (arousal: calm -> excited)
  ctx.beginPath()
  ctx.moveTo(originX, height - 30)
  ctx.lineTo(originX, 20)
  ctx.stroke()

  // Axis captions; the vertical one is drawn in a rotated coordinate system.
  ctx.fillStyle = '#666'
  ctx.font = '12px sans-serif'
  ctx.textAlign = 'center'
  ctx.fillText('Valence (消极 → 积极)', originX, height - 10)
  ctx.save()
  ctx.translate(15, originY)
  ctx.rotate(-Math.PI / 2)
  ctx.fillText('Arousal (平静 → 兴奋)', 0, 0)
  ctx.restore()

  // Normalized position of every emotion in the plane.
  const emotionPositions = {
    neutral: { x: 0.5, y: 0.5 },
    happy: { x: 0.8, y: 0.7 },
    sad: { x: 0.2, y: 0.3 },
    angry: { x: 0.3, y: 0.9 },
    excited: { x: 0.9, y: 0.9 },
    calm: { x: 0.6, y: 0.2 }
  }

  emotions.forEach(emotion => {
    const pos = emotionPositions[emotion.id]
    // An emotion without a configured position is skipped rather than crashing the draw.
    if (!pos) return
    const isSelected = emotion.id === selectedEmotion.value
    const x = toX(pos.x)
    const y = toY(pos.y)

    // Dot
    ctx.beginPath()
    ctx.arc(x, y, isSelected ? 12 : 8, 0, Math.PI * 2)
    ctx.fillStyle = emotion.color
    ctx.fill()

    // Selection ring
    if (isSelected) {
      ctx.strokeStyle = emotion.color
      ctx.lineWidth = 2
      ctx.beginPath()
      ctx.arc(x, y, 18, 0, Math.PI * 2)
      ctx.stroke()
    }

    // Label below the dot
    ctx.fillStyle = '#333'
    ctx.font = isSelected ? 'bold 12px sans-serif' : '12px sans-serif'
    ctx.textAlign = 'center'
    ctx.fillText(emotion.name, x, y + 25)
  })
}
onMounted(drawEmotionEmbedding)
watch(selectedEmotion, drawEmotionEmbedding)
</script>
<style scoped>
.emotion-control-demo {
margin: 1rem 0;
}
.header-title {
display: flex;
align-items: center;
gap: 8px;
font-weight: 600;
}
.demo-content {
display: flex;
flex-direction: column;
gap: 24px;
}
.emotion-selector {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.selector-title {
font-weight: 500;
margin-bottom: 16px;
}
.emotion-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 12px;
}
.emotion-card {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 16px;
text-align: center;
cursor: pointer;
transition: all 0.2s;
border: 2px solid transparent;
}
.emotion-card:hover {
border-color: var(--vp-c-brand);
transform: translateY(-2px);
}
.emotion-card.active {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.emotion-emoji {
font-size: 2rem;
margin-bottom: 8px;
}
.emotion-name {
font-weight: 600;
margin-bottom: 4px;
}
.emotion-desc {
font-size: 0.75rem;
color: var(--vp-c-text-3);
}
.emotion-embedding {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.embedding-title {
font-weight: 500;
margin-bottom: 16px;
text-align: center;
}
/* The canvas bitmap is 400x200. Cap the width instead of the height so the element
   scales down on narrow screens but is never stretched out of its 2:1 aspect ratio
   (width: 100% together with max-height: 200px distorted it in wide containers). */
.emotion-canvas {
  display: block;
  width: 100%;
  max-width: 400px;
  height: auto;
  margin: 0 auto;
  background: var(--vp-c-bg);
  border-radius: 8px;
}
.embedding-legend {
display: flex;
justify-content: center;
flex-wrap: wrap;
gap: 16px;
margin-top: 16px;
}
.legend-item {
display: flex;
align-items: center;
gap: 6px;
font-size: 0.875rem;
}
.legend-dot {
width: 12px;
height: 12px;
border-radius: 50%;
}
.parameter-controls {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.control-title {
font-weight: 500;
margin-bottom: 16px;
}
.controls-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 24px;
}
.control-item {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 16px;
}
.control-label {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
}
.control-hint {
display: flex;
justify-content: space-between;
margin-top: 8px;
font-size: 0.75rem;
color: var(--vp-c-text-3);
}
.preview-section {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.preview-title {
font-weight: 500;
margin-bottom: 16px;
}
.preview-input {
margin-bottom: 16px;
}
.preview-actions {
display: flex;
gap: 12px;
}
.tech-explanation {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 20px;
}
.tech-content h4 {
margin: 16px 0 8px 0;
color: var(--vp-c-brand);
}
.tech-content h4:first-child {
margin-top: 0;
}
.tech-content p {
margin: 0 0 12px 0;
line-height: 1.6;
color: var(--vp-c-text-2);
}
.tech-content ul {
margin: 0;
padding-left: 20px;
color: var(--vp-c-text-2);
}
.tech-content li {
margin-bottom: 8px;
line-height: 1.5;
}
.info-box {
margin-top: 16px;
padding: 12px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
font-size: 0.9rem;
line-height: 1.6;
}
.icon {
font-size: 1.2em;
}
</style>
@@ -0,0 +1,567 @@
<!--
MelSpectrogramDemo.vue
梅尔频谱图交互演示组件
用途
让用户直观理解音频如何从波形转换为梅尔频谱图以及梅尔刻度的原理
交互功能
- 选择不同音频类型语音/音乐/噪声
- 实时查看波形和频谱对比
- 调整 FFT 参数观察变化
- 理解梅尔刻度 vs 线性刻度
-->
<template>
<div class="mel-spec-demo">
<div class="header">
<div class="title">📊 梅尔频谱AI 如何"看懂"声音</div>
<div class="subtitle">
声音是波,AI 看到的是频谱图。探索波形如何变成 AI 能理解的"图像"。
</div>
</div>
<div class="control-panel">
<div class="audio-types">
<button
v-for="type in audioTypes"
:key="type.id"
@click="selectType(type.id)"
class="type-btn"
:class="{ active: selectedType === type.id }"
>
<span class="type-icon">{{ type.icon }}</span>
<span>{{ type.name }}</span>
</button>
</div>
<div class="param-controls">
<!-- Each label is tied to its slider via for/id, and `.number` keeps the bound refs
     numeric (a plain v-model on a range input writes strings back). -->
<div class="param">
  <label for="mel-fft-size">FFT 窗口</label>
  <input
    id="mel-fft-size"
    type="range"
    v-model.number="fftSize"
    min="256"
    max="2048"
    step="256"
  />
  <span class="value">{{ fftSize }}</span>
</div>
<div class="param">
  <label for="mel-bins">梅尔滤波器</label>
  <input
    id="mel-bins"
    type="range"
    v-model.number="melBins"
    min="20"
    max="128"
    step="4"
  />
  <span class="value">{{ melBins }}</span>
</div>
</div>
</div>
<div class="visualization">
<!-- 波形图 -->
<div class="viz-section">
<div class="viz-header">
<span class="viz-title">🔊 波形 (时域)</span>
<span class="viz-desc">原始音频振幅随时间变化</span>
</div>
<canvas ref="waveformCanvas" width="600" height="100"></canvas>
</div>
<!-- The arrow is decorative (the caption already names the step), so it is hidden
     from assistive technology. -->
<div class="transform-arrow">
  <span>STFT 变换</span>
  <span class="arrow" aria-hidden="true">↓</span>
</div>
<!-- 频谱对比 -->
<div class="spec-comparison">
<div class="viz-section">
<div class="viz-header">
<span class="viz-title">📈 线性频谱</span>
<span class="viz-tag">高频分辨率低</span>
</div>
<canvas ref="linearCanvas" width="280" height="150"></canvas>
</div>
<div class="vs">VS</div>
<div class="viz-section highlight">
<div class="viz-header">
<span class="viz-title">🎯 梅尔频谱</span>
<span class="viz-tag success">符合人耳感知</span>
</div>
<canvas ref="melCanvas" width="280" height="150"></canvas>
</div>
</div>
</div>
<div class="explanation">
<div class="exp-title">🎧 为什么用梅尔刻度</div>
<div class="exp-content">
<div class="exp-item">
<div class="exp-visual">
<div class="freq-bars human">
<div class="bar" style="height: 80%"></div>
<div class="bar" style="height: 60%"></div>
<div class="bar" style="height: 40%"></div>
<div class="bar" style="height: 20%"></div>
</div>
</div>
<div class="exp-text">
<strong>人耳感知</strong><br>
100Hz→200Hz 听起来差别明显,而 10000Hz→10100Hz 几乎听不出区别
</div>
</div>
<div class="exp-item">
<div class="exp-visual">
<div class="freq-bars linear">
<div class="bar" style="height: 10%"></div>
<div class="bar" style="height: 20%"></div>
<div class="bar" style="height: 70%"></div>
<div class="bar" style="height: 90%"></div>
</div>
</div>
<div class="exp-text">
<strong>线性刻度</strong><br>
等距频率间隔不符合人耳感知
</div>
</div>
</div>
</div>
<div class="info-box">
<span class="icon">💡</span>
<p>
<strong>梅尔频谱原理:</strong>
梅尔刻度模拟了人耳对频率的非线性感知:人耳对低频变化更敏感,对高频变化较迟钝。
梅尔频谱将频率映射到梅尔刻度,使 AI 更关注人耳敏感的部分。
</p>
</div>
</div>
</template>
<script setup>
import { ref, onMounted, watch } from 'vue'
// Audio presets offered by the type selector buttons.
const audioTypes = [
{ id: 'speech', name: '语音', icon: '🗣️' },
{ id: 'music', name: '音乐', icon: '🎵' },
{ id: 'noise', name: '噪声', icon: '📢' }
]
// Id of the currently selected audio preset.
const selectedType = ref('speech')
// Bound to the "FFT" slider. NOTE(review): it is watched to trigger a repaint, but no
// drawing code in this script reads its value.
const fftSize = ref(1024)
// Number of frequency rows generated for the mel spectrogram.
const melBins = ref(80)
// Template refs to the three canvases (waveform, linear spectrogram, mel spectrogram).
const waveformCanvas = ref(null)
const linearCanvas = ref(null)
const melCanvas = ref(null)
// Remember which audio preset is selected; the watcher on `selectedType` repaints.
function selectType(type) {
  selectedType.value = type
}
// Produce 600 mock amplitude samples for the given preset:
// 'speech' mixes three sines with a little noise, 'music' mixes three sines,
// and any other value yields pure random noise.
function generateWaveform(type) {
  const SAMPLE_COUNT = 600

  const sampleAt = (index) => {
    const t = index / SAMPLE_COUNT
    switch (type) {
      case 'speech':
        return Math.sin(t * 20 * Math.PI) * 0.3 +
          Math.sin(t * 50 * Math.PI) * 0.2 +
          Math.sin(t * 120 * Math.PI) * 0.15 +
          (Math.random() - 0.5) * 0.1
      case 'music':
        return Math.sin(t * 10 * Math.PI) * 0.4 +
          Math.sin(t * 25 * Math.PI) * 0.3 +
          Math.sin(t * 40 * Math.PI) * 0.2
      default:
        return (Math.random() - 0.5) * 0.8
    }
  }

  return Array.from({ length: SAMPLE_COUNT }, (_, index) => sampleAt(index))
}
// Paint the time-domain waveform of the selected preset, then a thin horizontal
// centre line across the canvas.
function drawWaveform() {
  const canvas = waveformCanvas.value
  if (!canvas) return
  const ctx = canvas.getContext('2d')
  const { width, height } = canvas
  ctx.clearRect(0, 0, width, height)

  const samples = generateWaveform(selectedType.value)
  const midY = height / 2

  // Waveform polyline
  ctx.strokeStyle = '#409eff'
  ctx.lineWidth = 2
  ctx.beginPath()
  samples.forEach((sample, index) => {
    const x = (index / samples.length) * width
    const y = midY + sample * height * 0.4
    if (index === 0) {
      ctx.moveTo(x, y)
    } else {
      ctx.lineTo(x, y)
    }
  })
  ctx.stroke()

  // Centre line
  ctx.strokeStyle = '#e0e0e0'
  ctx.lineWidth = 1
  ctx.beginPath()
  ctx.moveTo(0, midY)
  ctx.lineTo(width, midY)
  ctx.stroke()
}
// Build mock spectrogram data: 60 time frames, each an array of per-frequency
// intensities. The linear variant always has 80 bins; the mel variant uses `melBins`
// and additionally attenuates higher bins.
function generateSpectrogram(isMel = false) {
  const FRAME_COUNT = 60
  const binCount = isMel ? melBins.value : 80
  const kind = selectedType.value

  // Intensity of one cell; consumes exactly one Math.random() value per cell.
  const cellValue = (bin) => {
    const normalizedF = bin / binCount
    let level
    if (kind === 'speech') {
      const formant1 = Math.exp(-Math.pow(normalizedF - 0.1, 2) / 0.01)
      const formant2 = Math.exp(-Math.pow(normalizedF - 0.3, 2) / 0.02)
      level = (formant1 + formant2 * 0.7) * (0.8 + Math.random() * 0.2)
    } else if (kind === 'music') {
      level = Math.sin(normalizedF * Math.PI * 3) * 0.5 + 0.5
      level *= (0.7 + Math.random() * 0.3)
    } else {
      level = Math.random() * 0.5
    }
    if (isMel) {
      level *= (1 - normalizedF * 0.3)
    }
    return level
  }

  return Array.from({ length: FRAME_COUNT }, () =>
    Array.from({ length: binCount }, (_, bin) => cellValue(bin))
  )
}
// Paint spectrogram data as a heat map.
//   canvas - target <canvas> element; a missing canvas is ignored
//   data   - array of time frames, each an array of intensities (0..1 expected);
//            frame t is drawn as column t, bin 0 at the bottom
// Empty data (no frames, or frames without bins) just clears the canvas instead of
// throwing on `data[0].length`.
const drawSpectrogram = (canvas, data) => {
  if (!canvas) return
  const ctx = canvas.getContext('2d')
  const width = canvas.width
  const height = canvas.height
  ctx.clearRect(0, 0, width, height)

  if (!data || data.length === 0 || !data[0] || data[0].length === 0) return

  const cellWidth = width / data.length
  const cellHeight = height / data[0].length
  for (let t = 0; t < data.length; t++) {
    for (let f = 0; f < data[t].length; f++) {
      const value = data[t][f]
      // Orange colour ramp: full red, 60% green, 20% blue of the intensity.
      const intensity = Math.floor(value * 255)
      const r = intensity
      const g = Math.floor(intensity * 0.6)
      const b = Math.floor(intensity * 0.2)
      ctx.fillStyle = `rgb(${r}, ${g}, ${b})`
      // Cells are drawn 1px larger than the grid so rounding leaves no gaps.
      ctx.fillRect(
        t * cellWidth,
        height - (f + 1) * cellHeight,
        cellWidth + 1,
        cellHeight + 1
      )
    }
  }
}
// Repaint all three canvases from freshly generated mock data.
function updateVisualization() {
  drawWaveform()

  const linearFrames = generateSpectrogram(false)
  drawSpectrogram(linearCanvas.value, linearFrames)

  const melFrames = generateSpectrogram(true)
  drawSpectrogram(melCanvas.value, melFrames)
}

// Paint once the canvases exist, then again whenever a control changes.
onMounted(updateVisualization)
watch([selectedType, fftSize, melBins], updateVisualization)
</script>
<style scoped>
.mel-spec-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 24px 0;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.header {
text-align: center;
margin-bottom: 24px;
}
/* Gradient-filled heading text. The standard `background-clip` is declared next to
   the prefixed one so the effect does not rely on the vendor prefix alone. */
.title {
  font-size: 18px;
  font-weight: 700;
  margin-bottom: 8px;
  background: linear-gradient(120deg, #409eff, #67c23a);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
.subtitle {
font-size: 14px;
color: var(--vp-c-text-2);
}
.control-panel {
display: flex;
flex-wrap: wrap;
gap: 20px;
margin-bottom: 24px;
padding: 16px;
background: var(--vp-c-bg);
border-radius: 8px;
}
.audio-types {
display: flex;
gap: 10px;
flex-wrap: wrap;
}
.type-btn {
padding: 10px 16px;
border: 1px solid var(--vp-c-divider);
border-radius: 20px;
background: var(--vp-c-bg);
cursor: pointer;
display: flex;
align-items: center;
gap: 6px;
font-size: 13px;
transition: all 0.2s;
}
.type-btn:hover {
border-color: var(--vp-c-brand);
}
.type-btn.active {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
.param-controls {
display: flex;
gap: 20px;
flex-wrap: wrap;
flex: 1;
justify-content: flex-end;
}
.param {
display: flex;
align-items: center;
gap: 8px;
}
.param label {
font-size: 12px;
color: var(--vp-c-text-2);
}
.param input[type="range"] {
width: 100px;
}
.param .value {
font-size: 12px;
font-family: monospace;
min-width: 40px;
}
.visualization {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.viz-section {
margin-bottom: 16px;
}
.viz-section.highlight {
border: 2px solid #67c23a;
border-radius: 8px;
padding: 12px;
}
.viz-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
}
.viz-title {
font-weight: 600;
font-size: 14px;
}
.viz-desc {
font-size: 12px;
color: var(--vp-c-text-3);
}
.viz-tag {
font-size: 11px;
padding: 4px 8px;
background: #e6a23c33;
color: #e6a23c;
border-radius: 4px;
}
.viz-tag.success {
background: #67c23a33;
color: #67c23a;
}
.viz-section canvas {
width: 100%;
height: auto;
background: #f5f5f5;
border-radius: 6px;
}
.transform-arrow {
text-align: center;
padding: 12px;
color: var(--vp-c-text-3);
font-size: 13px;
display: flex;
flex-direction: column;
align-items: center;
gap: 4px;
}
.transform-arrow .arrow {
font-size: 20px;
}
.spec-comparison {
display: grid;
grid-template-columns: 1fr auto 1fr;
gap: 16px;
align-items: center;
}
.vs {
font-weight: 600;
color: var(--vp-c-text-3);
}
.explanation {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.exp-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
.exp-content {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 24px;
}
.exp-item {
display: flex;
flex-direction: column;
align-items: center;
gap: 12px;
text-align: center;
}
.freq-bars {
display: flex;
align-items: flex-end;
gap: 8px;
height: 80px;
padding: 10px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
}
.freq-bars .bar {
width: 30px;
border-radius: 4px 4px 0 0;
}
.freq-bars.human .bar {
background: linear-gradient(to top, #409eff, #67c23a);
}
.freq-bars.linear .bar {
background: linear-gradient(to top, #e6a23c, #f56c6c);
}
.exp-text {
font-size: 13px;
line-height: 1.5;
color: var(--vp-c-text-2);
}
.info-box {
display: flex;
gap: 12px;
padding: 16px;
background: var(--vp-c-bg-mute);
border-radius: 8px;
font-size: 13px;
line-height: 1.6;
}
.info-box .icon {
font-size: 18px;
flex-shrink: 0;
}
@media (max-width: 640px) {
.spec-comparison {
grid-template-columns: 1fr;
}
.vs {
transform: rotate(90deg);
}
}
</style>
@@ -0,0 +1,588 @@
<!--
TTSPipelineDemo.vue
TTS 流程演示组件
用途
展示文本转语音的完整流程对比不同架构自回归/非自回归/流匹配
-->
<template>
<div class="tts-pipeline-demo">
<div class="header">
<div class="title">🔄 TTS 架构演进从慢到快</div>
<div class="subtitle">
探索文本如何变成语音以及不同架构的优劣对比
</div>
</div>
<div class="arch-selector">
<button
v-for="arch in architectures"
:key="arch.id"
@click="selectArch(arch.id)"
class="arch-btn"
:class="{ active: selectedArch === arch.id }"
>
<span class="arch-icon">{{ arch.icon }}</span>
<span class="arch-name">{{ arch.name }}</span>
<span class="arch-tag" :class="arch.tagClass">{{ arch.tag }}</span>
</button>
</div>
<div class="pipeline-flow">
<!-- Each stage acts as a toggle button: focusable, exposes its pressed state, and can
     be activated with Enter or Space. The arrow between stages is decorative. -->
<div
  v-for="(stage, index) in currentStages"
  :key="stage.id"
  class="stage"
  :class="{ active: activeStage === index }"
  role="button"
  tabindex="0"
  :aria-pressed="activeStage === index"
  @click="activeStage = index"
  @keydown.enter.prevent="activeStage = index"
  @keydown.space.prevent="activeStage = index"
>
  <div class="stage-num">{{ index + 1 }}</div>
  <div class="stage-content">
    <div class="stage-icon" aria-hidden="true">{{ stage.icon }}</div>
    <div class="stage-name">{{ stage.name }}</div>
    <div class="stage-desc">{{ stage.shortDesc }}</div>
  </div>
  <div v-if="index < currentStages.length - 1" class="stage-arrow" aria-hidden="true">→</div>
</div>
</div>
<div class="stage-detail" v-if="currentStage">
<div class="detail-header">
<span class="detail-icon">{{ currentStage.icon }}</span>
<div>
<div class="detail-name">{{ currentStage.name }}</div>
<div class="detail-desc">{{ currentStage.description }}</div>
</div>
</div>
<div class="detail-canvas">
<canvas ref="detailCanvas" width="500" height="150"></canvas>
</div>
<div class="detail-meta">
<div class="meta-item">
<span class="label">输入:</span>
<span>{{ currentStage.input }}</span>
</div>
<div class="meta-item">
<span class="label">输出:</span>
<span>{{ currentStage.output }}</span>
</div>
<div class="meta-item">
<span class="label">技术:</span>
<span>{{ currentStage.tech }}</span>
</div>
</div>
</div>
<div class="comparison-table">
<div class="table-title">📊 架构对比</div>
<div class="table">
<div class="table-header">
<div class="cell">特性</div>
<div class="cell">自回归</div>
<div class="cell">非自回归</div>
<div class="cell">流匹配</div>
</div>
<div
v-for="row in comparisonRows"
:key="row.feature"
class="table-row"
>
<div class="cell feature">{{ row.feature }}</div>
<div class="cell" :class="{ highlight: selectedArch === 'ar' }">{{ row.ar }}</div>
<div class="cell" :class="{ highlight: selectedArch === 'nar' }">{{ row.nar }}</div>
<div class="cell" :class="{ highlight: selectedArch === 'flow' }">{{ row.flow }}</div>
</div>
</div>
</div>
<div class="models-section">
<div class="models-title">🏆 代表模型</div>
<div class="models-grid">
<div
v-for="model in models"
:key="model.name"
class="model-card"
:class="{ active: model.arch === selectedArch }"
>
<div class="model-name">{{ model.name }}</div>
<span class="model-tag" :class="model.tagClass">{{ model.type }}</span>
<div class="model-desc">{{ model.desc }}</div>
</div>
</div>
</div>
<div class="info-box">
<span class="icon">💡</span>
<p>
<strong>TTS 演进趋势:</strong>
从早期的自回归模型(Tacotron),到非自回归(FastSpeech),再到最新的流匹配模型(F5-TTS),
TTS 技术正在向更快、更稳定、更高质量的方向发展。
</p>
</div>
</div>
</template>
<script setup>
import { ref, computed, onMounted, watch } from 'vue'
// Architectures offered by the selector. `tagClass` picks the colour variant of the
// small tag rendered next to the name.
const architectures = [
{ id: 'ar', name: '自回归', icon: '📝', tag: 'AR', tagClass: 'primary' },
{ id: 'nar', name: '非自回归', icon: '⚡', tag: 'NAR', tagClass: 'success' },
{ id: 'flow', name: '流匹配', icon: '🌊', tag: 'Flow', tagClass: 'warning' }
]
// Ordered pipeline stages per architecture id. `shortDesc` is shown on the stage card;
// `description`, `input`, `output` and `tech` are shown in the detail panel. The stage
// `id` also selects which illustration drawVisualization paints.
const pipelineStages = {
ar: [
{ id: 'text', name: '文本处理', icon: '📝', shortDesc: '分词 & 音素', description: '将输入文本转换为音素序列', input: '原始文本', output: '音素序列', tech: 'G2P' },
{ id: 'encoder', name: '文本编码', icon: '🔢', shortDesc: '提取特征', description: '使用 Encoder 编码文本', input: '音素序列', output: '文本特征', tech: 'Transformer' },
{ id: 'decoder', name: '自回归解码', icon: '🎯', shortDesc: '逐帧生成', description: '逐个时间步生成梅尔频谱', input: '文本特征', output: '梅尔频谱', tech: 'AR Decoder' },
{ id: 'vocoder', name: '声码器', icon: '🔊', shortDesc: '频谱转波形', description: '将频谱转换为音频波形', input: '梅尔频谱', output: '音频波形', tech: 'HiFi-GAN' }
],
nar: [
{ id: 'text', name: '文本处理', icon: '📝', shortDesc: '分词 & 音素', description: '将输入文本转换为音素序列', input: '原始文本', output: '音素序列', tech: 'G2P' },
{ id: 'duration', name: '时长预测', icon: '⏱️', shortDesc: '预测时长', description: '预测每个音素的帧数', input: '音素序列', output: '时长信息', tech: 'Duration Predictor' },
{ id: 'decoder', name: '并行解码', icon: '⚡', shortDesc: '一次性生成', description: '并行生成完整梅尔频谱', input: '文本特征', output: '梅尔频谱', tech: 'Non-AR Transformer' },
{ id: 'vocoder', name: '声码器', icon: '🔊', shortDesc: '频谱转波形', description: '将频谱转换为音频波形', input: '梅尔频谱', output: '音频波形', tech: 'HiFi-GAN' }
],
flow: [
{ id: 'text', name: '文本处理', icon: '📝', shortDesc: '分词 & 音素', description: '将输入文本转换为音素序列', input: '原始文本', output: '音素序列', tech: 'G2P' },
{ id: 'embedding', name: '文本嵌入', icon: '🔢', shortDesc: '特征提取', description: '将音素转换为向量', input: '音素序列', output: '文本嵌入', tech: 'DiT' },
{ id: 'flow', name: '流匹配', icon: '🌊', shortDesc: '最优传输', description: '使用流匹配生成频谱', input: '文本嵌入', output: '梅尔频谱', tech: 'Flow Matching' },
{ id: 'vocoder', name: '声码器', icon: '🔊', shortDesc: '频谱转波形', description: '将频谱转换为音频波形', input: '梅尔频谱', output: '音频波形', tech: 'Vocoder' }
]
}
// Rows of the comparison table; the `ar` / `nar` / `flow` keys match the architecture ids.
const comparisonRows = [
{ feature: '生成速度', ar: '慢', nar: '快', flow: '很快' },
{ feature: '音质', ar: '高', nar: '中高', flow: '高' },
{ feature: '稳定性', ar: '中', nar: '高', flow: '高' },
{ feature: '可控性', ar: '中', nar: '高', flow: '高' }
]
// Representative models shown as cards. `arch` links a card to an architecture id so the
// cards of the selected architecture are highlighted; `tagClass` picks the tag colour.
const models = [
  { name: 'Tacotron 2', arch: 'ar', type: 'AR', tagClass: 'primary', desc: '经典 AR 模型,音质优秀' },
  { name: 'FastSpeech 2', arch: 'nar', type: 'NAR', tagClass: 'success', desc: '并行生成,速度快' },
  // The separator between "SOTA" and "10" was missing, which read as "SOTA10".
  { name: 'F5-TTS', arch: 'flow', type: 'Flow', tagClass: 'warning', desc: '最新 SOTA,10 步生成' },
  { name: 'CosyVoice', arch: 'flow', type: 'Flow', tagClass: 'warning', desc: '阿里开源,支持多语言' }
]
// Id of the selected architecture; the flow-matching pipeline is shown first.
const selectedArch = ref('flow')
// Index of the highlighted stage within the current pipeline.
const activeStage = ref(0)
// Template ref to the stage-detail <canvas>.
const detailCanvas = ref(null)
// Stage list of the selected architecture, and the stage at the active index
// (undefined when the index is outside the list).
const currentStages = computed(() => pipelineStages[selectedArch.value])
const currentStage = computed(() => currentStages.value[activeStage.value])
// Switch to another architecture and jump back to the first stage of its pipeline.
function selectArch(id) {
  activeStage.value = 0
  selectedArch.value = id
}
// Paint the illustration for the currently selected pipeline stage on the detail canvas.
// Dedicated drawings exist for text processing, the autoregressive decoder and flow
// matching; every other stage gets a generic "input -> output" diagram instead of a
// blank canvas.
const drawVisualization = () => {
  const canvas = detailCanvas.value
  if (!canvas) return
  const ctx = canvas.getContext('2d')
  const w = canvas.width
  const h = canvas.height
  ctx.clearRect(0, 0, w, h)
  const stage = currentStage.value
  if (!stage) return

  // A 2D context keeps font and line settings between calls. Start every repaint from
  // the same explicit state so the result does not depend on which stage was drawn
  // before, and restore the context afterwards.
  ctx.save()
  ctx.font = '16px sans-serif'
  ctx.lineWidth = 2
  ctx.textAlign = 'start'

  if (stage.id === 'text') {
    // Text -> phonemes
    ctx.fillStyle = '#333'
    ctx.fillText('"Hello"', 50, h / 2)
    ctx.strokeStyle = '#409eff'
    ctx.beginPath()
    ctx.moveTo(120, h / 2)
    ctx.lineTo(200, h / 2)
    ctx.stroke()
    const phonemes = ['h', 'ə', 'l', 'oʊ']
    phonemes.forEach((p, i) => {
      const x = 220 + i * 40
      ctx.fillStyle = `hsl(${200 + i * 30}, 70%, 50%)`
      ctx.fillRect(x, h / 2 - 15, 30, 30)
      ctx.fillStyle = '#fff'
      ctx.fillText(p, x + 8, h / 2 + 5)
    })
  } else if (stage.id === 'decoder' && selectedArch.value === 'ar') {
    // Autoregressive decoding: five frames of random bars, linked left to right.
    for (let i = 0; i < 5; i++) {
      const x = 80 + i * 80
      for (let j = 0; j < 8; j++) {
        const barH = Math.random() * 40 + 10
        ctx.fillStyle = `rgba(64, 158, 255, ${0.5 + i * 0.1})`
        ctx.fillRect(x + j * 8, h - 50 - barH, 6, barH)
      }
      if (i < 4) {
        ctx.strokeStyle = '#ccc'
        ctx.beginPath()
        ctx.moveTo(x + 70, h / 2)
        ctx.lineTo(x + 80, h / 2)
        ctx.stroke()
      }
    }
    ctx.fillStyle = '#666'
    ctx.fillText('逐个时间步生成', 50, 30)
  } else if (stage.id === 'flow') {
    // Flow matching: a wavy path from the lower left to the upper right with five
    // sampling markers on it.
    const curvePoint = (t) => ({
      x: 50 + t * (w - 100),
      y: h - 50 - t * (h - 100) + Math.sin(t * Math.PI * 4) * 20
    })
    // Integer steps guarantee the path reaches t = 1 exactly; accumulating 0.02 in a
    // float can overshoot 1 and drop the final segment before the last marker.
    const SEGMENTS = 50
    ctx.strokeStyle = '#409eff'
    ctx.lineWidth = 3
    ctx.beginPath()
    for (let i = 0; i <= SEGMENTS; i++) {
      const { x, y } = curvePoint(i / SEGMENTS)
      if (i === 0) ctx.moveTo(x, y)
      else ctx.lineTo(x, y)
    }
    ctx.stroke()
    const steps = [0, 0.25, 0.5, 0.75, 1]
    steps.forEach((t) => {
      const { x, y } = curvePoint(t)
      ctx.fillStyle = '#e6a23c'
      ctx.beginPath()
      ctx.arc(x, y, 6, 0, Math.PI * 2)
      ctx.fill()
    })
  } else {
    // Generic fallback: "<input> --<tech>--> <output>" built from the stage metadata.
    ctx.textAlign = 'center'
    ctx.fillStyle = '#333'
    ctx.fillText(stage.input, w * 0.2, h / 2 + 5)
    ctx.fillText(stage.output, w * 0.8, h / 2 + 5)
    ctx.strokeStyle = '#409eff'
    ctx.beginPath()
    ctx.moveTo(w * 0.35, h / 2)
    ctx.lineTo(w * 0.65, h / 2)
    // Arrow head
    ctx.lineTo(w * 0.65 - 8, h / 2 - 6)
    ctx.moveTo(w * 0.65, h / 2)
    ctx.lineTo(w * 0.65 - 8, h / 2 + 6)
    ctx.stroke()
    ctx.fillStyle = '#666'
    ctx.font = '12px sans-serif'
    ctx.fillText(stage.tech, w / 2, h / 2 - 12)
  }

  ctx.restore()
}
onMounted(drawVisualization)
watch([selectedArch, activeStage], drawVisualization)
</script>
<style scoped>
.tts-pipeline-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 24px 0;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.header {
text-align: center;
margin-bottom: 24px;
}
/* Gradient-filled heading text. The standard `background-clip` is declared next to
   the prefixed one so the effect does not rely on the vendor prefix alone. */
.title {
  font-size: 18px;
  font-weight: 700;
  margin-bottom: 8px;
  background: linear-gradient(120deg, #409eff, #67c23a);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
.subtitle {
font-size: 14px;
color: var(--vp-c-text-2);
}
.arch-selector {
display: flex;
gap: 12px;
margin-bottom: 24px;
flex-wrap: wrap;
justify-content: center;
}
.arch-btn {
padding: 12px 20px;
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg);
cursor: pointer;
display: flex;
align-items: center;
gap: 8px;
transition: all 0.2s;
}
.arch-btn:hover {
border-color: var(--vp-c-brand);
}
.arch-btn.active {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.arch-icon {
font-size: 20px;
}
.arch-name {
font-weight: 500;
}
.arch-tag {
font-size: 10px;
padding: 2px 6px;
border-radius: 4px;
}
.arch-tag.primary { background: #409eff33; color: #409eff; }
.arch-tag.success { background: #67c23a33; color: #67c23a; }
.arch-tag.warning { background: #e6a23c33; color: #e6a23c; }
.pipeline-flow {
display: flex;
justify-content: center;
gap: 8px;
flex-wrap: wrap;
padding: 20px;
background: var(--vp-c-bg);
border-radius: 8px;
margin-bottom: 20px;
}
.stage {
display: flex;
align-items: center;
gap: 8px;
cursor: pointer;
}
.stage-content {
background: var(--vp-c-bg-soft);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
padding: 12px 16px;
text-align: center;
transition: all 0.2s;
min-width: 100px;
}
.stage:hover .stage-content,
.stage.active .stage-content {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.stage-num {
width: 24px;
height: 24px;
background: var(--vp-c-brand);
color: white;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 12px;
font-weight: 600;
}
.stage-icon {
font-size: 24px;
margin-bottom: 4px;
}
.stage-name {
font-weight: 500;
font-size: 13px;
}
.stage-desc {
font-size: 11px;
color: var(--vp-c-text-3);
}
.stage-arrow {
color: var(--vp-c-text-3);
font-size: 20px;
}
.stage-detail {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.detail-header {
display: flex;
gap: 12px;
margin-bottom: 16px;
}
.detail-icon {
font-size: 32px;
}
.detail-name {
font-weight: 600;
margin-bottom: 4px;
}
.detail-desc {
font-size: 13px;
color: var(--vp-c-text-2);
}
.detail-canvas {
background: var(--vp-c-bg-soft);
border-radius: 8px;
margin-bottom: 16px;
}
.detail-canvas canvas {
width: 100%;
height: auto;
}
.detail-meta {
display: flex;
gap: 24px;
flex-wrap: wrap;
}
.meta-item {
font-size: 13px;
}
.meta-item .label {
color: var(--vp-c-text-3);
margin-right: 4px;
}
.comparison-table {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.table-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
.table {
display: flex;
flex-direction: column;
gap: 1px;
background: var(--vp-c-divider);
border-radius: 8px;
overflow: hidden;
}
.table-header,
.table-row {
display: grid;
grid-template-columns: 1fr 1fr 1fr 1fr;
background: var(--vp-c-bg);
}
.table-header {
font-weight: 600;
background: var(--vp-c-bg-mute);
}
.cell {
padding: 12px;
text-align: center;
font-size: 13px;
}
.cell.feature {
text-align: left;
font-weight: 500;
}
.cell.highlight {
background: rgba(64, 158, 255, 0.1);
color: var(--vp-c-brand);
font-weight: 500;
}
.models-section {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.models-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
.models-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
}
.model-card {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 16px;
text-align: center;
border: 2px solid transparent;
transition: all 0.2s;
}
.model-card.active {
border-color: var(--vp-c-brand);
}
.model-name {
font-weight: 600;
margin-bottom: 8px;
}
.model-tag {
font-size: 10px;
padding: 2px 8px;
border-radius: 4px;
display: inline-block;
margin-bottom: 8px;
}
.model-tag.primary { background: #409eff33; color: #409eff; }
.model-tag.success { background: #67c23a33; color: #67c23a; }
.model-tag.warning { background: #e6a23c33; color: #e6a23c; }
.model-desc {
font-size: 12px;
color: var(--vp-c-text-3);
}
.info-box {
display: flex;
gap: 12px;
padding: 16px;
background: var(--vp-c-bg-mute);
border-radius: 8px;
font-size: 13px;
line-height: 1.6;
}
.info-box .icon {
font-size: 18px;
flex-shrink: 0;
}
@media (max-width: 640px) {
.pipeline-flow {
flex-direction: column;
}
.stage-arrow {
transform: rotate(90deg);
}
}
</style>
@@ -0,0 +1,723 @@
<!--
VoiceCloningDemo.vue
声音克隆交互演示组件
用途
演示零样本声音克隆的原理和流程
-->
<template>
  <div class="voice-clone-demo">
    <!--
      Three-step walkthrough: (1) pick a reference voice, (2) watch the
      extraction steps animate, (3) type text and trigger generation.
    -->
    <div class="header">
      <div class="title">🎭 声音克隆：AI 模仿任何人</div>
      <div class="subtitle">
        只需几秒钟的参考音频，AI 就能学会任何人的声音
      </div>
    </div>

    <!-- Mode tabs; choosing one calls selectMode, which resets the demo. -->
    <div class="mode-tabs">
      <button
        v-for="mode in modes"
        :key="mode.id"
        type="button"
        class="mode-btn"
        :class="{ active: selectedMode === mode.id }"
        :aria-pressed="selectedMode === mode.id"
        @click="selectMode(mode.id)"
      >
        <span class="mode-icon" aria-hidden="true">{{ mode.icon }}</span>
        <span>{{ mode.name }}</span>
      </button>
    </div>

    <div class="demo-area">
      <!-- Step 1: reference audio -->
      <div class="section">
        <div class="section-title">
          <span class="num">1</span>
          提供参考音频
        </div>
        <div class="audio-grid">
          <!--
            The loop alias is `voice` rather than `ref` so it does not shadow
            Vue's `ref` inside the template scope.
            NOTE(review): the card itself is only selectable with a pointer.
            Making it keyboard-operable needs a markup change, because a
            role such as button/radio would hide the nested play button.
          -->
          <div
            v-for="voice in references"
            :key="voice.id"
            class="audio-card"
            :class="{ selected: selectedRef === voice.id }"
            @click="selectRef(voice.id)"
          >
            <div class="audio-avatar" aria-hidden="true">{{ voice.avatar }}</div>
            <div class="audio-name">{{ voice.name }}</div>
            <div class="audio-desc">{{ voice.desc }}</div>
            <!-- .stop keeps the play toggle from also selecting the card. -->
            <button
              type="button"
              class="play-btn"
              :aria-label="(playingRef === voice.id ? '暂停' : '播放') + ' ' + voice.name"
              @click.stop="playRef(voice.id)"
            >
              {{ playingRef === voice.id ? '⏸' : '▶' }}
            </button>
          </div>
        </div>
        <div class="or-divider">或</div>
        <button type="button" class="upload-btn" @click="uploadRef">
          📤 上传自己的音频
        </button>
      </div>

      <!-- Step 2: processing animation -->
      <div class="section process-section">
        <div class="section-title">
          <span class="num">2</span>
          AI 学习声音特征
        </div>
        <div class="process-flow">
          <div
            v-for="(step, index) in processSteps"
            :key="step.id"
            class="process-step"
            :class="{ active: currentStep >= index }"
          >
            <div class="step-icon" aria-hidden="true">{{ step.icon }}</div>
            <div class="step-name">{{ step.name }}</div>
            <!-- Connector after every step except the last one. -->
            <div
              v-if="index < processSteps.length - 1"
              class="step-arrow"
              aria-hidden="true"
            >→</div>
          </div>
        </div>
        <!-- The canvas only exists once the animation has reached step 2. -->
        <div v-if="currentStep >= 2" class="feature-viz">
          <canvas
            ref="featureCanvas"
            width="400"
            height="100"
            aria-hidden="true"
          ></canvas>
          <div class="viz-label">提取的声音特征向量</div>
        </div>
      </div>

      <!-- Step 3: text input and generated result -->
      <div class="section">
        <div class="section-title">
          <span class="num">3</span>
          输入文本生成语音
        </div>
        <div class="text-input">
          <textarea
            v-model="inputText"
            placeholder="输入要合成的文本..."
            aria-label="要合成的文本"
            rows="3"
          ></textarea>
          <button
            type="button"
            class="generate-btn"
            :disabled="!canGenerate"
            @click="generate"
          >
            <span v-if="isGenerating" class="spinner"></span>
            <span v-else>🎙 生成语音</span>
          </button>
        </div>
        <!-- The result panel (and its canvas) is mounted only while generatedAudio is true. -->
        <div v-if="generatedAudio" class="result-area">
          <div class="result-header">
            <span class="result-icon" aria-hidden="true">🎵</span>
            <span>生成结果</span>
            <span class="similarity">相似度: {{ similarity }}%</span>
          </div>
          <div class="waveform-mini">
            <canvas
              ref="resultCanvas"
              width="400"
              height="60"
              aria-hidden="true"
            ></canvas>
          </div>
          <div class="result-actions">
            <button type="button" class="action-btn" @click="playResult">
              {{ playingResult ? ' 暂停' : ' 播放' }}
            </button>
            <button type="button" class="action-btn secondary" @click="download">
              下载
            </button>
          </div>
        </div>
      </div>
    </div>

    <div class="tips-section">
      <div class="tips-title">💡 声音克隆小贴士</div>
      <div class="tips-grid">
        <div class="tip-card">
          <div class="tip-icon" aria-hidden="true">⏱</div>
          <div class="tip-text">
            <strong>参考音频时长</strong>
            <p>3-10 秒即可，质量比时长更重要</p>
          </div>
        </div>
        <div class="tip-card">
          <div class="tip-icon" aria-hidden="true">🔇</div>
          <div class="tip-text">
            <strong>环境要求</strong>
            <p>安静环境，避免背景噪音</p>
          </div>
        </div>
        <div class="tip-card">
          <div class="tip-icon" aria-hidden="true">🗣</div>
          <div class="tip-text">
            <strong>内容选择</strong>
            <p>包含多种音调和语速，效果更好</p>
          </div>
        </div>
      </div>
    </div>

    <div class="info-box">
      <span class="icon" aria-hidden="true">🔬</span>
      <p>
        <strong>技术原理：</strong>
        声音克隆通过提取参考音频的音色、语调和说话风格特征，构建说话人嵌入向量。
        生成时，TTS 模型结合文本内容和说话人嵌入，合成与参考声音相似的语音。
      </p>
    </div>
  </div>
</template>
<script setup>
import { computed, nextTick, onBeforeUnmount, onMounted, ref, watch } from 'vue'
// Cloning modes rendered as tabs; the `id` of the clicked entry is stored in selectedMode.
const modes = [
{ id: 'zeroshot', name: '零样本克隆', icon: '🎯' },
{ id: 'fewshot', name: '少样本克隆', icon: '📚' },
{ id: 'crosslingual', name: '跨语言克隆', icon: '🌍' }
]
// Built-in reference voices rendered as selectable cards (avatar, name, short description).
const references = [
{ id: 'male1', name: '男声 A', avatar: '👨', desc: '低沉磁性' },
{ id: 'female1', name: '女声 B', avatar: '👩', desc: '温柔甜美' },
{ id: 'child', name: '童声', avatar: '🧒', desc: '活泼可爱' },
{ id: 'elder', name: '老人', avatar: '👴', desc: '沧桑稳重' }
]
// Ordered stages of the simulated feature-extraction animation.
const processSteps = [
{ id: 'load', name: '加载音频', icon: '📂' },
{ id: 'encode', name: '编码特征', icon: '🔢' },
{ id: 'extract', name: '提取音色', icon: '🎨' },
{ id: 'embed', name: '构建嵌入', icon: '💎' }
]
// Id of the active mode tab.
const selectedMode = ref('zeroshot')
// Id of the chosen reference voice, or null when none is chosen.
const selectedRef = ref(null)
// Animation progress; a step is highlighted when currentStep >= its index.
const currentStep = ref(0)
// Text bound to the textarea via v-model.
const inputText = ref('')
// True while the simulated generation delay is running.
const isGenerating = ref(false)
// Boolean flag only: shows or hides the result panel, it holds no audio data.
const generatedAudio = ref(false)
// Percentage displayed in the result header.
const similarity = ref(0)
// Id of the reference voice whose play button is toggled on, or null.
const playingRef = ref(null)
// Play/pause toggle state of the generated result.
const playingResult = ref(false)
// Template refs for the two <canvas> elements; both sit under v-if, so they
// are null whenever their element is not currently rendered.
const featureCanvas = ref(null)
const resultCanvas = ref(null)
// Gate for the generate button: truthy only when a reference voice is chosen,
// the text box holds non-whitespace text, and no generation is in flight.
const canGenerate = computed(() => {
  const hasText = inputText.value.trim().length > 0
  const idle = !isGenerating.value
  return selectedRef.value && hasText && idle
})
// Activate the mode tab with the given id, then wipe the demo state.
function selectMode(id) {
  selectedMode.value = id
  resetDemo()
}
// Choose a reference voice and restart the extraction animation from the first step.
function selectRef(id) {
  selectedRef.value = id
  currentStep.value = 0
  simulateProcess()
}
// Toggle the "playing" marker for a reference voice: clicking the voice that
// is already marked clears it, clicking any other voice moves the marker.
// No audio is actually played.
function playRef(id) {
  if (playingRef.value === id) {
    playingRef.value = null
  } else {
    playingRef.value = id
  }
}
// Placeholder for the upload flow: only shows a dialog saying a file picker would open.
function uploadRef() {
  window.alert('模拟：打开文件选择器')
}
// Handle of the running step animation, or null when idle. It lives outside
// simulateProcess so that a new run can cancel the previous one.
let processTimer = null

// Cancel the step animation if one is running.
const stopProcessTimer = () => {
  if (processTimer !== null) {
    clearInterval(processTimer)
    processTimer = null
  }
}

// Do not leave the interval ticking after the component is destroyed.
onBeforeUnmount(stopProcessTimer)

/**
 * Animate the extraction steps: reset currentStep to 0, advance it by one
 * every 500 ms, and draw the feature bars once it has passed the last step.
 *
 * Only one animation runs at a time. Previously each call started its own
 * interval, so selecting voices in quick succession made several timers
 * increment currentStep together and the steps advanced too fast.
 */
const simulateProcess = () => {
  stopProcessTimer()
  currentStep.value = 0
  processTimer = setInterval(() => {
    // The selection was cleared mid-animation (resetDemo sets it to null):
    // abandon the run instead of animating steps for no voice.
    if (selectedRef.value === null) {
      stopProcessTimer()
      return
    }
    currentStep.value++
    if (currentStep.value >= processSteps.length) {
      stopProcessTimer()
      drawFeatures()
    }
  }, 500)
}
// Paint 20 randomly sized bars on the feature canvas as a stand-in for a
// speaker-feature vector. Does nothing while the canvas is not rendered.
function drawFeatures() {
  const el = featureCanvas.value
  if (!el) return

  const g = el.getContext('2d')
  const { width, height } = el
  g.clearRect(0, 0, width, height)

  // Bars sit inside a 20px margin on every side.
  const BAR_COUNT = 20
  const slot = (width - 40) / BAR_COUNT
  Array.from({ length: BAR_COUNT }).forEach((_, idx) => {
    // Level in [0.2, 1.0); it drives both the bar height and its hue (200-260).
    const level = Math.random() * 0.8 + 0.2
    const barHeight = level * (height - 40)
    const hue = 200 + level * 60
    g.fillStyle = `hsl(${hue}, 70%, 50%)`
    // 2px narrower than the slot so neighbouring bars are separated.
    g.fillRect(20 + idx * slot, height - 20 - barHeight, slot - 2, barHeight)
  })
}
/**
 * Simulate speech generation: show the spinner for 2 seconds, then reveal the
 * result panel with a random similarity score (85-99) and draw its waveform.
 */
const generate = () => {
  isGenerating.value = true
  generatedAudio.value = false
  setTimeout(() => {
    isGenerating.value = false
    generatedAudio.value = true
    similarity.value = Math.floor(Math.random() * 15) + 85
    // The result canvas is rendered under v-if="generatedAudio", so it is not
    // in the DOM until Vue has flushed this update. Drawing synchronously
    // found a null canvas ref and silently skipped the waveform.
    nextTick(drawResultWaveform)
  }, 2000)
}
// Stroke a jittery sine-shaped line across the result canvas as a fake
// waveform. Does nothing while the canvas is not rendered.
function drawResultWaveform() {
  const el = resultCanvas.value
  if (!el) return

  const g = el.getContext('2d')
  const { width, height } = el
  g.clearRect(0, 0, width, height)
  g.strokeStyle = '#409eff'
  g.lineWidth = 2
  g.beginPath()

  // One point every 2px; the sine amplitude (max 20px) is scaled by a fresh
  // random factor per point, which produces the jitter.
  let x = 0
  while (x < width) {
    const y = height / 2 + Math.sin(x * 0.1) * 20 * Math.random()
    if (x === 0) {
      g.moveTo(x, y)
    } else {
      g.lineTo(x, y)
    }
    x += 2
  }
  g.stroke()
}
// Flip the play/pause state of the generated result (button label only; no audio is played).
function playResult() {
  const wasPlaying = playingResult.value
  playingResult.value = !wasPlaying
}
// Placeholder for the download action: only shows a dialog saying a file would be downloaded.
function download() {
  window.alert('模拟：下载音频文件')
}
/**
 * Return the demo to its initial state: no voice chosen, animation at step 0,
 * empty text, result panel hidden.
 *
 * The two playback flags are cleared as well. Previously they survived a
 * reset, so a card could stay marked as playing and the next generated result
 * opened with its button already in the "暂停" state.
 */
const resetDemo = () => {
  selectedRef.value = null
  currentStep.value = 0
  inputText.value = ''
  generatedAudio.value = false
  similarity.value = 0
  playingRef.value = null
  playingResult.value = false
}
// Draw the feature bars on mount, but only if the canvas is already rendered.
onMounted(() => {
  const canvasReady = Boolean(featureCanvas.value)
  if (!canvasReady) return
  drawFeatures()
})
</script>
<style scoped>
/* Component root: bordered, rounded card with a system UI font stack. */
.voice-clone-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 24px 0;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
/* Centered title/subtitle block above the demo. */
.header {
text-align: center;
margin-bottom: 24px;
}
/*
  Gradient headline: the background gradient is clipped to the glyph shapes
  and the text fill is made transparent so the gradient shows through.
  The standard background-clip property is declared alongside the -webkit-
  prefixed one so the effect does not depend on the prefixed form alone.
*/
.title {
  font-size: 18px;
  font-weight: 700;
  margin-bottom: 8px;
  background: linear-gradient(120deg, #409eff, #e6a23c);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
.subtitle {
font-size: 14px;
color: var(--vp-c-text-2);
}
/* Mode selector: a centered row of pill-shaped buttons. */
.mode-tabs {
display: flex;
gap: 12px;
margin-bottom: 24px;
justify-content: center;
}
.mode-btn {
padding: 10px 20px;
border: 1px solid var(--vp-c-divider);
border-radius: 20px;
background: var(--vp-c-bg);
cursor: pointer;
display: flex;
align-items: center;
gap: 6px;
transition: all 0.2s;
}
.mode-btn:hover {
border-color: var(--vp-c-brand);
}
/* Selected mode: filled with the brand color. */
.mode-btn.active {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
/* Panel that wraps the three numbered steps. */
.demo-area {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
/* Steps are spaced apart; the last one drops its bottom margin. */
.section {
margin-bottom: 24px;
}
.section:last-child {
margin-bottom: 0;
}
.section-title {
display: flex;
align-items: center;
gap: 8px;
font-weight: 600;
margin-bottom: 16px;
}
/* Step number badge: a 24px brand-colored circle with the digit centered. */
.section-title .num {
width: 24px;
height: 24px;
background: var(--vp-c-brand);
color: white;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 12px;
}
/* Reference voice cards: responsive grid, each column at least 100px wide. */
.audio-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(100px, 1fr));
gap: 12px;
margin-bottom: 16px;
}
/* position: relative makes the card the anchor for the absolutely positioned .play-btn. */
.audio-card {
background: var(--vp-c-bg-soft);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
padding: 16px;
text-align: center;
cursor: pointer;
transition: all 0.2s;
position: relative;
}
.audio-card:hover {
border-color: var(--vp-c-brand);
}
.audio-card.selected {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.audio-avatar {
font-size: 32px;
margin-bottom: 8px;
}
.audio-name {
font-weight: 500;
font-size: 13px;
margin-bottom: 4px;
}
.audio-desc {
font-size: 11px;
color: var(--vp-c-text-3);
}
/* Round 28px play toggle pinned to the card's top-right corner. */
.play-btn {
position: absolute;
top: 8px;
right: 8px;
width: 28px;
height: 28px;
border-radius: 50%;
border: none;
background: var(--vp-c-brand);
color: white;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
}
/* Separator between the preset voices and the upload button. */
.or-divider {
text-align: center;
color: var(--vp-c-text-3);
margin: 12px 0;
font-size: 13px;
}
/* Full-width upload button styled as a dashed drop-zone. */
.upload-btn {
width: 100%;
padding: 12px;
border: 2px dashed var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg-soft);
cursor: pointer;
color: var(--vp-c-text-2);
transition: all 0.2s;
}
.upload-btn:hover {
border-color: var(--vp-c-brand);
color: var(--vp-c-brand);
}
/* Extraction steps: centered row that wraps when it runs out of width. */
.process-flow {
display: flex;
justify-content: center;
align-items: center;
gap: 8px;
flex-wrap: wrap;
margin-bottom: 16px;
}
/* Steps are dimmed (50% opacity) until they receive .active. */
.process-step {
display: flex;
align-items: center;
gap: 8px;
padding: 12px 16px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
opacity: 0.5;
transition: all 0.3s;
}
.process-step.active {
opacity: 1;
background: var(--vp-c-brand);
color: white;
}
.step-icon {
font-size: 20px;
}
.step-name {
font-size: 13px;
font-weight: 500;
}
.step-arrow {
color: var(--vp-c-text-3);
}
/* Container for the feature canvas and its caption. */
.feature-viz {
background: var(--vp-c-bg-soft);
border-radius: 8px;
padding: 16px;
text-align: center;
}
/* Scale the canvas to the container width; height: auto keeps its aspect ratio. */
.feature-viz canvas {
width: 100%;
height: auto;
}
.viz-label {
font-size: 12px;
color: var(--vp-c-text-3);
margin-top: 8px;
}
/* Text box for the sentence to synthesize; the user may resize it vertically only. */
.text-input textarea {
width: 100%;
padding: 12px;
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg-soft);
font-size: 14px;
resize: vertical;
margin-bottom: 12px;
}
/* Primary action button; flex centering lets it hold either the label or the spinner. */
.generate-btn {
width: 100%;
padding: 14px;
background: linear-gradient(120deg, #409eff, #67c23a);
color: white;
border: none;
border-radius: 8px;
font-size: 15px;
font-weight: 500;
cursor: pointer;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
/* Hover feedback applies only while the button is enabled. */
.generate-btn:hover:not(:disabled) {
opacity: 0.9;
}
.generate-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
/* Loading indicator: a ring with one solid white edge, rotated by the spin animation. */
.spinner {
width: 16px;
height: 16px;
border: 2px solid rgba(255,255,255,0.3);
border-top-color: white;
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
/* Result panel, outlined in green (#67c23a). */
.result-area {
margin-top: 16px;
padding: 16px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
border: 2px solid #67c23a;
}
.result-header {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 12px;
}
.result-icon {
font-size: 20px;
}
/* Similarity badge; margin-left: auto pushes it to the right edge of the header row. */
.similarity {
margin-left: auto;
font-size: 12px;
padding: 4px 8px;
background: #67c23a33;
color: #67c23a;
border-radius: 4px;
}
.waveform-mini {
background: var(--vp-c-bg);
border-radius: 4px;
margin-bottom: 12px;
}
/* Scale the canvas to the container width; height: auto keeps its aspect ratio. */
.waveform-mini canvas {
width: 100%;
height: auto;
}
/* Action buttons share the row equally (flex: 1). */
.result-actions {
display: flex;
gap: 8px;
}
.action-btn {
flex: 1;
padding: 10px;
background: var(--vp-c-brand);
color: white;
border: none;
border-radius: 6px;
cursor: pointer;
font-size: 13px;
}
/* Secondary variant: muted background with regular text color. */
.action-btn.secondary {
background: var(--vp-c-bg-mute);
color: var(--vp-c-text-1);
}
/* Tips panel below the demo. */
.tips-section {
background: var(--vp-c-bg);
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
}
.tips-title {
font-weight: 600;
margin-bottom: 16px;
text-align: center;
}
/* Responsive grid of tip cards, each column at least 150px wide. */
.tips-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
}
/* Tip card: icon on the left, text on the right. */
.tip-card {
display: flex;
gap: 12px;
padding: 16px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
}
.tip-icon {
font-size: 24px;
}
/* display: block puts the tip heading on its own line above the paragraph. */
.tip-text strong {
font-size: 13px;
display: block;
margin-bottom: 4px;
}
.tip-text p {
font-size: 12px;
color: var(--vp-c-text-3);
margin: 0;
}
/* Explanatory note at the bottom: icon and text laid out side by side. */
.info-box {
display: flex;
gap: 12px;
padding: 16px;
background: var(--vp-c-bg-mute);
border-radius: 8px;
font-size: 13px;
line-height: 1.6;
}
/* flex-shrink: 0 keeps the icon at its natural width when the text wraps. */
.info-box .icon {
font-size: 18px;
flex-shrink: 0;
}
/* Viewports up to 640px wide: stack the mode tabs and the process steps
   vertically, and rotate the step arrows a quarter turn. */
@media (max-width: 640px) {
.mode-tabs {
flex-direction: column;
}
.process-flow {
flex-direction: column;
}
.step-arrow {
transform: rotate(90deg);
}
}
</style>