Files
test-repo/docs/.vitepress/theme/components/appendix/rag/ChunkingStrategyDemo.vue
T
sanbuphy 3af119a598 feat(appendix): 添加多个交互式演示组件,完善 AI/Infra 等章节内容
- 新增 Vibe Coding 全栈相关演示组件 (DeveloperSkillShift, FrontendTriad, BackendCore 等)
- 新增 RAG 相关组件 (RAGPipeline, ChunkingStrategy, Retrieval 等)
- 新增 Embedding & Vector 相关组件 (EmbeddingConcept, VectorSimilarity 等)
- 新增 AI Native App 设计组件 (AINativeArch, PromptDesign 等)
- 新增 Infrastructure as Code 组件 (IaCConcept, TerraformWorkflow 等)
- 新增 DNS & HTTPS 演示组件 (DnsResolution, HttpsHandshake 等)
- 新增 Model Finetuning 组件 (FinetuningPipeline 等)
- 更新多个章节的 markdown 内容,集成交互式演示
2026-02-24 18:22:58 +08:00

440 lines
11 KiB
Vue
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
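<!--
ChunkingStrategyDemo.vue
Interactive demo of text chunking strategies.

Purpose:
Shows how different chunking strategies (fixed size, by sentence, semantic, recursive) split text.
Users can enter text and observe how each strategy divides it.

Interactions:
- Enter custom text or load the preset sample text
- Switch between chunking strategies
- Visualize the resulting chunks and their boundaries
-->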
<template>
<div class="chunking-demo">
<!-- Input area: free-text textarea plus a button that loads the preset sample text -->
<div class="input-section">
<div class="section-header">
<span class="section-title">输入文本</span>
<button
class="preset-btn"
@click="usePreset"
>
使用示例文本
</button>
</div>
<!-- Two-way bound to inputText; every edit re-evaluates the `chunks` computed -->
<textarea
v-model="inputText"
class="text-input"
rows="4"
placeholder="请输入要分块的文本,或点击「使用示例文本」..."
/>
</div>
<!-- Strategy picker: one button per entry in `strategies`; the selected one gets the `active` class -->
<div class="strategy-selector">
<button
v-for="s in strategies"
:key="s.id"
:class="['strategy-btn', { active: currentStrategy === s.id }]"
@click="currentStrategy = s.id"
>
<span class="strategy-icon">{{ s.icon }}</span>
<span class="strategy-name">{{ s.name }}</span>
</button>
</div>
<!-- Description card for the selected strategy (name, description, parameter tags) -->
<div class="strategy-info">
<div class="info-title">{{ activeStrategy.name }}</div>
<div class="info-desc">{{ activeStrategy.desc }}</div>
<div class="info-params">
<span
v-for="(p, i) in activeStrategy.params"
:key="i"
class="param-tag"
>
{{ p }}
</span>
</div>
</div>
<!-- Result list: one card per chunk, colored by cycling through chunkColors -->
<div class="result-section">
<div class="result-header">
分块结果
<span class="chunk-count"> {{ chunks.length }} 个块</span>
</div>
<div class="chunks-container">
<!-- The inline border/background color is picked with i % chunkColors.length, so colors repeat after the palette is exhausted -->
<div
v-for="(chunk, i) in chunks"
:key="i"
class="chunk-item"
:style="{ borderLeftColor: chunkColors[i % chunkColors.length] }"
>
<div class="chunk-meta">
<span
class="chunk-index"
:style="{ background: chunkColors[i % chunkColors.length] }"
>
#{{ i + 1 }}
</span>
<!-- chunk.length is the JavaScript string length of the chunk -->
<span class="chunk-size">{{ chunk.length }} 字符</span>
</div>
<div class="chunk-text">{{ chunk }}</div>
</div>
<!-- Placeholder shown while there are no chunks (e.g. the input is empty) -->
<div
v-if="chunks.length === 0"
class="empty-hint"
>
请输入文本后查看分块结果
</div>
</div>
</div>
<!-- Static comparison of all strategies; the row of the selected strategy gets the `highlight` class -->
<div class="comparison-table">
<table>
<thead>
<tr>
<th>策略</th>
<th>优点</th>
<th>缺点</th>
<th>适用场景</th>
</tr>
</thead>
<tbody>
<tr
v-for="s in strategies"
:key="s.id"
:class="{ highlight: currentStrategy === s.id }"
>
<td class="strategy-cell">{{ s.icon }} {{ s.name }}</td>
<td>{{ s.pros }}</td>
<td>{{ s.cons }}</td>
<td>{{ s.useCase }}</td>
</tr>
</tbody>
</table>
</div>
</div>
</template>
<script setup>
import { ref, computed } from 'vue'
// Palette for chunk cards; the template indexes it with i % chunkColors.length.
const chunkColors = ['#6366f1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6', '#06b6d4']
// Sample passage that usePreset() copies into the textarea.
const presetText = '人工智能(AI)是计算机科学的一个分支,致力于创建能够模拟人类智能的系统。机器学习是 AI 的核心方法之一,它让计算机能够从数据中学习规律。深度学习是机器学习的子集,使用多层神经网络来处理复杂任务。自然语言处理(NLP)使计算机能够理解和生成人类语言。大语言模型(LLM)如 GPT 和 Claude 通过海量文本训练,具备了强大的语言理解和生成能力。RAG(检索增强生成)技术通过在生成前检索相关文档,显著提升了 LLM 回答的准确性和时效性。向量数据库是 RAG 系统的关键组件,它能高效存储和检索文本的向量表示。'
// Text currently in the textarea (bound via v-model); starts empty.
const inputText = ref('')
// id of the selected entry in `strategies`; 'fixed' is selected initially.
const currentStrategy = ref('fixed')
// Catalogue of the chunking strategies offered by the demo.
// Each entry supplies: `id` (compared against currentStrategy), `name` and `icon`
// (button and table label), `desc` and `params` (description card), and
// `pros` / `cons` / `useCase` (comparison table columns).
// The `params` strings are display text only; nothing in this file parses them.
const strategies = [
{
id: 'fixed',
name: '固定大小',
icon: '📏',
desc: '按照固定的字符数切分文本,是最简单直接的分块方式。通常会设置一定的重叠区域(overlap),避免在切分边界丢失上下文。',
params: ['块大小: 80 字符', '重叠: 20 字符'],
pros: '实现简单,块大小均匀',
cons: '可能在句子中间截断',
useCase: '结构化程度低的长文本'
},
{
id: 'sentence',
name: '按句子',
icon: '📝',
desc: '以句号、问号、感叹号等标点作为分隔符,按完整句子进行切分。保证每个块都是语义完整的句子集合。',
params: ['每块: 2-3 句', '分隔符: 。?!'],
pros: '保持句子完整性',
cons: '块大小不均匀',
useCase: '文章、报告等自然文本'
},
{
id: 'semantic',
name: '语义分块',
icon: '🧠',
desc: '根据文本的语义相似度进行分块。当相邻句子的语义差异超过阈值时,在此处切分。能更好地保持主题的连贯性。',
// NOTE(review): the card advertises a 0.7 threshold, but chunkSemantic splits
// when similarity < 0.5 — confirm which value is intended.
params: ['相似度阈值: 0.7', '最小块: 50 字符'],
pros: '主题连贯,语义完整',
cons: '计算成本高,需要嵌入模型',
useCase: '多主题混合的复杂文档'
},
{
id: 'recursive',
name: '递归分块',
icon: '🔄',
desc: '使用多级分隔符递归切分:先按段落分,段落太长则按句子分,句子太长则按固定大小分。LangChain 的默认策略。',
// The doubled backslashes render as a literal "\n\n" in the parameter tag.
params: ['分隔符: \\n\\n → 。→ 固定', '目标: 80 字符'],
pros: '兼顾结构与大小',
cons: '实现较复杂',
useCase: '通用场景,推荐默认选择'
}
]
// Definition object (name, desc, params, ...) of the strategy whose id is
// currently selected; undefined if no entry in `strategies` matches.
const activeStrategy = computed(() => {
  const selectedId = currentStrategy.value
  return strategies.find(({ id }) => id === selectedId)
})
// Chunk list for the current input and strategy; recomputed whenever either changes.
// Empty (or whitespace-only) input and unknown strategy ids both yield [].
const chunks = computed(() => {
  const source = inputText.value.trim()
  if (source === '') return []

  // Strategy id -> thunk running the matching splitter with the demo's fixed settings.
  // A Map is used so that only the four listed ids can ever match.
  const splitters = new Map([
    ['fixed', () => chunkFixed(source, 80, 20)],
    ['sentence', () => chunkBySentence(source, 3)],
    ['semantic', () => chunkSemantic(source)],
    ['recursive', () => chunkRecursive(source, 80)]
  ])

  const split = splitters.get(currentStrategy.value)
  return split === undefined ? [] : split()
})
/**
 * Split text into fixed-size windows where consecutive windows share `overlap`
 * characters.
 *
 * Sizes are measured in JavaScript string units (the same unit as
 * `String.prototype.length`).
 *
 * @param {string} text - Text to split.
 * @param {number} size - Maximum length of each chunk. Values below 1 (or NaN)
 *   produce no chunks.
 * @param {number} overlap - Number of characters shared by neighbouring chunks.
 *   Clamped to the range [0, size - 1]; a non-finite value is treated as 0.
 * @returns {string[]} Chunks in order of appearance; empty for empty text.
 */
function chunkFixed(text, size, overlap) {
  // A window that cannot hold a single character can never make progress.
  if (!(size >= 1)) return []

  // Keep the overlap strictly below the window size: with overlap >= size the
  // start index would stand still (or move backwards) and the loop would never end.
  const safeOverlap = Number.isFinite(overlap) ? Math.min(Math.max(overlap, 0), size - 1) : 0
  const step = Math.max(1, size - safeOverlap)

  const result = []
  for (let start = 0; start < text.length; start += step) {
    result.push(text.slice(start, start + size))
    // This window already reached the end of the text. Another iteration would
    // only emit a fragment that is fully contained in the chunk just pushed.
    if (start + size >= text.length) break
  }
  return result
}
/**
 * Split text into sentences and group them `perChunk` at a time.
 *
 * A sentence ends after any of: 。 . ? ! or their full-width forms
 * (U+FF1F question mark, U+FF01 exclamation mark). The terminator stays
 * attached to its sentence. Whitespace-only fragments are dropped; other
 * fragments are kept verbatim, including leading whitespace.
 *
 * @param {string} text - Text to split.
 * @param {number} perChunk - Sentences per chunk. Floored, and raised to 1 when
 *   it is smaller than 1 or not a number.
 * @returns {string[]} Chunks in order; the last one may hold fewer sentences.
 */
function chunkBySentence(text, perChunk) {
  // The full-width marks are written as escapes so they cannot be confused with
  // (or silently normalised to) their ASCII look-alikes.
  const sentenceEnd = /(?<=[。.?!\uFF1F\uFF01])/
  const sentences = text.split(sentenceEnd).filter((s) => s.trim())

  // A step of 0 (or a negative/NaN step) would never advance the loop below.
  const groupSize = Math.max(1, Math.floor(perChunk) || 1)

  const result = []
  for (let i = 0; i < sentences.length; i += groupSize) {
    result.push(sentences.slice(i, i + groupSize).join(''))
  }
  return result
}
/**
 * Approximate "semantic" chunking without an embedding model.
 *
 * Each sentence is reduced to the subset of a fixed keyword list it contains.
 * The similarity of a sentence to the one before it is
 * |shared keywords| / max(|previous keywords|, |current keywords|), or 1 when
 * the previous sentence contained no keyword. A new chunk starts when the
 * similarity falls below `threshold` and the chunk built so far is longer than
 * `minLength`.
 *
 * Sentences end after 。 . ? ! or the full-width ?/! (U+FF1F, U+FF01).
 *
 * NOTE(review): the strategy card in `strategies` displays a 0.7 threshold;
 * the default here stays at 0.5 so existing output does not change — confirm
 * which value is intended.
 *
 * @param {string} text - Text to split.
 * @param {number} [threshold=0.5] - Similarity below which a boundary is placed.
 * @param {number} [minLength=50] - A chunk is only closed once its length
 *   exceeds this value.
 * @returns {string[]} Chunks in order of appearance; empty for empty text.
 */
function chunkSemantic(text, threshold = 0.5, minLength = 50) {
  // Full-width marks are written as escapes so they cannot be confused with
  // their ASCII look-alikes.
  const sentenceEnd = /(?<=[。.?!\uFF1F\uFF01])/
  const sentences = text.split(sentenceEnd).filter((s) => s.trim())
  const keywords = ['AI', 'LLM', 'RAG', 'NLP', '机器学习', '深度学习', '向量']

  const result = []
  let current = ''
  let prevKeywords = new Set()

  for (const sentence of sentences) {
    const curKeywords = new Set(keywords.filter((k) => sentence.includes(k)))

    let shared = 0
    for (const k of curKeywords) {
      if (prevKeywords.has(k)) shared += 1
    }
    // With no keywords in the previous sentence there is nothing to compare
    // against, so the sentence is treated as fully similar (never a boundary).
    const similarity = prevKeywords.size > 0 ? shared / Math.max(prevKeywords.size, curKeywords.size) : 1

    if (current && similarity < threshold && current.length > minLength) {
      result.push(current)
      current = sentence
    } else {
      current += sentence
    }
    // Only the immediately preceding sentence is compared, not the whole chunk.
    prevKeywords = curKeywords
  }

  if (current) result.push(current)
  return result
}
/**
 * Recursive chunking with three levels of separators.
 *
 * 1. Split on blank lines into paragraphs; a paragraph no longer than `target`
 *    becomes a chunk as-is.
 * 2. A longer paragraph is split into sentences (ending after 。 . ? ! or the
 *    full-width ?/! U+FF1F, U+FF01).
 * 3. A sentence that is itself longer than `target` is cut into consecutive
 *    slices of at most `target` characters.
 * The resulting pieces are then packed greedily: pieces are appended to the
 * current chunk until adding the next one would exceed `target`.
 *
 * Chunks never span paragraphs. When `target` is below 1 or not a number,
 * level 3 is skipped and sentences are kept whole.
 *
 * @param {string} text - Text to split.
 * @param {number} target - Desired maximum chunk length.
 * @returns {string[]} Chunks in order of appearance; empty for empty text.
 */
function chunkRecursive(text, target) {
  // Full-width marks are written as escapes so they cannot be confused with
  // their ASCII look-alikes.
  const sentenceEnd = /(?<=[。.?!\uFF1F\uFF01])/
  const hardLimit = Math.floor(target)

  // Level 3: cut one overlong sentence into fixed-size slices.
  const splitOversized = (sentence) => {
    if (!(hardLimit >= 1) || sentence.length <= hardLimit) return [sentence]
    const slices = []
    for (let i = 0; i < sentence.length; i += hardLimit) {
      slices.push(sentence.slice(i, i + hardLimit))
    }
    return slices
  }

  const result = []
  const paragraphs = text.split(/\n\n+/).filter((p) => p.trim())

  for (const para of paragraphs) {
    if (para.length <= target) {
      result.push(para)
      continue
    }

    const pieces = para
      .split(sentenceEnd)
      .filter((s) => s.trim())
      .flatMap(splitOversized)

    let current = ''
    for (const piece of pieces) {
      if (current && (current + piece).length > target) {
        result.push(current)
        current = piece
      } else {
        current += piece
      }
    }
    if (current) result.push(current)
  }

  return result
}
/**
 * Click handler for the preset button: replaces the textarea content with
 * presetText. Any text the user had typed is overwritten.
 */
function usePreset() {
inputText.value = presetText
}
</script>
<style scoped>
/* Outer card of the demo. The --vp-c-* custom properties used throughout this
   stylesheet are not defined in this file (presumably supplied by the VitePress
   theme — confirm). */
.chunking-demo {
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 20px;
margin: 16px 0;
background: var(--vp-c-bg-soft);
}
/* Input header row: title on the left, preset button on the right */
.section-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 8px;
}
.section-title {
font-weight: 600;
font-size: 14px;
}
/* Outlined button that loads the sample text */
.preset-btn {
padding: 4px 12px;
border: 1px solid var(--vp-c-brand-1);
border-radius: 6px;
background: transparent;
color: var(--vp-c-brand-1);
cursor: pointer;
font-size: 12px;
}
/* Textarea: border-box sizing keeps padding and border inside the 100% width;
   only vertical resizing is allowed */
.text-input {
width: 100%;
padding: 10px;
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg);
color: var(--vp-c-text-1);
font-size: 13px;
line-height: 1.6;
resize: vertical;
box-sizing: border-box;
}
/* Row of strategy buttons; wraps onto further lines when space runs out */
.strategy-selector {
display: flex;
gap: 8px;
margin: 16px 0;
flex-wrap: wrap;
}
.strategy-btn {
display: flex;
align-items: center;
gap: 6px;
padding: 8px 14px;
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background: var(--vp-c-bg);
cursor: pointer;
transition: all 0.2s;
font-size: 13px;
}
/* Selected strategy button (the template adds `active` when its id matches) */
.strategy-btn.active {
border-color: var(--vp-c-brand-1);
background: var(--vp-c-brand-soft);
color: var(--vp-c-brand-1);
}
.strategy-icon {
font-size: 16px;
}
/* Description card for the selected strategy */
.strategy-info {
padding: 14px;
border-radius: 8px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
margin-bottom: 16px;
}
.info-title {
font-weight: 600;
font-size: 14px;
color: var(--vp-c-brand-1);
margin-bottom: 6px;
}
.info-desc {
font-size: 13px;
color: var(--vp-c-text-2);
line-height: 1.6;
margin-bottom: 8px;
}
/* Parameter tags, laid out as a wrapping row */
.info-params {
display: flex;
gap: 8px;
flex-wrap: wrap;
}
.param-tag {
padding: 2px 10px;
border-radius: 4px;
background: var(--vp-c-bg-soft);
font-size: 12px;
color: var(--vp-c-text-2);
font-family: monospace;
}
/* Result header: title on the left, chunk count on the right */
.result-header {
display: flex;
justify-content: space-between;
align-items: center;
font-weight: 600;
font-size: 14px;
margin-bottom: 10px;
}
.chunk-count {
font-size: 12px;
color: var(--vp-c-text-3);
font-weight: 400;
}
/* Vertical stack of chunk cards */
.chunks-container {
display: flex;
flex-direction: column;
gap: 8px;
}
/* Chunk card. The left border is declared without a color here; the template
   sets borderLeftColor inline for each chunk. */
.chunk-item {
padding: 10px 12px;
border-radius: 8px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-left: 4px solid;
}
.chunk-meta {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 6px;
}
/* "#n" badge; its background color is set inline by the template */
.chunk-index {
padding: 1px 8px;
border-radius: 4px;
color: #fff;
font-size: 11px;
font-weight: 600;
}
.chunk-size {
font-size: 11px;
color: var(--vp-c-text-3);
}
/* break-all lets chunk text wrap at any character */
.chunk-text {
font-size: 13px;
color: var(--vp-c-text-2);
line-height: 1.5;
word-break: break-all;
}
/* Placeholder shown when there are no chunks */
.empty-hint {
text-align: center;
padding: 20px;
color: var(--vp-c-text-3);
font-size: 13px;
}
/* Wrapper scrolls horizontally when the table is wider than the container */
.comparison-table {
margin-top: 16px;
overflow-x: auto;
}
.comparison-table table {
width: 100%;
border-collapse: collapse;
font-size: 12px;
}
.comparison-table th,
.comparison-table td {
padding: 8px 10px;
border: 1px solid var(--vp-c-divider);
text-align: left;
}
.comparison-table th {
background: var(--vp-c-bg);
font-weight: 600;
}
/* Row of the currently selected strategy */
.comparison-table tr.highlight {
background: var(--vp-c-brand-soft);
}
/* Keep the icon and strategy name on one line */
.strategy-cell {
white-space: nowrap;
}
</style>