Files
test-repo/docs/.vitepress/theme/components/appendix/llm-intro/TokenizationDemo.vue
T
sanbuphy 73f4788d7e feat: comprehensive documentation and demo updates
- Update READMEs and docs across multiple languages
- Enhance interactive demos for Agent, LLM, VLM, Audio, Image Gen, Terminal, and Web Basics
- Add new appendix sections for Database and IDE intros
- Update VitePress config, theme, and utility scripts
- Clean up unused assets and components
2026-01-16 19:10:51 +08:00

417 lines
9.3 KiB
Vue
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!--
TokenizationDemo.vue
分词原理演示组件
用途:
展示大语言模型如何处理文本:通过将文本拆解为 Token,让用户理解 Token(LLM 处理的最小单位)。
交互功能:
- 文本输入:用户可输入任意文本
- 实时分词:模拟 Tokenizer 将文本切分为 Token
- 映射展示:显示 Token 文本与其对应的模拟数字 ID
- 颜色编码:使用不同颜色区分相邻 Token,直观展示切分边界
-->
<template>
  <div class="token-demo">
    <!-- Controls: free-text input, tokenizer algorithm selector, live counters -->
    <div class="control-panel">
      <div class="main-controls">
        <div class="input-group">
          <label>Input Text / 输入文本</label>
          <textarea
            v-model="inputText"
            rows="3"
            placeholder="Type something to see how AI reads it..."
          ></textarea>
        </div>
        <div class="settings-group">
          <label>Algorithm / 算法</label>
          <div class="radio-group">
            <label
              class="radio-option"
              :class="{ active: algorithm === 'bpe' }"
            >
              <input type="radio" v-model="algorithm" value="bpe" />
              <span>BPE (GPT-4)</span>
            </label>
            <label
              class="radio-option"
              :class="{ active: algorithm === 'word' }"
            >
              <input type="radio" v-model="algorithm" value="word" />
              <span>Word (Legacy)</span>
            </label>
            <label
              class="radio-option"
              :class="{ active: algorithm === 'char' }"
            >
              <input type="radio" v-model="algorithm" value="char" />
              <span>Character (Raw)</span>
            </label>
          </div>
        </div>
      </div>
      <div class="stats">
        <div class="stat-item">
          <span class="value">{{ tokens.length }}</span>
          <span class="label">Tokens</span>
        </div>
        <div class="stat-item">
          <!--
            Count Unicode code points rather than UTF-16 code units so that an
            emoji or other astral character is reported as one character.
          -->
          <span class="value">{{ Array.from(inputText).length }}</span>
          <span class="label">Characters / 字符</span>
        </div>
      </div>
    </div>
    <!-- Tokenizer Process Visualization -->
    <!-- Decorative arrow between the input and the token list; hidden from assistive tech -->
    <div class="tokenizer-arrow" aria-hidden="true">↓</div>
    <div class="visualization-area">
      <div class="token-list">
        <!--
          One chip per token. The colour class cycles through five palette
          entries so that adjacent tokens are visually distinct.
          mouseenter is used (not mouseover) because it pairs with mouseleave
          and does not re-fire when the pointer moves over the chip's children.
        -->
        <div
          v-for="(token, index) in tokens"
          :key="index"
          class="token-chip"
          :class="`color-${index % 5}`"
          @mouseenter="hoverIndex = index"
          @mouseleave="hoverIndex = -1"
        >
          <span class="token-text">{{ token.text }}</span>
          <span class="token-id">{{ token.id }}</span>
          <!-- Tooltip is rendered only for the chip currently hovered -->
          <div class="tooltip" v-if="hoverIndex === index">
            ID: {{ token.id }}<br />
            Type: {{ token.type }}
          </div>
        </div>
      </div>
    </div>
    <div class="info-box">
      <p>
        <span class="icon">💡</span>
        <strong>Note:</strong>
        LLM 不直接理解单词,它们处理的是数字(Token IDs)。对于英文,一个 Token
        通常是一个单词或单词的一部分(如 "ing");对于中文,一个 Token
        通常是一个汉字或词组。
      </p>
    </div>
  </div>
</template>
<script setup>
import { ref, computed } from 'vue'
// Reactive component state.
// algorithm: which simulated tokenizer is selected ('bpe', 'word' or 'char').
const algorithm = ref('bpe')
// hoverIndex: index of the token chip under the pointer, or -1 when none is hovered.
const hoverIndex = ref(-1)
// inputText: the text being tokenized, seeded with a mixed English/Chinese sample.
const inputText = ref(
  'The quick brown fox jumps over the lazy dog. \n今天天气真不错!'
)
// Simulated tokenizers (illustrative only, no real vocabulary is involved).
// Produces the list of display tokens for the current input text and the
// selected algorithm. Each token is { text, id, type }:
//   text - the exact substring covered by the token
//   id   - a deterministic fake token ID derived from the text
//   type - a human-readable category shown in the tooltip
const tokens = computed(() => {
  const text = inputText.value
  const result = []

  // Deterministic fake ID in [0, 50000): the same token text always maps to
  // the same ID. The shift operates on the 32-bit truncation of `hash`.
  const generateId = (str) => {
    let hash = 0
    for (let i = 0; i < str.length; i++) {
      hash = str.charCodeAt(i) + ((hash << 5) - hash)
    }
    return Math.abs(hash) % 50000
  }

  const addToken = (piece, type) => {
    result.push({ text: piece, id: generateId(piece), type })
  }

  if (algorithm.value === 'bpe') {
    // 1. BPE (subword) simulation: a run of Latin letters is one token, each
    //    Han ideograph is its own token, a whitespace run is one token, and
    //    any other character is a single-character token.
    //    The `u` flag makes `.` consume a whole code point, so emoji and other
    //    astral characters are not split into two lone surrogates. Line
    //    terminators, which `.` skips, are already covered by `\s`.
    const regex = /([a-zA-Z]+)|(\p{Script=Han})|(\s+)|(.)/gu
    let match
    while ((match = regex.exec(text)) !== null) {
      let type = 'other'
      if (match[1]) type = 'word (en)'
      else if (match[2]) type = 'char (zh)'
      else if (match[3]) type = 'whitespace'
      else if (/^\p{P}$/u.test(match[0])) type = 'punctuation'
      addToken(match[0], type)
    }
  } else if (algorithm.value === 'word') {
    // 2. Word-based simulation: split on whitespace runs (kept as tokens via
    //    the capture group), so punctuation stays attached to its word.
    for (const piece of text.split(/(\s+)/)) {
      // split() yields empty strings at the edges when the text starts or
      // ends with whitespace; those are not tokens.
      if (piece === '') continue
      addToken(piece, /^\s+$/.test(piece) ? 'whitespace' : 'word')
    }
  } else if (algorithm.value === 'char') {
    // 3. Character-based simulation: every code point is one token
    //    (for...of iterates strings by code point).
    for (const char of text) {
      addToken(char, /\s/.test(char) ? 'whitespace' : 'char')
    }
  }
  return result
})
</script>
<style scoped>
/* Root card of the demo: bordered, rounded, and set in the theme's --vp-font-family-mono font. */
.token-demo {
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
background-color: var(--vp-c-bg-soft);
padding: 1.5rem;
margin: 1rem 0;
font-family: var(--vp-font-family-mono);
}
/* Top row: the main controls and the stats column laid out side by side. */
.control-panel {
display: flex;
gap: 1.5rem;
margin-bottom: 1.5rem;
align-items: flex-start;
}
/* Left column: stacks the text input and the algorithm selector and takes the remaining width. */
.main-controls {
flex: 1;
display: flex;
flex-direction: column;
gap: 1rem;
min-width: 0; /* Prevent flex item from overflowing */
}
.input-group {
width: 100%;
}
/*
 * Caption style for labels inside the input and settings groups.
 * NOTE(review): `.settings-group label` also matches the `.radio-option`
 * labels and is more specific than `.radio-option` alone, so these
 * declarations take precedence there; confirm that is intended.
 */
.input-group label,
.settings-group label {
display: block;
font-size: 0.875rem;
font-weight: 600;
margin-bottom: 0.5rem;
color: var(--vp-c-text-2);
}
/* Algorithm selector: a wrapping row of pill-shaped radio options. */
.radio-group {
  display: flex;
  flex-wrap: wrap;
  gap: 0.5rem;
}
/* One selectable pill; the <label> wraps a hidden native radio and its caption. */
.radio-option {
  position: relative; /* containing block for the visually hidden input */
  display: flex;
  align-items: center;
  gap: 6px;
  padding: 6px 12px;
  border: 1px solid var(--vp-c-divider);
  border-radius: 6px;
  cursor: pointer;
  background-color: var(--vp-c-bg);
  font-size: 0.85rem;
  transition: all 0.2s;
}
.radio-option:hover {
  background-color: var(--vp-c-bg-alt);
}
/* Applied by the template to the option whose value equals the selected algorithm. */
.radio-option.active {
  border-color: var(--vp-c-brand);
  background-color: var(--vp-c-brand-soft);
  color: var(--vp-c-brand-dark);
}
/*
 * Hide the native radio visually but keep it focusable and exposed to
 * assistive technology. `display: none` would remove it from the tab order,
 * making the algorithm impossible to select with the keyboard.
 */
.radio-option input {
  position: absolute;
  width: 1px;
  height: 1px;
  margin: -1px;
  padding: 0;
  border: 0;
  overflow: hidden;
  clip: rect(0 0 0 0);
  clip-path: inset(50%);
  white-space: nowrap;
}
/* Visible focus indicator on the pill while its hidden radio has keyboard focus. */
.radio-option:focus-within {
  outline: 2px solid var(--vp-c-brand);
  outline-offset: 2px;
}
/* Centered, dimmed glyph row placed between the controls and the token visualization. */
.tokenizer-arrow {
text-align: center;
font-size: 1.5rem;
color: var(--vp-c-text-3);
margin: 0.5rem 0;
opacity: 0.5;
}
/* Text input: full width, vertically resizable, inherits the demo's font. */
textarea {
width: 100%;
padding: 0.75rem;
border-radius: 6px;
border: 1px solid var(--vp-c-divider);
background-color: var(--vp-c-bg);
color: var(--vp-c-text-1);
font-family: inherit;
resize: vertical;
transition: border-color 0.2s;
}
/* The default focus outline is replaced by a brand-coloured border. */
textarea:focus {
outline: none;
border-color: var(--vp-c-brand);
}
/* Right column: counters stacked vertically (switched to a row on narrow screens). */
.stats {
display: flex;
flex-direction: column;
gap: 0.75rem;
min-width: 100px;
}
/* One counter box: large value above a small caption. */
.stat-item {
display: flex;
flex-direction: column;
align-items: center;
background-color: var(--vp-c-bg);
padding: 0.5rem;
border-radius: 6px;
border: 1px solid var(--vp-c-divider);
}
.stat-item .value {
font-size: 1.5rem;
font-weight: bold;
color: var(--vp-c-brand);
line-height: 1;
}
.stat-item .label {
font-size: 0.75rem;
color: var(--vp-c-text-2);
margin-top: 0.25rem;
}
/* Panel that holds the token chips; min-height keeps it visible when there are no tokens. */
.visualization-area {
background-color: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 6px;
padding: 1rem;
min-height: 100px;
margin-bottom: 1rem;
}
/* Chips flow left to right and wrap onto new lines. */
.token-list {
display: flex;
flex-wrap: wrap;
gap: 4px;
}
/* One token: text above its ID. position: relative anchors the absolutely positioned tooltip. */
.token-chip {
position: relative;
display: inline-flex;
flex-direction: column;
align-items: center;
padding: 4px 6px;
border-radius: 4px;
cursor: help;
transition: transform 0.1s;
}
/* Hovered chip is enlarged slightly and raised above its neighbours. */
.token-chip:hover {
transform: scale(1.05);
z-index: 10;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
/* white-space: pre keeps spaces and newlines inside a token from collapsing. */
.token-text {
font-size: 1rem;
line-height: 1.4;
white-space: pre;
}
.token-id {
font-size: 0.6rem;
opacity: 0.6;
margin-top: 2px;
}
/* Color palette for tokens: the template assigns color-N with N = token index % 5. */
.color-0 {
background-color: rgba(255, 99, 132, 0.2);
border: 1px solid rgba(255, 99, 132, 0.3);
}
.color-1 {
background-color: rgba(54, 162, 235, 0.2);
border: 1px solid rgba(54, 162, 235, 0.3);
}
.color-2 {
background-color: rgba(255, 206, 86, 0.2);
border: 1px solid rgba(255, 206, 86, 0.3);
}
.color-3 {
background-color: rgba(75, 192, 192, 0.2);
border: 1px solid rgba(75, 192, 192, 0.3);
}
.color-4 {
background-color: rgba(153, 102, 255, 0.2);
border: 1px solid rgba(153, 102, 255, 0.3);
}
/*
 * Tooltip shown above a token chip, horizontally centered on it.
 * Colors are inverted relative to the page (text color as background).
 * pointer-events: none lets the pointer pass through to the chip underneath.
 */
.tooltip {
position: absolute;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
background-color: var(--vp-c-text-1);
color: var(--vp-c-bg);
padding: 4px 8px;
border-radius: 4px;
font-size: 0.75rem;
white-space: nowrap;
pointer-events: none;
margin-bottom: 6px;
z-index: 20;
}
/* Downward-pointing caret under the tooltip, drawn with a border triangle. */
.tooltip::after {
content: '';
position: absolute;
top: 100%;
left: 50%;
margin-left: -4px;
border-width: 4px;
border-style: solid;
border-color: var(--vp-c-text-1) transparent transparent transparent;
}
/* Explanatory note at the bottom of the demo. */
.info-box {
display: flex;
align-items: flex-start;
gap: 0.5rem;
padding: 0.75rem;
background-color: var(--vp-c-bg-alt);
border-radius: 6px;
font-size: 0.875rem;
color: var(--vp-c-text-2);
}
.info-box .icon {
font-size: 1.1em;
}
/* Narrow viewports (640px and below): stack the control panel and lay the counters out in one row. */
@media (max-width: 640px) {
.control-panel {
flex-direction: column;
gap: 1rem;
}
.stats {
flex-direction: row;
width: 100%;
justify-content: space-between;
}
/* Counters share the row width equally. */
.stat-item {
flex: 1;
}
}
</style>