Files
test-repo/docs/.vitepress/theme/components/appendix/vlm-intro/ModelArchitectureComparisonDemo.vue
T

512 lines
12 KiB
Vue
Raw Normal View History

2026-01-15 20:10:19 +08:00
<template>
  <div class="model-evolution-demo">
    <!--
      Interactive diagram contrasting a text-only LLM pipeline with a
      multimodal VLM pipeline. Every mode-dependent piece of markup below is
      driven by the single `isVLM` flag.
    -->
    <div class="controls-header">
      <!--
        Mode switch. It is exposed as an ARIA switch and is focusable and
        operable with Enter/Space, so it is no longer mouse-only. It stays a
        <div> because it contains block-level children.
      -->
      <div
        class="toggle-container"
        role="switch"
        tabindex="0"
        :aria-checked="isVLM"
        aria-label="Multimodal VLM mode (多模态模式)"
        @click="toggleMode"
        @keydown.enter.prevent="toggleMode"
        @keydown.space.prevent="toggleMode"
      >
        <div class="toggle-track" :class="{ active: isVLM }">
          <div class="toggle-thumb" aria-hidden="true">
            {{ isVLM ? '👁️' : '🧠' }}
          </div>
        </div>
        <div class="toggle-label">
          <span :class="{ active: !isVLM }">Pure LLM (纯文本)</span>
          <!-- NOTE(review): arrow glyph was empty in the reviewed copy; "→" is assumed. -->
          <span class="arrow" aria-hidden="true">→</span>
          <span :class="{ active: isVLM }">Multimodal VLM (多模态)</span>
        </div>
      </div>
      <!-- Announced politely so screen-reader users hear the mode change. -->
      <div class="status-desc" aria-live="polite">
        {{
          isVLM
            ? 'Tokens from vision are translated and placed before text tokens. (视觉信息被翻译成 Token,放在文字 Token 之前。)'
            : 'Text-only tokens flow into the LLM. (只有文字 Token 流入大模型。)'
        }}
      </div>
    </div>

    <div class="diagram-stage">
      <div class="lanes">
        <!-- Vision lane: kept in the DOM and only hidden (v-show) in LLM mode. -->
        <div class="lane lane-vision" v-show="isVLM">
          <div class="lane-title">Vision Path (视觉路径)</div>
          <div class="lane-flow">
            <div class="node input-node">
              <span class="icon" aria-hidden="true">🖼</span>
              <span class="label">Image (图片)</span>
            </div>
            <!-- NOTE(review): all mini/big arrow glyphs were empty in the reviewed copy; "→" is assumed. -->
            <span class="mini-arrow" aria-hidden="true">→</span>
            <div class="node process-node vit-node">
              <span class="icon" aria-hidden="true">👁</span>
              <span class="label">ViT (视觉模型)</span>
            </div>
            <span class="mini-arrow" aria-hidden="true">→</span>
            <div class="node adapter-node">
              <span class="icon" aria-hidden="true">🔌</span>
              <span class="label">Projector (投影器)</span>
            </div>
            <span class="mini-arrow" aria-hidden="true">→</span>
            <div class="token-box token-box-vision">
              <div class="token-box-title">Vision Tokens (视觉 Token)</div>
              <div class="tokens">
                <span class="token vision">v1</span>
                <span class="token vision">v2</span>
                <span class="token vision">v3</span>
                <!-- NOTE(review): trailing pill was empty in the reviewed copy; an ellipsis is assumed. -->
                <span class="token vision">…</span>
              </div>
            </div>
          </div>
        </div>

        <!-- Text lane: always visible. -->
        <div class="lane lane-text">
          <div class="lane-title">Text Path (文字路径)</div>
          <div class="lane-flow">
            <div class="node input-node">
              <!-- NOTE(review): icon glyph was empty in the reviewed copy; confirm the intended emoji. -->
              <span class="icon" aria-hidden="true">⌨️</span>
              <span class="label">Prompt (提示词)</span>
            </div>
            <span class="mini-arrow" aria-hidden="true">→</span>
            <div class="node process-node">
              <span class="icon" aria-hidden="true">🔤</span>
              <span class="label">Embed (向量化)</span>
            </div>
            <span class="mini-arrow" aria-hidden="true">→</span>
            <div class="token-box">
              <div class="token-box-title">Text Tokens (文字 Token)</div>
              <div class="tokens">
                <span class="token text">t1</span>
                <span class="token text">t2</span>
                <span class="token text">t3</span>
                <span class="token text">…</span>
              </div>
            </div>
          </div>
        </div>

        <!-- Merged input sequence, followed by the LLM and its response. -->
        <div class="merge-stage">
          <div class="merge-title">Token Sequence (输入序列)</div>
          <div class="sequence">
            <!-- The vision row only exists in VLM mode (v-if removes it from the DOM). -->
            <div v-if="isVLM" class="sequence-row">
              <span class="sequence-tag vision">Vision (视觉)</span>
              <div class="tokens">
                <span class="token vision">v1</span>
                <span class="token vision">v2</span>
                <span class="token vision">v3</span>
                <span class="token vision">…</span>
              </div>
            </div>
            <div class="sequence-row">
              <span class="sequence-tag text">Text (文字)</span>
              <div class="tokens">
                <span class="token text">t1</span>
                <span class="token text">t2</span>
                <span class="token text">t3</span>
                <span class="token text">…</span>
              </div>
            </div>
            <div class="sequence-hint">
              <span v-if="isVLM">Concat: [Vision Tokens] + [Text Tokens] (拼接视觉在前文字在后)</span>
              <span v-else>Only [Text Tokens] (只有文字 Token)</span>
            </div>
          </div>
          <div class="core-stage">
            <span class="big-arrow" aria-hidden="true">→</span>
            <div class="node core-node">
              <span class="icon" aria-hidden="true">🧠</span>
              <span class="label">LLM Backbone (大模型)</span>
            </div>
            <span class="big-arrow" aria-hidden="true">→</span>
            <div class="node output-node">
              <span class="icon" aria-hidden="true">💬</span>
              <span class="label">Response (回复)</span>
            </div>
          </div>
        </div>
      </div>
    </div>

    <!-- Explanatory card; the two variants cross-fade when the mode changes. -->
    <div class="interactive-info">
      <transition name="fade" mode="out-in">
        <div class="info-card" v-if="!isVLM" key="llm">
          <h3>Standard LLM Flow (标准大模型流程)</h3>
          <p>Prompt → Embedding → Token Sequence → LLM → Response</p>
        </div>
        <div class="info-card vlm-info" v-else key="vlm">
          <h3>VLM = LLM + Vision Encoder (视觉大模型原理)</h3>
          <ul>
            <li><strong>ViT (The Eye):</strong> 把图片编码成视觉特征</li>
            <li>
              <strong>Projector (The Translator):</strong> 把视觉特征映射到 LLM
              Token 空间
            </li>
            <li>
              <strong>Concatenation (拼接):</strong> 把视觉 Token 放在文字 Token
              之前作为同一条输入序列
            </li>
          </ul>
        </div>
      </transition>
    </div>
  </div>
</template>
<script setup>
import { ref } from 'vue'

// Reactive display mode: false shows the text-only LLM view (the initial
// state), true shows the multimodal VLM view.
const isVLM = ref(false)

/**
 * Flip the demo between the LLM view and the VLM view.
 * Bound to the toggle control in the template.
 */
function toggleMode() {
  isVLM.value = !isVLM.value
}
</script>
<style scoped>
/* Outer card that wraps the whole demo. */
.model-evolution-demo {
  background: var(--vp-c-bg-soft);
  border: 1px solid var(--vp-c-divider);
  border-radius: 12px;
  padding: 24px;
  margin: 20px 0;
  font-family: 'Menlo', 'Monaco', sans-serif;
  user-select: none;
}

/* Header area: toggle on top, status sentence underneath, both centered. */
.controls-header {
  display: flex;
  flex-direction: column;
  align-items: center;
  margin-bottom: 18px;
  gap: 12px;
}

/* Pill-shaped clickable area holding the switch and its two labels. */
.toggle-container {
  display: flex;
  align-items: center;
  gap: 15px;
  cursor: pointer;
  background: var(--vp-c-bg-mute);
  padding: 8px 16px;
  border-radius: 30px;
  border: 1px solid transparent;
  transition: all 0.2s;
}

.toggle-container:hover {
  border-color: var(--vp-c-brand);
  background: var(--vp-c-bg);
}

/* Switch track; `.active` is applied while the VLM mode is on. */
.toggle-track {
  width: 50px;
  height: 28px;
  background: #ccc;
  border-radius: 14px;
  position: relative;
  transition: background 0.3s;
}

.toggle-track.active {
  background: var(--vp-c-brand);
}

/* Round thumb carrying the mode emoji; it slides via a transform. */
.toggle-thumb {
  width: 24px;
  height: 24px;
  background: #fff;
  border-radius: 50%;
  position: absolute;
  top: 2px;
  left: 2px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 14px;
  transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
}

/* 50px track - 24px thumb - 2px inset on each side = 22px of travel. */
.toggle-track.active .toggle-thumb {
  transform: translateX(22px);
}

.toggle-label {
  font-size: 14px;
  font-weight: bold;
  color: var(--vp-c-text-2);
  display: flex;
  gap: 8px;
  align-items: center;
}

/* The label of the currently selected mode uses the stronger text color. */
.toggle-label span.active {
  color: var(--vp-c-text-1);
}

.status-desc {
  font-size: 13px;
  color: var(--vp-c-text-2);
  text-align: center;
  line-height: 1.5;
  max-width: 720px;
}
/* Dashed canvas that contains every lane of the diagram. */
.diagram-stage {
  background: var(--vp-c-bg);
  border: 1px dashed var(--vp-c-divider);
  border-radius: 8px;
  padding: 18px;
}

/* Lanes are stacked vertically. */
.lanes {
  display: flex;
  flex-direction: column;
  gap: 14px;
}

.lane {
  background: var(--vp-c-bg-mute);
  border: 1px solid var(--vp-c-divider);
  border-radius: 10px;
  padding: 12px;
}

.lane-title {
  font-size: 12px;
  color: var(--vp-c-text-2);
  margin-bottom: 10px;
  font-weight: 700;
}

/* Horizontal chain of nodes and arrows; wraps on narrow widths. */
.lane-flow {
  display: flex;
  align-items: center;
  gap: 10px;
  flex-wrap: wrap;
}

/* Panel showing the merged token sequence. */
.merge-stage {
  background: var(--vp-c-bg);
  border: 1px solid var(--vp-c-divider);
  border-radius: 10px;
  padding: 12px;
}

.merge-title {
  font-size: 12px;
  color: var(--vp-c-text-2);
  margin-bottom: 10px;
  font-weight: 700;
}

.sequence {
  border: 1px solid var(--vp-c-divider);
  background: var(--vp-c-bg-soft);
  border-radius: 10px;
  padding: 10px;
}

/* One row per modality: a tag followed by its tokens. */
.sequence-row {
  display: flex;
  align-items: center;
  gap: 10px;
  margin-bottom: 8px;
  flex-wrap: wrap;
}

/* Only takes effect when a row is the final child of its container. */
.sequence-row:last-child {
  margin-bottom: 0;
}

.sequence-tag {
  font-size: 11px;
  font-weight: 800;
  padding: 2px 8px;
  border-radius: 999px;
  border: 1px solid var(--vp-c-divider);
  background: var(--vp-c-bg);
  color: var(--vp-c-text-2);
}

/* Modality color coding: yellow for vision, brand color for text. */
.sequence-tag.vision {
  border-color: var(--vp-c-yellow);
}

.sequence-tag.text {
  border-color: var(--vp-c-brand);
}

.sequence-hint {
  margin-top: 8px;
  font-size: 11px;
  color: var(--vp-c-text-2);
}
/* Row holding the LLM backbone node and the response node. */
.core-stage {
  margin-top: 14px;
  display: flex;
  align-items: center;
  justify-content: center;
  gap: 10px;
  flex-wrap: wrap;
}

/* Connector arrows: large between stages, small inside a lane. */
.big-arrow {
  font-size: 18px;
  color: var(--vp-c-text-2);
  font-weight: 800;
}

.mini-arrow {
  font-size: 14px;
  color: var(--vp-c-text-3);
  font-weight: 800;
}

/* Base box for every pipeline step: icon stacked above a label. */
.node {
  background: var(--vp-c-bg);
  border: 2px solid var(--vp-c-divider);
  border-radius: 10px;
  padding: 8px 12px;
  display: flex;
  flex-direction: column;
  align-items: center;
  min-width: 110px;
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
}

.icon {
  font-size: 20px;
  margin-bottom: 4px;
}

.label {
  font-size: 11px;
  font-weight: 800;
  text-align: center;
  line-height: 1.2;
}

/* Node variants: each one only overrides border and/or background. */
.input-node {
  border-color: #aaa;
}

.process-node {
  border-color: var(--vp-c-brand-dimm);
}

.core-node {
  border-color: var(--vp-c-brand);
  background: var(--vp-c-brand-dimm);
  min-width: 140px;
}

.output-node {
  border-color: var(--vp-c-brand);
}

/* Vision-side nodes share the yellow accent. */
.vit-node {
  border-color: var(--vp-c-yellow);
  background: rgba(255, 197, 23, 0.05);
}

.adapter-node {
  border-color: var(--vp-c-yellow);
  background: var(--vp-c-yellow-dimm);
}
/* Titled container listing the tokens produced by a lane. */
.token-box {
  background: var(--vp-c-bg);
  border: 1px solid var(--vp-c-divider);
  border-radius: 10px;
  padding: 10px;
  min-width: 220px;
}

.token-box-vision {
  border-color: var(--vp-c-yellow);
}

.token-box-title {
  font-size: 11px;
  font-weight: 800;
  color: var(--vp-c-text-2);
  margin-bottom: 8px;
}

/* Wrapping row of token pills. */
.tokens {
  display: flex;
  gap: 6px;
  flex-wrap: wrap;
}

/* A single token pill. */
.token {
  font-size: 11px;
  padding: 2px 8px;
  border-radius: 999px;
  border: 1px solid var(--vp-c-divider);
  background: var(--vp-c-bg-soft);
  color: var(--vp-c-text-1);
}

/* Modality tint: yellow for vision tokens, blue for text tokens. */
.token.vision {
  border-color: var(--vp-c-yellow);
  background: rgba(255, 197, 23, 0.12);
}

.token.text {
  border-color: var(--vp-c-brand);
  background: rgba(59, 130, 246, 0.12);
}
/* Explanation card shown below the diagram. */
.interactive-info {
  margin-top: 16px;
}

.info-card {
  background: var(--vp-c-bg-mute);
  padding: 16px;
  border-radius: 8px;
}

.info-card h3 {
  margin-top: 0;
  margin-bottom: 10px;
  font-size: 15px;
  color: var(--vp-c-text-1);
}

.info-card p,
.info-card li {
  font-size: 13px;
  color: var(--vp-c-text-2);
  line-height: 1.6;
}

.info-card ul {
  padding-left: 20px;
  margin: 0;
}

/* Classes for the <transition name="fade"> opacity cross-fade. */
.fade-enter-active,
.fade-leave-active {
  transition: opacity 0.3s ease;
}

.fade-enter-from,
.fade-leave-to {
  opacity: 0;
}
/* Narrow viewports: tighten the canvas padding and shrink minimum widths. */
@media (max-width: 720px) {
  .diagram-stage {
    padding: 14px;
  }

  .node {
    min-width: 100px;
  }

  .token-box {
    min-width: 200px;
  }
}
</style>