feat: add AI and Backend evolution history with interactive demos, and refine Frontend evolution demo

This commit is contained in:
sanbuphy
2026-01-18 10:24:35 +08:00
parent 82be39a9ac
commit 26ed39e1eb
44 changed files with 9868 additions and 2633 deletions
@@ -1,57 +1,110 @@
<template>
<div class="attn-demo">
<div class="controls">
<span class="hint">🖱 把鼠标悬停在方块上查看它的注意力分配</span>
<div class="header">
<div class="title">Self-Attention Mechanism</div>
<div class="subtitle">自注意力机制全局信息交互</div>
</div>
<div class="visual-area">
<div class="image-grid" @mouseleave="hoverIndex = -1">
<div class="visual-stage">
<!-- Grid Layout -->
<div class="grid-container" @mouseleave="hoverIndex = -1">
<!-- SVG Layer for Connection Lines -->
<svg class="connections-layer">
<defs>
<marker id="arrowhead" markerWidth="6" markerHeight="4" refX="18" refY="2" orient="auto">
<polygon points="0 0, 6 2, 0 4" fill="var(--vp-c-brand)" opacity="0.6"/>
</marker>
</defs>
<!-- Draw lines from hoverIndex to ALL other nodes -->
<g v-if="hoverIndex !== -1">
<line
v-for="(target, tIndex) in items"
:key="tIndex"
v-show="tIndex !== hoverIndex"
:x1="getCenter(hoverIndex).x"
:y1="getCenter(hoverIndex).y"
:x2="getCenter(tIndex).x"
:y2="getCenter(tIndex).y"
:stroke="getLineColor(hoverIndex, tIndex)"
:stroke-width="getLineWidth(hoverIndex, tIndex)"
stroke-linecap="round"
:opacity="getLineOpacity(hoverIndex, tIndex)"
/>
</g>
</svg>
<!-- Cells -->
<div
v-for="(item, index) in items"
:key="index"
class="grid-cell"
:class="{ active: hoverIndex === index }"
:class="{
'is-source': hoverIndex === index,
'is-target': hoverIndex !== -1 && hoverIndex !== index,
'is-strong-attn': hoverIndex !== -1 && getAttentionScore(hoverIndex, index) > 0.5
}"
@mouseenter="hoverIndex = index"
:style="{
left: getCenter(index).x - 30 + 'px',
top: getCenter(index).y - 30 + 'px'
}"
>
{{ item.icon }}
<div class="cell-label">{{ item.label }}</div>
<div class="cell-content">
<span class="cell-icon">{{ item.icon }}</span>
<span class="cell-label">{{ item.label }}</span>
</div>
<!-- Attention Score Badge -->
<div
class="attn-badge"
v-if="hoverIndex !== -1 && hoverIndex !== index"
:style="{ opacity: Math.max(0.3, getAttentionScore(hoverIndex, index)) }"
>
{{ (getAttentionScore(hoverIndex, index) * 100).toFixed(0) }}%
</div>
</div>
<!-- SVG Overlay for lines -->
<svg class="connections" v-if="hoverIndex !== -1">
<line
v-for="(target, tIndex) in items"
:key="tIndex"
v-if="tIndex !== hoverIndex"
:x1="getCenter(hoverIndex).x"
:y1="getCenter(hoverIndex).y"
:x2="getCenter(tIndex).x"
:y2="getCenter(tIndex).y"
:stroke="getAttentionColor(hoverIndex, tIndex)"
:stroke-width="getAttentionWidth(hoverIndex, tIndex)"
stroke-linecap="round"
/>
</svg>
</div>
<div class="info-panel" :class="{ visible: hoverIndex !== -1 }">
<div class="info-title">Patch: {{ items[hoverIndex]?.label }}</div>
<div class="info-desc">正在关注</div>
<ul class="attn-list" v-if="hoverIndex !== -1">
<li
v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)"
:key="targetIdx"
>
<span class="target-icon">{{ items[targetIdx].icon }}</span>
<span class="target-name">{{ items[targetIdx].label }}</span>
<div class="bar-bg">
<div
class="bar-fill"
:style="{ width: weight * 100 + '%' }"
></div>
<!-- Info Panel -->
<div class="info-panel">
<div v-if="hoverIndex === -1" class="placeholder-text">
<span class="cursor-icon">👆</span>
把鼠标悬停在任意方块上<br>观察它在"关注"
</div>
<div v-else class="active-info">
<div class="source-info">
<span class="label">当前 Patch:</span>
<div class="patch-tag">
{{ items[hoverIndex].icon }} {{ items[hoverIndex].label }}
</div>
</li>
</ul>
</div>
<div class="attn-list">
<div class="list-header">Attention Weights (注意力权重)</div>
<div
class="attn-item"
v-for="(score, idx) in getTopAttentions(hoverIndex)"
:key="idx"
>
<div class="item-left">
<span class="item-icon">{{ items[idx].icon }}</span>
<span class="item-name">{{ items[idx].label }}</span>
</div>
<div class="item-right">
<div class="progress-bar">
<div class="progress-fill" :style="{ width: score * 100 + '%' }"></div>
</div>
<span class="score-text">{{ (score * 100).toFixed(0) }}%</span>
</div>
</div>
</div>
<div class="insight-box">
<span class="bulb">💡</span>
<span class="insight-text">
{{ getInsightText(hoverIndex) }}
</span>
</div>
</div>
</div>
</div>
</div>
@@ -62,207 +115,309 @@ import { ref } from 'vue'
const hoverIndex = ref(-1)
// 3x3 Grid Data (Cat in grass)
const items = [
{ icon: '🌲', label: '背景' },
{ icon: '🌲', label: '背景' },
{ icon: '☁️', label: '天空' },
{ icon: '👂', label: '猫耳' },
{ icon: '😼', label: '猫' },
{ icon: '🌲', label: '背景' },
{ icon: '🐾', label: '猫爪' },
{ icon: '🧶', label: '毛线' },
{ icon: '🌱', label: '草地' }
{ icon: '🌿', label: '草地' }, // 0
{ icon: '🌿', label: '草地' }, // 1
{ icon: '🦋', label: '蝴蝶' }, // 2
{ icon: '🌿', label: '草地' }, // 3
{ icon: '🐱', label: '猫' }, // 4
{ icon: '🌿', label: '草地' }, // 5
{ icon: '🧶', label: '毛球' }, // 6
{ icon: '🐾', label: '猫爪' }, // 7
{ icon: '🌿', label: '草地' } // 8
]
// 3x3 Grid
// Layout Logic
// Maps a flat cell index to the pixel centre of that cell. The index is split
// into a row and a column using a fixed width of 3 columns.
// NOTE(review): this span carries both the removed and the added lines of the
// diff (two `gap` declarations, two x/y pairs), so it is not runnable as shown.
// The later lines (gap = 100, offsetX/offsetY = 50) appear to be the current
// version — confirm against the real file.
const getCenter = (index) => {
const row = Math.floor(index / 3)
const col = index % 3
// Assuming 80px cell + 10px gap
const cellSize = 80
const gap = 10
const offset = cellSize / 2
const gap = 100
const offsetX = 50
const offsetY = 50
return {
x: col * (cellSize + gap) + offset,
y: row * (cellSize + gap) + offset
x: col * gap + offsetX,
y: row * gap + offsetY
}
}
// Mock attention weights
const getAttentionWeight = (source, target) => {
// Self attention is ignored for visualization clarity usually, but let's say:
// Attention Logic
// Returns a hand-written (mock) attention score for the pair (source, target).
// Both arguments are cell indices; a cell paired with itself always scores 0.
// NOTE(review): this span interleaves lines from two versions of the function.
// The `catParts` / `bgParts` branches appear to come from the earlier
// getAttentionWeight variant, and the per-source `if (source === N)` blocks and
// `bgIndices` from the newer one — confirm against the real file before relying
// on the exact scores produced by the text as shown.
const getAttentionScore = (source, target) => {
if (source === target) return 0
// Cat Head (4) attends strongly to:
if (source === 4) {
if (target === 7) return 0.95 // Paws (Body parts connected)
if (target === 2) return 0.8 // Butterfly (Interest)
if (target === 6) return 0.6 // Yarn (Toy)
return 0.1 // Background
}
// Cat parts (3, 4, 6) attend strongly to each other
const catParts = [3, 4, 6]
const isSourceCat = catParts.includes(source)
const isTargetCat = catParts.includes(target)
// Cat Paws (7) attends strongly to:
if (source === 7) {
if (target === 4) return 0.95 // Head
if (target === 6) return 0.9 // Yarn (Touching)
return 0.1
}
if (isSourceCat && isTargetCat) return 0.9 // Strong connection between cat parts
// Butterfly (2)
if (source === 2) {
if (target === 4) return 0.7 // Danger?
return 0.2
}
// Cat interacts with Yarn (7)
if (isSourceCat && target === 7) return 0.7
if (source === 7 && isTargetCat) return 0.7
// Grass (Background)
// Background patches attend to each other for texture consistency
const bgIndices = [0, 1, 3, 5, 8]
if (bgIndices.includes(source)) {
if (bgIndices.includes(target)) return 0.6
return 0.05
}
// Background parts attend to each other
const bgParts = [0, 1, 2, 5, 8]
if (bgParts.includes(source) && bgParts.includes(target)) return 0.5
return 0.1 // Weak attention otherwise
// Default fallback
return 0.1
}
const getAttentionColor = (source, target) => {
const weight = getAttentionWeight(source, target)
// Green for strong, gray for weak
if (weight > 0.6) return `rgba(16, 185, 129, ${weight})`
return `rgba(156, 163, 175, ${weight * 0.5})`
// Stroke colour of the connection line between two cells: the brand colour
// when their attention score is above 0.5, the muted text colour otherwise.
const getLineColor = (source, target) => {
  if (getAttentionScore(source, target) > 0.5) {
    return 'var(--vp-c-brand)'
  }
  return 'var(--vp-c-text-3)'
}
const getAttentionWidth = (source, target) => {
const weight = getAttentionWeight(source, target)
return weight * 5
// Stroke width of the connection line: a base of 1 plus four times the
// attention score of the (source, target) pair.
const getLineWidth = (source, target) => {
  const scaled = getAttentionScore(source, target) * 4
  return 1 + scaled
}
// Opacity of the connection line: starts at 0.2 and adds 0.8 times the
// attention score of the (source, target) pair.
const getLineOpacity = (source, target) => {
  const weighted = getAttentionScore(source, target) * 0.8
  return 0.2 + weighted
}
// Collects the attention value of every cell other than `source`, keyed by the
// cell's index in `items`.
// NOTE(review): this span interleaves two versions of the function. The
// `weights` lines and the early `return weights` appear to be the removed
// version (return everything), and the `scores` lines plus the sort/slice the
// added version (return only the three highest) — confirm against the real file.
const getTopAttentions = (source) => {
const weights = {}
const scores = {}
items.forEach((_, idx) => {
// The hovered cell itself is left out of the result.
if (idx !== source) {
weights[idx] = getAttentionWeight(source, idx)
scores[idx] = getAttentionScore(source, idx)
}
})
// Sort by weight desc
return weights
// Sort descending
const sortedKeys = Object.keys(scores).sort((a, b) => scores[b] - scores[a])
// NOTE(review): the keys copied into `top3` are integer-like strings ("0".."8").
// Plain objects enumerate such keys in ascending numeric order regardless of
// insertion order, so a consumer iterating `top3` gets the three best cells in
// index order, not in descending score order. Confirm whether the list is
// meant to be shown ranked; if so an array of [index, score] pairs would be
// needed, together with a matching template change.
const top3 = {}
sortedKeys.slice(0, 3).forEach(key => {
top3[key] = scores[key]
})
return top3
}
// Picks the one-line explanation shown in the insight box for the hovered
// cell index. Indices 0, 1, 3, 5 and 8 share one message; any index without a
// dedicated message gets the generic Self-Attention sentence.
const getInsightText = (idx) => {
  switch (idx) {
    case 4:
      return "猫头最关注猫爪(组成身体)和蝴蝶(捕猎目标)。"
    case 7:
      return "猫爪最关注毛球(正在玩耍)和猫头。"
    case 2:
      return "蝴蝶关注到了猫,可能是因为它是个威胁。"
    case 0:
    case 1:
    case 3:
    case 5:
    case 8:
      return "草地主要关注周围的草地,确认背景纹理。"
    case 6:
      return "毛球和猫爪有很强的互动关系。"
    default:
      return "Self-Attention 让每个部分找到它的上下文关联。"
  }
}
</script>
<style scoped>
.attn-demo {
padding: 20px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 20px 0;
user-select: none;
font-family: 'Menlo', 'Monaco', sans-serif;
}
.controls {
.header {
text-align: center;
margin-bottom: 20px;
margin-bottom: 30px;
}
.hint {
font-size: 0.9em;
color: var(--vp-c-text-2);
background: var(--vp-c-bg);
padding: 4px 12px;
border-radius: 12px;
border: 1px solid var(--vp-c-divider);
.title {
font-size: 16px;
font-weight: bold;
color: var(--vp-c-text-1);
}
.visual-area {
display: flex;
justify-content: center;
gap: 40px;
flex-wrap: wrap;
}
.image-grid {
display: grid;
grid-template-columns: repeat(3, 80px);
gap: 10px;
position: relative;
}
.grid-cell {
width: 80px;
height: 80px;
background: var(--vp-c-bg);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
cursor: pointer;
transition: all 0.2s;
z-index: 2;
position: relative;
}
.grid-cell:hover,
.grid-cell.active {
border-color: var(--vp-c-brand);
transform: scale(1.05);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
background: var(--vp-c-bg-mute);
}
.cell-label {
font-size: 0.8em;
.subtitle {
font-size: 12px;
color: var(--vp-c-text-2);
margin-top: 4px;
}
.connections {
.visual-stage {
display: flex;
gap: 40px;
justify-content: center;
align-items: flex-start;
flex-wrap: wrap;
}
/* Grid Area */
.grid-container {
width: 300px;
height: 300px;
position: relative;
/* background: rgba(0,0,0,0.02); */
border-radius: 12px;
}
.connections-layer {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
pointer-events: none;
z-index: 1;
pointer-events: none;
}
.grid-cell {
position: absolute;
width: 60px;
height: 60px;
background: var(--vp-c-bg);
border: 2px solid var(--vp-c-divider);
border-radius: 12px;
display: flex;
align-items: center;
justify-content: center;
cursor: pointer;
z-index: 2;
transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
box-shadow: 0 4px 6px rgba(0,0,0,0.05);
}
.cell-content {
display: flex;
flex-direction: column;
align-items: center;
}
.cell-icon {
font-size: 24px;
line-height: 1.2;
}
.cell-label {
font-size: 10px;
color: var(--vp-c-text-2);
font-weight: bold;
}
/* Interaction States */
.grid-cell:hover, .grid-cell.is-source {
z-index: 10;
border-color: var(--vp-c-brand);
background: var(--vp-c-bg);
transform: scale(1.15);
box-shadow: 0 8px 20px rgba(0,0,0,0.15);
}
.grid-cell.is-strong-attn {
border-color: var(--vp-c-brand-light);
background: var(--vp-c-brand-dimm);
}
.attn-badge {
position: absolute;
top: -8px;
right: -8px;
background: var(--vp-c-brand);
color: white;
font-size: 9px;
padding: 2px 6px;
border-radius: 10px;
font-weight: bold;
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
}
/* Info Panel */
.info-panel {
width: 200px;
width: 280px;
min-height: 260px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
padding: 15px;
opacity: 0;
transition: opacity 0.2s;
pointer-events: none;
border-radius: 12px;
padding: 20px;
display: flex;
flex-direction: column;
justify-content: center;
}
.info-panel.visible {
opacity: 1;
pointer-events: auto;
.placeholder-text {
text-align: center;
color: var(--vp-c-text-3);
font-size: 13px;
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
}
.info-title {
font-weight: bold;
margin-bottom: 5px;
color: var(--vp-c-brand);
.cursor-icon {
font-size: 32px;
animation: bounce 2s infinite;
}
.info-desc {
font-size: 0.85em;
.source-info {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 20px;
padding-bottom: 15px;
border-bottom: 1px dashed var(--vp-c-divider);
}
.label {
font-size: 12px;
color: var(--vp-c-text-2);
}
.patch-tag {
background: var(--vp-c-brand-dimm);
color: var(--vp-c-brand-dark);
padding: 4px 12px;
border-radius: 6px;
font-size: 13px;
font-weight: bold;
}
.list-header {
font-size: 11px;
color: var(--vp-c-text-3);
text-transform: uppercase;
margin-bottom: 10px;
letter-spacing: 0.5px;
}
.attn-list {
list-style: none;
padding: 0;
margin: 0;
.attn-item {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 12px;
}
.attn-list li {
.item-left {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 6px;
font-size: 0.85em;
width: 80px;
}
.target-icon {
width: 20px;
text-align: center;
.item-icon { font-size: 16px; }
.item-name { font-size: 12px; font-weight: 500; }
.item-right {
flex: 1;
display: flex;
align-items: center;
gap: 10px;
}
.target-name {
width: 40px;
}
.bar-bg {
.progress-bar {
flex: 1;
height: 6px;
background: var(--vp-c-bg-soft);
@@ -270,9 +425,50 @@ const getTopAttentions = (source) => {
overflow: hidden;
}
.bar-fill {
.progress-fill {
height: 100%;
background: var(--vp-c-brand);
border-radius: 3px;
}
.score-text {
font-size: 11px;
color: var(--vp-c-text-2);
width: 30px;
text-align: right;
font-family: monospace;
}
.insight-box {
margin-top: 15px;
background: var(--vp-c-yellow-dimm);
padding: 10px;
border-radius: 8px;
display: flex;
gap: 8px;
align-items: flex-start;
}
.bulb { font-size: 16px; }
.insight-text {
font-size: 12px;
color: var(--vp-c-text-1);
line-height: 1.4;
}
@keyframes bounce {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(-5px); }
}
@media (max-width: 768px) {
.visual-stage {
flex-direction: column;
align-items: center;
}
.info-panel {
width: 100%;
min-height: auto;
}
}
</style>
@@ -3,16 +3,16 @@
<div class="demo-container">
<!-- Step 1: Patch -->
<div class="step-box">
<div class="label">1. Patch (4x4)</div>
<div class="label">1. Patch (16×16×3) (示意 / Toy)</div>
<div class="grid-patch">
<div
v-for="n in 16"
v-for="n in patchCellCount"
:key="n"
class="pixel"
:style="{ backgroundColor: getPixelColor(n) }"
></div>
</div>
<div class="desc">768 像素点</div>
<div class="desc">16×16 像素 × 3 通道 = 768 标量值</div>
</div>
<div class="arrow"></div>
@@ -22,13 +22,14 @@
<div class="label">2. Flatten</div>
<div class="vector-container">
<div
v-for="n in 16"
v-for="n in flattenSampleCount"
:key="n"
class="vector-cell"
:style="{ backgroundColor: getPixelColor(n) }"
></div>
<div class="vector-ellipsis"></div>
</div>
<div class="desc">拉平成向量</div>
<div class="desc">得到 1×768 向量 (Vector)</div>
</div>
<div class="arrow">× W</div>
@@ -39,13 +40,16 @@
<div class="embedding-container">
<div v-for="n in 8" :key="n" class="embed-cell"></div>
</div>
<div class="desc">压缩特征 (D=8)</div>
<div class="desc">映射到 D (示意 D=8常见 D=768)</div>
</div>
</div>
</div>
</template>
<script setup>
// Number of cells drawn in the patch grid (16 × 16 = 256); used as the v-for
// count for the `.pixel` elements in the template.
const patchCellCount = 16 * 16
// Number of cells drawn in the flattened-vector column; the template follows
// them with a `.vector-ellipsis` element.
const flattenSampleCount = 32
const getPixelColor = (n) => {
// Generate a gradient of colors
const hue = (n * 20) % 360
@@ -89,8 +93,8 @@ const getPixelColor = (n) => {
.grid-patch {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 2px;
grid-template-columns: repeat(16, 1fr);
gap: 1px;
width: 80px;
height: 80px;
}
@@ -105,7 +109,7 @@ const getPixelColor = (n) => {
display: flex;
flex-direction: column;
gap: 1px;
height: 120px;
height: 140px;
width: 20px;
justify-content: center;
}
@@ -115,6 +119,14 @@ const getPixelColor = (n) => {
flex: 1;
}
.vector-ellipsis {
font-size: 12px;
line-height: 1;
color: var(--vp-c-text-3);
text-align: center;
padding-top: 4px;
}
.embedding-container {
display: flex;
flex-direction: column;
@@ -8,113 +8,135 @@
</div>
</div>
<div class="toggle-label">
<span :class="{ active: !isVLM }">Pure LLM</span>
<span :class="{ active: !isVLM }">Pure LLM (纯文本)</span>
<span class="arrow"></span>
<span :class="{ active: isVLM }">Multimodal VLM</span>
<span :class="{ active: isVLM }">Multimodal VLM (多模态)</span>
</div>
</div>
<div class="status-desc">
{{
isVLM
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
? 'Tokens from vision are translated and placed before text tokens. (视觉信息被翻译成 Token,放在文字 Token 之前。)'
: 'Text-only tokens flow into the LLM. (只有文字 Token 流入大模型。)'
}}
</div>
</div>
<div class="diagram-stage" :class="{ 'vlm-mode': isVLM }">
<!-- Vision Pipeline (Only visible in VLM mode) -->
<div class="pipeline vision-pipeline">
<div class="node-group">
<div class="node input-node image-node">
<span class="icon"></span>
<span class="label">Image</span>
</div>
<div class="flow-arrow"></div>
<div
class="node process-node vit-node"
title="Vision Transformer: The Eye"
>
<span class="icon"></span>
<span class="label">ViT</span>
</div>
<div class="flow-arrow"></div>
<div
class="node adapter-node projector-node"
title="Projector: The Translator"
>
<span class="icon">🔌</span>
<span class="label">Projector</span>
</div>
<div class="flow-arrow connector-arrow"></div>
</div>
</div>
<!-- Text Pipeline (Always visible) -->
<div class="pipeline text-pipeline">
<div class="node-group horizontal">
<div class="node input-node text-node">
<span class="icon"></span>
<span class="label">Prompt</span>
</div>
<div class="flow-arrow"></div>
<div class="node process-node embed-node">
<span class="icon"></span>
<span class="label">Embed</span>
</div>
<!-- Merge Point Visualization -->
<div class="merge-point" :class="{ active: isVLM }">
<div class="plus-icon">+</div>
<div class="merge-label">Concat</div>
</div>
<div class="flow-arrow"></div>
<div class="node core-node llm-node">
<span class="icon">🧠</span>
<span class="label">LLM Backbone</span>
<div class="inner-flow">
<span class="dot t1"></span>
<span class="dot t2"></span>
<span class="dot v1" v-if="isVLM"></span>
<div class="diagram-stage">
<div class="lanes">
<div class="lane lane-vision" v-show="isVLM">
<div class="lane-title">Vision Path (视觉路径)</div>
<div class="lane-flow">
<div class="node input-node">
<span class="icon">🖼</span>
<span class="label">Image (图片)</span>
</div>
<span class="mini-arrow"></span>
<div class="node process-node vit-node">
<span class="icon">👁</span>
<span class="label">ViT (视觉模型)</span>
</div>
<span class="mini-arrow"></span>
<div class="node adapter-node">
<span class="icon">🔌</span>
<span class="label">Projector (投影器)</span>
</div>
<span class="mini-arrow"></span>
<div class="token-box token-box-vision">
<div class="token-box-title">Vision Tokens (视觉 Token)</div>
<div class="tokens">
<span class="token vision">v1</span>
<span class="token vision">v2</span>
<span class="token vision">v3</span>
<span class="token vision"></span>
</div>
</div>
</div>
<div class="flow-arrow"></div>
<div class="node output-node">
<span class="icon">💬</span>
<span class="label">Response</span>
</div>
<div class="lane lane-text">
<div class="lane-title">Text Path (文字路径)</div>
<div class="lane-flow">
<div class="node input-node">
<span class="icon"></span>
<span class="label">Prompt (提示词)</span>
</div>
<span class="mini-arrow"></span>
<div class="node process-node">
<span class="icon">🔤</span>
<span class="label">Embed (向量化)</span>
</div>
<span class="mini-arrow"></span>
<div class="token-box">
<div class="token-box-title">Text Tokens (文字 Token)</div>
<div class="tokens">
<span class="token text">t1</span>
<span class="token text">t2</span>
<span class="token text">t3</span>
<span class="token text"></span>
</div>
</div>
</div>
</div>
<div class="merge-stage">
<div class="merge-title">Token Sequence (输入序列)</div>
<div class="sequence">
<div v-if="isVLM" class="sequence-row">
<span class="sequence-tag vision">Vision (视觉)</span>
<div class="tokens">
<span class="token vision">v1</span>
<span class="token vision">v2</span>
<span class="token vision">v3</span>
<span class="token vision"></span>
</div>
</div>
<div class="sequence-row">
<span class="sequence-tag text">Text (文字)</span>
<div class="tokens">
<span class="token text">t1</span>
<span class="token text">t2</span>
<span class="token text">t3</span>
<span class="token text"></span>
</div>
</div>
<div class="sequence-hint">
<span v-if="isVLM">Concat: [Vision Tokens] + [Text Tokens] (拼接视觉在前文字在后)</span>
<span v-else>Only [Text Tokens] (只有文字 Token)</span>
</div>
</div>
<div class="core-stage">
<span class="big-arrow"></span>
<div class="node core-node">
<span class="icon">🧠</span>
<span class="label">LLM Backbone (大模型)</span>
</div>
<span class="big-arrow"></span>
<div class="node output-node">
<span class="icon">💬</span>
<span class="label">Response (回复)</span>
</div>
</div>
</div>
</div>
</div>
<div class="interactive-info">
<div class="info-card" v-if="!isVLM">
<h3>Standard LLM Flow</h3>
<p>
Text is converted into vectors (Embeddings) and processed by the
Transformer to predict the next word.
</p>
</div>
<div class="info-card vlm-info" v-else>
<h3>VLM = LLM + Vision Encoder</h3>
<ul>
<li>
<strong>ViT (The Eye):</strong> Slices image into patches and
extracts features.
</li>
<li>
<strong>Projector (The Translator):</strong> Converts visual
features into the same "language" (vector dimension) as text
embeddings.
</li>
<li>
<strong>Concatenation:</strong> The translated visual tokens are
pasted <em>before</em> the text tokens. The LLM sees them as
"foreign words" it learned to understand.
</li>
</ul>
</div>
<transition name="fade" mode="out-in">
<div class="info-card" v-if="!isVLM" key="llm">
<h3>Standard LLM Flow (标准大模型流程)</h3>
<p>Prompt Embedding Token Sequence LLM Response</p>
</div>
<div class="info-card vlm-info" v-else key="vlm">
<h3>VLM = LLM + Vision Encoder (视觉大模型原理)</h3>
<ul>
<li><strong>ViT (The Eye):</strong> 把图片编码成视觉特征</li>
<li><strong>Projector (The Translator):</strong> 把视觉特征映射到 LLM Token 空间</li>
<li><strong>Concatenation (拼接):</strong> 把视觉 Token 放在文字 Token 之前作为同一条输入序列</li>
</ul>
</div>
</transition>
</div>
</div>
</template>
@@ -140,12 +162,11 @@ const toggleMode = () => {
user-select: none;
}
/* Controls */
.controls-header {
display: flex;
flex-direction: column;
align-items: center;
margin-bottom: 30px;
margin-bottom: 18px;
gap: 12px;
}
@@ -216,105 +237,160 @@ const toggleMode = () => {
font-size: 13px;
color: var(--vp-c-text-2);
text-align: center;
height: 20px;
line-height: 1.5;
max-width: 720px;
}
/* Diagram Stage */
.diagram-stage {
position: relative;
height: 240px;
background: var(--vp-c-bg);
border: 1px dashed var(--vp-c-divider);
border-radius: 8px;
overflow: hidden;
padding: 18px;
}
.lanes {
display: flex;
justify-content: center;
align-items: center;
}
/* Pipelines */
.pipeline {
transition: all 0.5s cubic-bezier(0.34, 1.56, 0.64, 1);
}
.text-pipeline {
position: absolute;
bottom: 80px; /* Centered vertically in LLM mode */
left: 50%;
transform: translateX(-50%);
width: 100%;
display: flex;
justify-content: center;
}
.vlm-mode .text-pipeline {
bottom: 40px; /* Move down in VLM mode */
}
.vision-pipeline {
position: absolute;
top: 20px;
left: 20%; /* Align with input side */
opacity: 0;
transform: translateY(-20px);
pointer-events: none;
}
.vlm-mode .vision-pipeline {
opacity: 1;
transform: translateY(0);
pointer-events: auto;
}
.node-group {
display: flex;
align-items: center;
gap: 6px;
}
.node-group.horizontal {
flex-direction: row;
}
.vision-pipeline .node-group {
flex-direction: column;
gap: 14px;
}
.lane {
background: var(--vp-c-bg-mute);
border: 1px solid var(--vp-c-divider);
border-radius: 10px;
padding: 12px;
}
.lane-title {
font-size: 12px;
color: var(--vp-c-text-2);
margin-bottom: 10px;
font-weight: 700;
}
.lane-flow {
display: flex;
align-items: center;
gap: 10px;
flex-wrap: wrap;
}
.merge-stage {
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 10px;
padding: 12px;
}
.merge-title {
font-size: 12px;
color: var(--vp-c-text-2);
margin-bottom: 10px;
font-weight: 700;
}
.sequence {
border: 1px solid var(--vp-c-divider);
background: var(--vp-c-bg-soft);
border-radius: 10px;
padding: 10px;
}
.sequence-row {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 8px;
flex-wrap: wrap;
}
.sequence-row:last-child {
margin-bottom: 0;
}
.sequence-tag {
font-size: 11px;
font-weight: 800;
padding: 2px 8px;
border-radius: 999px;
border: 1px solid var(--vp-c-divider);
background: var(--vp-c-bg);
color: var(--vp-c-text-2);
}
.sequence-tag.vision {
border-color: var(--vp-c-yellow);
}
.sequence-tag.text {
border-color: var(--vp-c-brand);
}
.sequence-hint {
margin-top: 8px;
font-size: 11px;
color: var(--vp-c-text-2);
}
.core-stage {
margin-top: 14px;
display: flex;
align-items: center;
justify-content: center;
gap: 10px;
flex-wrap: wrap;
}
.big-arrow {
font-size: 18px;
color: var(--vp-c-text-2);
font-weight: 800;
}
.mini-arrow {
font-size: 14px;
color: var(--vp-c-text-3);
font-weight: 800;
}
/* Nodes */
.node {
background: var(--vp-c-bg);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
border-radius: 10px;
padding: 8px 12px;
display: flex;
flex-direction: column;
align-items: center;
min-width: 70px;
min-width: 110px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
position: relative;
z-index: 2;
}
.icon {
font-size: 20px;
margin-bottom: 4px;
}
.label {
font-size: 11px;
font-weight: bold;
font-weight: 800;
text-align: center;
line-height: 1.2;
}
.input-node {
border-color: #aaa;
}
.process-node {
border-color: var(--vp-c-brand-dimm);
}
.core-node {
border-color: var(--vp-c-brand);
background: var(--vp-c-brand-dimm);
min-width: 100px;
min-width: 140px;
}
.output-node {
border-color: var(--vp-c-brand);
}
@@ -323,101 +399,64 @@ const toggleMode = () => {
border-color: var(--vp-c-yellow);
background: rgba(255, 197, 23, 0.05);
}
.projector-node {
.adapter-node {
border-color: var(--vp-c-yellow);
background: var(--vp-c-yellow-dimm);
}
/* Arrows */
.flow-arrow {
color: var(--vp-c-text-3);
font-size: 16px;
.token-box {
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 10px;
padding: 10px;
min-width: 220px;
}
.connector-arrow {
font-size: 24px;
color: var(--vp-c-yellow);
margin-top: -10px;
margin-bottom: -10px;
transform: rotate(-45deg) translateX(10px);
.token-box-vision {
border-color: var(--vp-c-yellow);
}
/* Merge Point */
.merge-point {
width: 0;
overflow: hidden;
transition: all 0.5s;
display: flex;
flex-direction: column;
align-items: center;
opacity: 0;
}
.merge-point.active {
width: 40px;
opacity: 1;
}
.plus-icon {
font-weight: bold;
.token-box-title {
font-size: 11px;
font-weight: 800;
color: var(--vp-c-text-2);
font-size: 18px;
margin-bottom: 8px;
}
.merge-label {
font-size: 9px;
color: var(--vp-c-text-3);
}
/* Inner Flow Animation inside LLM */
.inner-flow {
.tokens {
display: flex;
gap: 4px;
margin-top: 4px;
height: 6px;
gap: 6px;
flex-wrap: wrap;
}
.dot {
width: 6px;
height: 6px;
border-radius: 50%;
background: #fff;
opacity: 0.6;
animation: pulse 1s infinite alternate;
.token {
font-size: 11px;
padding: 2px 8px;
border-radius: 999px;
border: 1px solid var(--vp-c-divider);
background: var(--vp-c-bg-soft);
color: var(--vp-c-text-1);
}
.t1 {
animation-delay: 0s;
}
.t2 {
animation-delay: 0.2s;
}
.v1 {
background: var(--vp-c-yellow);
animation-delay: 0.4s;
.token.vision {
border-color: var(--vp-c-yellow);
background: rgba(255, 197, 23, 0.12);
}
@keyframes pulse {
from {
opacity: 0.3;
transform: scale(0.8);
}
to {
opacity: 1;
transform: scale(1.1);
}
.token.text {
border-color: var(--vp-c-brand);
background: rgba(59, 130, 246, 0.12);
}
/* Interactive Info */
.interactive-info {
margin-top: 20px;
margin-top: 16px;
}
.info-card {
background: var(--vp-c-bg-mute);
padding: 16px;
border-radius: 8px;
animation: fadeIn 0.3s;
}
.info-card h3 {
@@ -439,31 +478,25 @@ const toggleMode = () => {
margin: 0;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(5px);
}
to {
opacity: 1;
transform: translateY(0);
}
.fade-enter-active,
.fade-leave-active {
transition: opacity 0.3s ease;
}
/* Mobile Adjustments */
@media (max-width: 600px) {
.fade-enter-from,
.fade-leave-to {
opacity: 0;
}
@media (max-width: 720px) {
.diagram-stage {
height: 300px;
padding: 14px;
}
.text-pipeline {
flex-wrap: wrap;
gap: 10px;
width: 90%;
.node {
min-width: 100px;
}
.vision-pipeline {
left: 10%;
.token-box {
min-width: 200px;
}
}
</style>
@@ -6,67 +6,137 @@
<div class="patchify-demo">
<div class="control-panel">
<div class="controls">
<button class="action-btn" @click="toggleState">
{{ isPatchified ? '还原图片 (Restore)' : '切分图片 (Patchify)' }}
<button
class="action-btn"
@click="prevStep"
:disabled="currentStep === 0"
>
上一步 (Prev)
</button>
<div class="info">
<span>Resolution: 224x224</span>
<span>Patch Size: 16x16</span>
<span>Total Patches: {{ 14 * 14 }}</span>
</div>
<span class="step-indicator">Step {{ currentStep + 1 }} / 4</span>
<button
class="action-btn primary"
@click="nextStep"
:disabled="currentStep === 3"
>
{{ currentStep === 3 ? '完成 (Done)' : '下一步 (Next) ➡' }}
</button>
</div>
<div class="step-desc">
{{ stepDescriptions[currentStep] }}
</div>
</div>
<div class="visual-area">
<!-- 原始/切分视图容器 -->
<div class="image-container" :class="{ 'is-patchified': isPatchified }">
<!--
Step 0: Show container background, cells hidden
Step 1: Show container background, grid overlay visible (cells with border)
Step 2+: Container background hidden, cells visible with individual backgrounds
-->
<div
class="image-container"
:class="{
'is-pixelated': currentStep >= 1,
'is-patchified': currentStep >= 2
}"
>
<div class="grid-overlay" v-if="currentStep === 1"></div>
<div
v-for="n in 196"
:key="n"
class="patch"
:style="{
'--delay': `${n * 0.005}s`,
'--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}`
}"
:style="getPatchStyle(n)"
>
<span class="patch-id" v-if="isPatchified">{{ n }}</span>
<!-- Show number only in Pixelated stage to represent 'digitization' -->
<span class="pixel-val" v-if="currentStep === 1">{{ Math.floor(Math.random() * 9) }}</span>
<!-- Show ID in Patchified stage -->
<span class="patch-id" v-if="currentStep >= 2">{{ n }}</span>
</div>
</div>
<div class="arrow" v-if="isPatchified"></div>
<div class="arrow-down" v-if="currentStep >= 3"></div>
<!-- 线性序列视图 -->
<div class="sequence-container" v-if="isPatchified">
<div class="sequence-label">Flattened Sequence (Token Input)</div>
<div class="sequence-container" v-if="currentStep >= 3">
<div class="sequence-label">Token Sequence: 196×D (每个 Token D 维向量)</div>
<div class="token-stream">
<div
v-for="n in 196"
:key="n"
class="mini-patch"
:style="{ '--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}` }"
:style="getMiniPatchStyle(n)"
></div>
</div>
</div>
</div>
<div class="explanation">
<p>
<span class="icon">💡</span>
计算机将图片切成 <strong>14x14 = 196</strong> 个小方块Patch
然后把这些方块拉直成一长串序列就像把一段话里的单词排成一排一样
这就是 <strong>Visual Tokenization</strong>
</p>
</div>
</div>
</template>
<script setup>
import { ref } from 'vue'
import { ref, computed } from 'vue'
const isPatchified = ref(false)
const currentStep = ref(0)
const toggleState = () => {
isPatchified.value = !isPatchified.value
// Caption shown in the `.step-desc` box for each walkthrough step, indexed by
// `currentStep`. There are four entries; the step indicator ("/ 4") and the
// upper bound used by nextStep (3) are written as literals elsewhere, so they
// need updating by hand if an entry is added or removed.
const stepDescriptions = [
"1. 原始图片 (Original Image): 计算机看到的原始输入。",
"2. 数字化 (Digitization): 图片本质上是一个数字矩阵 (H x W x C)。",
"3. 切块 (Patchify): 典型设置:224×224 按 16×16 切成 14×14=196 个 Patch(此处等比示意)。",
"4. 序列化 (Serialize): 将二维分布的 Patch “拍扁”成一维序列 (Spatial Flatten)。现在它看起来就像一串“视觉单词”,可以被 Transformer 逐个读取。"
]
// Moves the walkthrough forward by one step; a no-op once the step index is
// no longer below 3 (the last step).
const nextStep = () => {
  const step = currentStep
  if (!(step.value < 3)) return
  step.value++
}
// Moves the walkthrough back by one step; a no-op once the step index is no
// longer above 0 (the first step).
const prevStep = () => {
  const step = currentStep
  if (!(step.value > 0)) return
  step.value--
}
// CSS gradients that stand in for a landscape photo.
// Sky (Blue) -> Mountains (Green/Grey) -> Sun (Yellow)
// NOTE(review): `bgImage` is not referenced anywhere in the visible part of
// this component (script or template); it looks like a leftover from before
// `complexBg` — confirm before removing.
const bgImage = 'linear-gradient(to bottom, #87CEEB 0%, #87CEEB 50%, #228B22 50%, #228B22 100%)'
// Add a sun using radial gradient
// The radial layer is listed first so it is painted on top of the linear
// sky/ground gradient; this is the value the patch style helpers use.
const complexBg = 'radial-gradient(circle at 70% 20%, #FFD700 0%, #FFD700 10%, transparent 10.5%), linear-gradient(to bottom, #87CEEB 0%, #87CEEB 60%, #4CA1AF 60%, #2C3E50 100%)'
/**
 * Builds the inline style for one patch of the main image view.
 *
 * @param {number} n - 1-based patch number, laid out row-major over 14 columns.
 * @returns {object} Style object whose background, opacity and transform
 *   depend on the current walkthrough step (`currentStep`).
 */
const getPatchStyle = (n) => {
  const row = Math.floor((n - 1) / 14)
  const col = (n - 1) % 14
  // The container is 280px and each patch is 20px (14 columns), so shifting
  // the shared background by -20px per column/row shows this patch's slice.
  const posX = col * -20
  const posY = row * -20
  const isPatchified = currentStep.value >= 2
  return {
    // Before step 2 the patch has no background of its own, so the container
    // image shows through; from step 2 on it paints its own slice.
    // This key used to be listed twice in the literal. Only the later,
    // conditional value ever took effect, so the dead unconditional entry was
    // removed.
    backgroundImage: isPatchified ? complexBg : 'none',
    backgroundPosition: `${posX}px ${posY}px`,
    backgroundSize: '280px 280px',
    // Step 0 hides the patches entirely so only the container image is seen.
    opacity: currentStep.value === 0 ? 0 : 1,
    // Shrinking each patch slightly opens a visible gap between neighbours.
    transform: isPatchified ? 'scale(0.9)' : 'scale(1)',
    transition: 'all 0.5s ease',
  }
}
/**
 * Build the inline style for one token in the flattened sequence view.
 *
 * The token shows the same slice of the 280px x 280px background as the grid
 * patch with the same 1-based index, using a 14-column grid of 20px cells.
 *
 * @param {number} n 1-based patch index in row-major order.
 * @returns {object} Style object with backgroundImage, backgroundPosition and backgroundSize.
 */
const getMiniPatchStyle = (n) => {
  const zeroBased = n - 1
  const offsetX = (zeroBased % 14) * -20
  const offsetY = Math.floor(zeroBased / 14) * -20
  const style = {
    backgroundImage: complexBg,
    backgroundPosition: `${offsetX}px ${offsetY}px`,
    backgroundSize: '280px 280px',
  }
  return style
}
</script>
@@ -77,40 +147,68 @@ const toggleState = () => {
padding: 20px;
background: var(--vp-c-bg-soft);
margin: 20px 0;
user-select: none;
}
.control-panel {
margin-bottom: 20px;
display: flex;
justify-content: center;
flex-direction: column;
align-items: center;
gap: 15px;
}
.controls {
display: flex;
gap: 20px;
gap: 15px;
align-items: center;
}
.action-btn {
background: var(--vp-c-brand);
color: white;
border: none;
.step-indicator {
font-family: monospace;
font-weight: bold;
color: var(--vp-c-text-2);
}
.step-desc {
font-size: 0.9em;
color: var(--vp-c-text-1);
text-align: center;
background: var(--vp-c-bg-mute);
padding: 8px 16px;
border-radius: 4px;
cursor: pointer;
font-weight: 600;
transition: opacity 0.2s;
}
.action-btn:hover {
opacity: 0.9;
}
.info {
min-height: 40px;
display: flex;
gap: 15px;
align-items: center;
justify-content: center;
width: 100%;
}
.action-btn {
background: var(--vp-c-bg-mute);
color: var(--vp-c-text-1);
border: 1px solid var(--vp-c-divider);
padding: 6px 12px;
border-radius: 4px;
cursor: pointer;
transition: all 0.2s;
font-size: 0.9em;
color: var(--vp-c-text-2);
}
.action-btn.primary {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
.action-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.action-btn:not(:disabled):hover {
opacity: 0.8;
transform: translateY(-1px);
}
.visual-area {
@@ -118,7 +216,7 @@ const toggleState = () => {
flex-direction: column;
align-items: center;
gap: 20px;
min-height: 300px;
min-height: 350px;
}
.image-container {
@@ -126,31 +224,55 @@ const toggleState = () => {
grid-template-columns: repeat(14, 1fr);
width: 280px;
height: 280px;
gap: 0;
background: #333;
/* Step 0 & 1 Background */
background-image: radial-gradient(circle at 70% 20%, #FFD700 0%, #FFD700 10%, transparent 10.5%), linear-gradient(to bottom, #87CEEB 0%, #87CEEB 60%, #4CA1AF 60%, #2C3E50 100%);
position: relative;
transition: all 0.5s ease;
border: 2px solid var(--vp-c-text-1);
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
}
/* Step 2+: Remove container background, let patches show */
.image-container.is-patchified {
background-image: none;
background-color: transparent;
gap: 2px;
background: transparent;
border-color: transparent;
}
.patch {
background-color: hsl(var(--hue), 70%, 60%);
display: flex;
align-items: center;
justify-content: center;
font-size: 8px;
color: rgba(0, 0, 0, 0.5);
transition: all 0.5s ease;
color: rgba(255, 255, 255, 0.8);
position: relative;
}
.is-patchified .patch {
/* Step 1: Pixelated Overlay Effect */
.image-container.is-pixelated:not(.is-patchified) .patch {
border: 1px solid rgba(255, 255, 255, 0.1);
/* Use pseudo-element or just opacity logic in JS */
}
/* Step 1: Digitization numbers */
.pixel-val {
font-family: monospace;
font-size: 8px;
color: rgba(0, 0, 0, 0.3);
mix-blend-mode: overlay;
}
.patch-id {
background: rgba(0, 0, 0, 0.5);
color: white;
padding: 1px 2px;
border-radius: 2px;
transform: scale(0.9);
font-size: 7px;
}
.arrow-down {
font-size: 24px;
color: var(--vp-c-text-2);
animation: bounce 1s infinite;
}
.sequence-container {
@@ -159,7 +281,7 @@ const toggleState = () => {
padding: 15px;
border-radius: 8px;
border: 1px solid var(--vp-c-divider);
animation: fadeIn 0.5s ease;
animation: slideUp 0.5s ease;
}
.sequence-label {
@@ -171,50 +293,48 @@ const toggleState = () => {
.token-stream {
display: flex;
flex-wrap: wrap;
gap: 2px;
flex-wrap: nowrap;
gap: 1px;
overflow-x: auto;
padding: 10px 5px; /* Space for brackets */
align-items: center;
position: relative;
}
/* Add Matrix Brackets */
.token-stream::before,
.token-stream::after {
content: '';
display: block;
width: 6px;
height: 36px; /* Match vector height + padding */
border: 2px solid var(--vp-c-text-3);
flex-shrink: 0;
}
.token-stream::before {
border-right: none;
}
.token-stream::after {
border-left: none;
}
.mini-patch {
width: 10px;
height: 10px;
background-color: hsl(var(--hue), 70%, 60%);
width: 6px; /* Thinner to allow more density */
height: 32px; /* Taller to represent Vector Dimension D */
border-radius: 1px;
}
.explanation {
margin-top: 20px;
padding: 12px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
font-size: 0.9em;
line-height: 1.6;
}
.arrow {
font-size: 24px;
color: var(--vp-c-text-2);
animation: bounce 1s infinite;
flex-shrink: 0;
opacity: 0.9;
}
@keyframes bounce {
0%,
100% {
transform: translateY(0);
}
50% {
transform: translateY(5px);
}
0%, 100% { transform: translateY(0); }
50% { transform: translateY(5px); }
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(10px);
}
to {
opacity: 1;
transform: translateY(0);
}
@keyframes slideUp {
from { opacity: 0; transform: translateY(20px); }
to { opacity: 1; transform: translateY(0); }
}
</style>
@@ -3,7 +3,7 @@
<div class="pipeline">
<!-- 1. Transformer Output Grid -->
<div class="stage">
<div class="stage-label">1. Processed Patches (Grid)</div>
<div class="stage-label">1. Patch Tokens (Shown as Grid) (Patch Token 网格示意)</div>
<div class="grid-container">
<div
v-for="(item, index) in items"
@@ -19,13 +19,13 @@
<div class="arrow-section">
<div class="arrow-line"></div>
<div class="arrow-text">Flatten & Output</div>
<div class="arrow-text">Reshape for View: Grid Sequence (重排显示网格序列)</div>
</div>
<!-- 2. Feature Vector Sequence -->
<div class="stage">
<div class="stage-label">
2. Feature Vector Sequence (The "Image Sentence")
2. Output Token Sequence (N×D) (输出序列)
</div>
<div class="vector-sequence">
<div