feat: save current work to dev branch

This commit is contained in:
sanbuphy
2026-01-15 20:10:19 +08:00
parent c9e7ece75d
commit c8567ce23f
76 changed files with 28352 additions and 6 deletions
@@ -0,0 +1,265 @@
<template>
<div class="attn-demo">
<div class="controls">
<span class="hint">🖱 把鼠标悬停在方块上查看它的注意力分配</span>
</div>
<div class="visual-area">
<div class="image-grid" @mouseleave="hoverIndex = -1">
<div
v-for="(item, index) in items"
:key="index"
class="grid-cell"
:class="{ active: hoverIndex === index }"
@mouseenter="hoverIndex = index"
>
{{ item.icon }}
<div class="cell-label">{{ item.label }}</div>
</div>
<!--
  SVG overlay: one line from the hovered cell to every other cell.
  The v-for lives on a <template> wrapper because, in Vue 3, v-if is evaluated
  before v-for when both sit on the same element; the loop variable would be
  out of scope in the condition and the hovered cell's zero-length
  self-connection would still be rendered.
  The overlay is decorative (the side panel carries the same information),
  so it is hidden from assistive technology.
-->
<svg class="connections" v-if="hoverIndex !== -1" aria-hidden="true">
  <template v-for="(target, tIndex) in items" :key="tIndex">
    <line
      v-if="tIndex !== hoverIndex"
      :x1="getCenter(hoverIndex).x"
      :y1="getCenter(hoverIndex).y"
      :x2="getCenter(tIndex).x"
      :y2="getCenter(tIndex).y"
      :stroke="getAttentionColor(hoverIndex, tIndex)"
      :stroke-width="getAttentionWidth(hoverIndex, tIndex)"
      stroke-linecap="round"
    />
  </template>
</svg>
</div>
<!--
  Side panel for the hovered patch. getTopAttentions() returns a plain
  { targetIndex: weight } object, which always iterates in ascending index
  order, so the entries are sorted here (strongest weight first) before
  rendering. Array.prototype.sort is stable, so equal weights keep index order.
-->
<div class="info-panel" :class="{ visible: hoverIndex !== -1 }">
  <div class="info-title">Patch: {{ items[hoverIndex]?.label }}</div>
  <div class="info-desc">正在关注</div>
  <ul class="attn-list" v-if="hoverIndex !== -1">
    <li
      v-for="[targetIdx, weight] in Object.entries(getTopAttentions(hoverIndex)).sort((a, b) => b[1] - a[1])"
      :key="targetIdx"
    >
      <span class="target-icon">{{ items[targetIdx].icon }}</span>
      <span class="target-name">{{ items[targetIdx].label }}</span>
      <div class="bar-bg">
        <div class="bar-fill" :style="{ width: (weight * 100) + '%' }"></div>
      </div>
    </li>
  </ul>
</div>
</div>
</div>
</template>
<script setup>
import { ref } from 'vue'
const hoverIndex = ref(-1)
const items = [
{ icon: '🌲', label: '背景' }, { icon: '🌲', label: '背景' }, { icon: '☁️', label: '天空' },
{ icon: '👂', label: '猫耳' }, { icon: '😼', label: '猫脸' }, { icon: '🌲', label: '背景' },
{ icon: '🐾', label: '猫爪' }, { icon: '🧶', label: '毛线' }, { icon: '🌱', label: '草地' }
]
// Centre point (in px, relative to the grid's top-left corner) of the cell at
// `index` in the 3-column grid. The 80px cell size and 10px gap are hard-coded
// here and must stay in sync with the .image-grid / .grid-cell CSS rules.
const getCenter = (index) => {
  const CELL = 80
  const GAP = 10
  const pitch = CELL + GAP // distance between the origins of adjacent cells
  const half = CELL / 2
  const column = index % 3
  const line = Math.floor(index / 3)
  return { x: column * pitch + half, y: line * pitch + half }
}
// Mock attention weight from patch `source` to patch `target` (indices into `items`).
//   0.9 - both are cat parts (3, 4, 6); this includes a cat part paired with itself
//   0.7 - a cat part paired with the yarn (7), in either direction
//   0.5 - both are background patches (0, 1, 2, 5, 8)
//   0.1 - every other pairing
const getAttentionWeight = (source, target) => {
  const CAT = new Set([3, 4, 6])
  const BACKGROUND = new Set([0, 1, 2, 5, 8])
  const YARN = 7

  const fromCat = CAT.has(source)
  const toCat = CAT.has(target)

  if (fromCat) {
    if (toCat) return 0.9
    if (target === YARN) return 0.7
  } else if (source === YARN && toCat) {
    return 0.7
  }

  return BACKGROUND.has(source) && BACKGROUND.has(target) ? 0.5 : 0.1
}
// Stroke colour for the line from `source` to `target`.
// Weights above 0.6 are drawn green with alpha equal to the weight;
// anything else is drawn gray at half the weight's alpha.
const getAttentionColor = (source, target) => {
  const w = getAttentionWeight(source, target)
  return w > 0.6
    ? `rgba(16, 185, 129, ${w})`
    : `rgba(156, 163, 175, ${w * 0.5})`
}
// Stroke width for the line from `source` to `target`: five times the attention weight.
const getAttentionWidth = (source, target) => getAttentionWeight(source, target) * 5
// Build a { targetIndex: weight } map of the attention `source` pays to every
// other patch (the source itself is left out).
// NOTE: despite the name, nothing is ranked or truncated here. The result is a
// plain object with integer-like keys, so it iterates in ascending index order.
const getTopAttentions = (source) => {
  const byTarget = {}
  for (let idx = 0; idx < items.length; idx++) {
    if (idx === source) continue
    byTarget[idx] = getAttentionWeight(source, idx)
  }
  return byTarget
}
</script>
<style scoped>
.attn-demo {
padding: 20px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
margin: 20px 0;
user-select: none;
}
.controls {
text-align: center;
margin-bottom: 20px;
}
.hint {
font-size: 0.9em;
color: var(--vp-c-text-2);
background: var(--vp-c-bg);
padding: 4px 12px;
border-radius: 12px;
border: 1px solid var(--vp-c-divider);
}
.visual-area {
display: flex;
justify-content: center;
gap: 40px;
flex-wrap: wrap;
}
.image-grid {
display: grid;
grid-template-columns: repeat(3, 80px);
gap: 10px;
position: relative;
}
.grid-cell {
width: 80px;
height: 80px;
background: var(--vp-c-bg);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
cursor: pointer;
transition: all 0.2s;
z-index: 2;
position: relative;
}
.grid-cell:hover, .grid-cell.active {
border-color: var(--vp-c-brand);
transform: scale(1.05);
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
background: var(--vp-c-bg-mute);
}
.cell-label {
font-size: 0.8em;
color: var(--vp-c-text-2);
margin-top: 4px;
}
.connections {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
pointer-events: none;
z-index: 1;
}
.info-panel {
width: 200px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
padding: 15px;
opacity: 0;
transition: opacity 0.2s;
pointer-events: none;
}
.info-panel.visible {
opacity: 1;
pointer-events: auto;
}
.info-title {
font-weight: bold;
margin-bottom: 5px;
color: var(--vp-c-brand);
}
.info-desc {
font-size: 0.85em;
color: var(--vp-c-text-2);
margin-bottom: 10px;
}
.attn-list {
list-style: none;
padding: 0;
margin: 0;
}
.attn-list li {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 6px;
font-size: 0.85em;
}
.target-icon {
width: 20px;
text-align: center;
}
.target-name {
width: 40px;
}
.bar-bg {
flex: 1;
height: 6px;
background: var(--vp-c-bg-soft);
border-radius: 3px;
overflow: hidden;
}
.bar-fill {
height: 100%;
background: var(--vp-c-brand);
border-radius: 3px;
}
</style>
@@ -0,0 +1,391 @@
<template>
<div class="feature-alignment-demo">
<div class="header">
<div class="title">阶段一特征对齐 (Feature Alignment / Pre-training)</div>
<div class="desc">
目标：让 Projector 学会翻译图像语言
<br>做法：冻结 ViT 和 LLM，只训练 Projector
</div>
</div>
<div class="training-diagram">
<!-- Data Input -->
<div class="data-column">
<div class="data-item image-data">
<div class="data-icon">🖼</div>
<div class="data-label">图片<br>()</div>
</div>
<div class="data-item text-data">
<div class="data-icon">📝</div>
<div class="data-label">标题<br>("一只猫")</div>
</div>
</div>
<!-- Arrow Column -->
<div class="arrow-column">
<div class="arrow"></div>
<div class="arrow"></div>
</div>
<!-- Model Column -->
<div class="model-column">
<!-- Vision Branch -->
<div class="model-block frozen">
<div class="status-badge"> 冻结</div>
<div class="block-icon">👁</div>
<div class="block-name">ViT</div>
</div>
<div class="arrow-small"></div>
<div class="model-block training">
<div class="status-badge fire">🔥 训练</div>
<div class="block-icon">🔌</div>
<div class="block-name">Projector</div>
</div>
<!-- Text Branch -->
<div class="model-block frozen text-model">
<div class="status-badge"> 冻结</div>
<div class="block-icon">🧠</div>
<div class="block-name">LLM</div>
</div>
</div>
<!-- Arrow Column -->
<div class="arrow-column">
<div class="arrow"></div>
<div class="arrow"></div>
</div>
<!-- Vector Output -->
<div class="vector-column">
<div class="vector-item v-vector">
<div class="vector-icon">🟢</div>
<div class="vector-label">向量 V</div>
</div>
<div class="loss-connection">
<div class="loss-line"></div>
<div class="loss-box" :class="{ active: isCalculatingLoss }">
<div class="loss-label">Loss</div>
<div class="loss-desc">V T</div>
</div>
<div class="loss-line"></div>
</div>
<div class="vector-item t-vector">
<div class="vector-icon">🔵</div>
<div class="vector-label">向量 T</div>
</div>
</div>
</div>
<div class="controls">
<button class="play-btn" @click="nextStep">
{{ buttonText }}
</button>
<div class="step-desc">
{{ currentStepDesc }}
</div>
</div>
</div>
</template>
<script setup>
import { ref, computed } from 'vue'
const step = ref(0) // 0: Idle, 1: Forward, 2: Loss, 3: Backprop
// Advance the walkthrough by one step; once it is no longer below the final
// step (3), wrap back to the idle state (0).
const nextStep = () => {
  const finished = !(step.value < 3)
  if (finished) {
    step.value = 0
    return
  }
  step.value++
}
// Label of the play button for the current step; any step outside 0-3 falls
// back to the generic label.
const buttonText = computed(() => {
  const labels = new Map([
    [0, '开始训练演示'],
    [1, '下一步:计算 Loss'],
    [2, '下一步:反向传播'],
    [3, '完成并重置']
  ])
  const current = step.value
  return labels.has(current) ? labels.get(current) : '开始'
})
// Explanatory text shown under the button for the current step; any step
// outside 0-3 yields an empty string.
const currentStepDesc = computed(() => {
  const descriptions = new Map([
    [0, '准备就绪。点击按钮开始模拟一次训练迭代。'],
    [1, '前向传播:图片经过 ViT (冻结) 和 Projector (训练) 得到向量 V;文本经过 LLM (冻结) 得到向量 T。'],
    [2, '计算 Loss:比较向量 V 和向量 T 的相似度。目标是让它们尽可能接近。'],
    [3, '反向传播:根据 Loss 更新 Projector 的参数。注意 ViT 和 LLM 不会更新!']
  ])
  const current = step.value
  return descriptions.has(current) ? descriptions.get(current) : ''
})
// True only while the demo is on step 2 (the "Loss" step); the template binds
// it to the `active` class of the Loss box.
const isCalculatingLoss = computed(() => step.value === 2)
</script>
<style scoped>
.feature-alignment-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 20px;
margin: 20px 0;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
}
.header {
margin-bottom: 20px;
text-align: center;
}
.title {
font-weight: bold;
font-size: 16px;
margin-bottom: 8px;
}
.desc {
font-size: 13px;
color: var(--vp-c-text-2);
line-height: 1.5;
}
.training-diagram {
display: flex;
align-items: center;
justify-content: center;
background: var(--vp-c-bg);
border: 1px dashed var(--vp-c-divider);
border-radius: 8px;
padding: 20px 10px;
overflow: hidden;
gap: 10px;
}
/* Data Column */
.data-column {
display: flex;
flex-direction: column;
gap: 40px;
}
.data-item {
display: flex;
flex-direction: column;
align-items: center;
padding: 8px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
width: 60px;
}
.data-icon { font-size: 24px; }
.data-label { font-size: 10px; text-align: center; margin-top: 4px; }
/* Arrow Column */
.arrow-column {
display: flex;
flex-direction: column;
gap: 80px;
color: var(--vp-c-text-3);
font-size: 14px;
}
/* Model Column */
.model-column {
display: grid;
grid-template-columns: auto auto auto;
grid-template-areas:
"vit arrow proj"
"llm llm llm";
gap: 10px;
row-gap: 30px;
align-items: center;
}
.model-block {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
border: 1.5px solid;
border-radius: 8px;
padding: 10px;
min-width: 70px;
position: relative;
background: var(--vp-c-bg);
transition: all 0.3s;
}
.status-badge {
position: absolute;
top: -8px;
right: -5px;
font-size: 9px;
padding: 2px 4px;
border-radius: 4px;
background: var(--vp-c-bg);
border: 1px solid;
font-weight: bold;
}
.frozen {
border-color: var(--vp-c-divider);
opacity: 0.8;
border-style: dashed;
}
.frozen .status-badge {
border-color: var(--vp-c-divider);
color: var(--vp-c-text-3);
}
.training {
border-color: var(--vp-c-brand);
box-shadow: 0 0 10px rgba(var(--vp-c-brand-rgb), 0.1);
}
.training .status-badge {
border-color: var(--vp-c-brand);
color: var(--vp-c-brand);
background: var(--vp-c-bg-soft);
}
/* The `fire` modifier is placed on the status badge inside a .training block,
   never on the block itself, so the compound selector `.training.fire`
   matched nothing. Target the badge as a descendant instead. */
.training .status-badge.fire {
animation: pulse 2s infinite;
}
.text-model {
grid-area: llm;
width: 100%;
}
.block-icon { font-size: 20px; margin-bottom: 4px; }
.block-name { font-size: 12px; font-weight: bold; }
.arrow-small {
grid-area: arrow;
color: var(--vp-c-text-3);
}
/* Vector Output */
.vector-column {
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
min-width: 80px;
}
.vector-item {
display: flex;
flex-direction: column;
align-items: center;
font-size: 10px;
}
.loss-connection {
display: flex;
flex-direction: column;
align-items: center;
width: 100%;
}
.loss-line {
width: 1px;
height: 20px;
background: var(--vp-c-divider);
}
.loss-box {
border: 1px solid var(--vp-c-danger);
border-radius: 6px;
padding: 4px 8px;
text-align: center;
background: var(--vp-c-bg);
transition: all 0.3s;
opacity: 0.5;
}
.loss-box.active {
opacity: 1;
transform: scale(1.1);
background: rgba(255, 0, 0, 0.1);
box-shadow: 0 0 10px rgba(255, 0, 0, 0.2);
}
.loss-label { font-size: 12px; font-weight: bold; color: var(--vp-c-danger); }
.loss-desc { font-size: 10px; color: var(--vp-c-text-2); }
/* Controls */
.controls {
margin-top: 20px;
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
}
.play-btn {
background: var(--vp-c-brand);
color: white;
border: none;
padding: 8px 20px;
border-radius: 20px;
cursor: pointer;
font-weight: bold;
transition: opacity 0.2s;
}
.play-btn:disabled {
opacity: 0.7;
cursor: not-allowed;
}
.play-btn:active {
transform: scale(0.98);
}
.step-desc {
font-size: 13px;
color: var(--vp-c-text-1);
text-align: center;
min-height: 40px;
}
@keyframes pulse {
0% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0.4); }
70% { box-shadow: 0 0 0 10px rgba(var(--vp-c-brand-rgb), 0); }
100% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0); }
}
@media (max-width: 600px) {
.training-diagram {
flex-direction: column;
gap: 20px;
}
.arrow-column {
display: none;
}
.data-column {
flex-direction: row;
gap: 20px;
}
.vector-column {
flex-direction: row;
align-items: center;
justify-content: center;
width: 100%;
}
.loss-connection {
flex-direction: row;
align-items: center;
}
.loss-line {
width: 20px;
height: 1px;
}
}
</style>
@@ -0,0 +1,129 @@
<template>
<div class="linear-projection-demo">
<div class="demo-container">
<!-- Step 1: Patch -->
<div class="step-box">
<div class="label">1. Patch (4x4)</div>
<div class="grid-patch">
<div v-for="n in 16" :key="n" class="pixel" :style="{ backgroundColor: getPixelColor(n) }"></div>
</div>
<div class="desc">768 像素点</div>
</div>
<div class="arrow"></div>
<!-- Step 2: Flattened -->
<div class="step-box">
<div class="label">2. Flatten</div>
<div class="vector-container">
<div v-for="n in 16" :key="n" class="vector-cell" :style="{ backgroundColor: getPixelColor(n) }"></div>
</div>
<div class="desc">拉平成向量</div>
</div>
<div class="arrow">× W</div>
<!-- Step 3: Projected -->
<div class="step-box">
<div class="label">3. Embedding</div>
<div class="embedding-container">
<div v-for="n in 8" :key="n" class="embed-cell"></div>
</div>
<div class="desc">压缩特征 (D=8)</div>
</div>
</div>
</div>
</template>
<script setup>
// Colour for cell number `n`: the hue advances 20 degrees per cell and wraps
// at 360, with fixed 70% saturation and 60% lightness.
const getPixelColor = (n) => `hsl(${(n * 20) % 360}, 70%, 60%)`
</script>
<style scoped>
.linear-projection-demo {
padding: 20px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
margin: 20px 0;
overflow-x: auto;
}
.demo-container {
display: flex;
align-items: center;
justify-content: space-around;
min-width: 600px;
}
.step-box {
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
}
.label {
font-weight: bold;
font-size: 0.9em;
color: var(--vp-c-text-2);
}
.desc {
font-size: 0.8em;
color: var(--vp-c-text-3);
}
.grid-patch {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 2px;
width: 80px;
height: 80px;
}
.pixel {
width: 100%;
height: 100%;
border-radius: 2px;
}
.vector-container {
display: flex;
flex-direction: column;
gap: 1px;
height: 120px;
width: 20px;
justify-content: center;
}
.vector-cell {
width: 100%;
flex: 1;
}
.embedding-container {
display: flex;
flex-direction: column;
gap: 2px;
height: 80px;
width: 20px;
}
.embed-cell {
width: 100%;
flex: 1;
background-color: var(--vp-c-brand);
opacity: 0.8;
border-radius: 2px;
}
.arrow {
font-size: 1.5em;
color: var(--vp-c-text-3);
font-weight: bold;
}
</style>
@@ -0,0 +1,418 @@
<template>
<div class="model-evolution-demo">
<div class="controls-header">
<!--
  LLM / VLM mode switch. Exposed as an ARIA switch so it can be focused and
  toggled from the keyboard (Enter or Space) and announces its on/off state,
  instead of being operable by mouse click only.
-->
<div
  class="toggle-container"
  role="switch"
  tabindex="0"
  :aria-checked="isVLM"
  aria-label="Multimodal VLM"
  @click="toggleMode"
  @keydown.enter.prevent="toggleMode"
  @keydown.space.prevent="toggleMode"
>
  <div class="toggle-track" :class="{ active: isVLM }">
    <div class="toggle-thumb">
      {{ isVLM ? '👁️' : '🧠' }}
    </div>
  </div>
  <div class="toggle-label">
    <span :class="{ active: !isVLM }">Pure LLM</span>
    <span class="arrow"></span>
    <span :class="{ active: isVLM }">Multimodal VLM</span>
  </div>
</div>
<div class="status-desc">
{{ isVLM
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
}}
</div>
</div>
<div class="diagram-stage" :class="{ 'vlm-mode': isVLM }">
<!-- Vision Pipeline (Only visible in VLM mode) -->
<div class="pipeline vision-pipeline">
<div class="node-group">
<div class="node input-node image-node">
<span class="icon"></span>
<span class="label">Image</span>
</div>
<div class="flow-arrow"></div>
<div class="node process-node vit-node" title="Vision Transformer: The Eye">
<span class="icon"></span>
<span class="label">ViT</span>
</div>
<div class="flow-arrow"></div>
<div class="node adapter-node projector-node" title="Projector: The Translator">
<span class="icon">🔌</span>
<span class="label">Projector</span>
</div>
<div class="flow-arrow connector-arrow"></div>
</div>
</div>
<!-- Text Pipeline (Always visible) -->
<div class="pipeline text-pipeline">
<div class="node-group horizontal">
<div class="node input-node text-node">
<span class="icon"></span>
<span class="label">Prompt</span>
</div>
<div class="flow-arrow"></div>
<div class="node process-node embed-node">
<span class="icon"></span>
<span class="label">Embed</span>
</div>
<!-- Merge Point Visualization -->
<div class="merge-point" :class="{ active: isVLM }">
<div class="plus-icon">+</div>
<div class="merge-label">Concat</div>
</div>
<div class="flow-arrow"></div>
<div class="node core-node llm-node">
<span class="icon">🧠</span>
<span class="label">LLM Backbone</span>
<div class="inner-flow">
<span class="dot t1"></span>
<span class="dot t2"></span>
<span class="dot v1" v-if="isVLM"></span>
</div>
</div>
<div class="flow-arrow"></div>
<div class="node output-node">
<span class="icon">💬</span>
<span class="label">Response</span>
</div>
</div>
</div>
</div>
<div class="interactive-info">
<div class="info-card" v-if="!isVLM">
<h3>Standard LLM Flow</h3>
<p>Text is converted into vectors (Embeddings) and processed by the Transformer to predict the next word.</p>
</div>
<div class="info-card vlm-info" v-else>
<h3>VLM = LLM + Vision Encoder</h3>
<ul>
<li><strong>ViT (The Eye):</strong> Slices image into patches and extracts features.</li>
<li><strong>Projector (The Translator):</strong> Converts visual features into the same "language" (vector dimension) as text embeddings.</li>
<li><strong>Concatenation:</strong> The translated visual tokens are pasted <em>before</em> the text tokens. The LLM sees them as "foreign words" it learned to understand.</li>
</ul>
</div>
</div>
</div>
</template>
<script setup>
import { ref } from 'vue'
const isVLM = ref(false)
// Flip the demo between the pure-LLM view and the multimodal (VLM) view.
const toggleMode = () => {
  const showVLM = !isVLM.value
  isVLM.value = showVLM
}
</script>
<style scoped>
.model-evolution-demo {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 24px;
margin: 20px 0;
font-family: 'Menlo', 'Monaco', sans-serif;
user-select: none;
}
/* Controls */
.controls-header {
display: flex;
flex-direction: column;
align-items: center;
margin-bottom: 30px;
gap: 12px;
}
.toggle-container {
display: flex;
align-items: center;
gap: 15px;
cursor: pointer;
background: var(--vp-c-bg-mute);
padding: 8px 16px;
border-radius: 30px;
border: 1px solid transparent;
transition: all 0.2s;
}
.toggle-container:hover {
border-color: var(--vp-c-brand);
background: var(--vp-c-bg);
}
.toggle-track {
width: 50px;
height: 28px;
background: #ccc;
border-radius: 14px;
position: relative;
transition: background 0.3s;
}
.toggle-track.active {
background: var(--vp-c-brand);
}
.toggle-thumb {
width: 24px;
height: 24px;
background: #fff;
border-radius: 50%;
position: absolute;
top: 2px;
left: 2px;
display: flex;
align-items: center;
justify-content: center;
font-size: 14px;
transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
}
.toggle-track.active .toggle-thumb {
transform: translateX(22px);
}
.toggle-label {
font-size: 14px;
font-weight: bold;
color: var(--vp-c-text-2);
display: flex;
gap: 8px;
align-items: center;
}
.toggle-label span.active {
color: var(--vp-c-text-1);
}
.status-desc {
font-size: 13px;
color: var(--vp-c-text-2);
text-align: center;
height: 20px;
}
/* Diagram Stage */
.diagram-stage {
position: relative;
height: 240px;
background: var(--vp-c-bg);
border: 1px dashed var(--vp-c-divider);
border-radius: 8px;
overflow: hidden;
display: flex;
justify-content: center;
align-items: center;
}
/* Pipelines */
.pipeline {
transition: all 0.5s cubic-bezier(0.34, 1.56, 0.64, 1);
}
.text-pipeline {
position: absolute;
bottom: 80px; /* Centered vertically in LLM mode */
left: 50%;
transform: translateX(-50%);
width: 100%;
display: flex;
justify-content: center;
}
.vlm-mode .text-pipeline {
bottom: 40px; /* Move down in VLM mode */
}
.vision-pipeline {
position: absolute;
top: 20px;
left: 20%; /* Align with input side */
opacity: 0;
transform: translateY(-20px);
pointer-events: none;
}
.vlm-mode .vision-pipeline {
opacity: 1;
transform: translateY(0);
pointer-events: auto;
}
.node-group {
display: flex;
align-items: center;
gap: 6px;
}
.node-group.horizontal {
flex-direction: row;
}
.vision-pipeline .node-group {
flex-direction: column;
}
/* Nodes */
.node {
background: var(--vp-c-bg);
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
padding: 8px 12px;
display: flex;
flex-direction: column;
align-items: center;
min-width: 70px;
box-shadow: 0 4px 6px rgba(0,0,0,0.05);
position: relative;
z-index: 2;
}
.icon { font-size: 20px; margin-bottom: 4px; }
.label { font-size: 11px; font-weight: bold; }
.input-node { border-color: #aaa; }
.process-node { border-color: var(--vp-c-brand-dimm); }
.core-node {
border-color: var(--vp-c-brand);
background: var(--vp-c-brand-dimm);
min-width: 100px;
}
.output-node { border-color: var(--vp-c-brand); }
.vit-node {
border-color: var(--vp-c-yellow);
background: rgba(255, 197, 23, 0.05);
}
.projector-node {
border-color: var(--vp-c-yellow);
background: var(--vp-c-yellow-dimm);
}
/* Arrows */
.flow-arrow {
color: var(--vp-c-text-3);
font-size: 16px;
}
.connector-arrow {
font-size: 24px;
color: var(--vp-c-yellow);
margin-top: -10px;
margin-bottom: -10px;
transform: rotate(-45deg) translateX(10px);
}
/* Merge Point */
.merge-point {
width: 0;
overflow: hidden;
transition: all 0.5s;
display: flex;
flex-direction: column;
align-items: center;
opacity: 0;
}
.merge-point.active {
width: 40px;
opacity: 1;
}
.plus-icon {
font-weight: bold;
color: var(--vp-c-text-2);
font-size: 18px;
}
.merge-label {
font-size: 9px;
color: var(--vp-c-text-3);
}
/* Inner Flow Animation inside LLM */
.inner-flow {
display: flex;
gap: 4px;
margin-top: 4px;
height: 6px;
}
.dot {
width: 6px;
height: 6px;
border-radius: 50%;
background: #fff;
opacity: 0.6;
animation: pulse 1s infinite alternate;
}
.t1 { animation-delay: 0s; }
.t2 { animation-delay: 0.2s; }
.v1 { background: var(--vp-c-yellow); animation-delay: 0.4s; }
@keyframes pulse {
from { opacity: 0.3; transform: scale(0.8); }
to { opacity: 1; transform: scale(1.1); }
}
/* Interactive Info */
.interactive-info {
margin-top: 20px;
}
.info-card {
background: var(--vp-c-bg-mute);
padding: 16px;
border-radius: 8px;
animation: fadeIn 0.3s;
}
.info-card h3 {
margin-top: 0;
margin-bottom: 10px;
font-size: 15px;
color: var(--vp-c-text-1);
}
.info-card p, .info-card li {
font-size: 13px;
color: var(--vp-c-text-2);
line-height: 1.6;
}
.info-card ul {
padding-left: 20px;
margin: 0;
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(5px); }
to { opacity: 1; transform: translateY(0); }
}
/* Mobile Adjustments */
@media (max-width: 600px) {
.diagram-stage {
height: 300px;
}
.text-pipeline {
flex-wrap: wrap;
gap: 10px;
width: 90%;
}
.vision-pipeline {
left: 10%;
}
}
</style>
@@ -0,0 +1,209 @@
<!--
PatchifyDemo.vue
视觉分词Patchify演示
-->
<template>
<div class="patchify-demo">
<div class="control-panel">
<div class="controls">
<button class="action-btn" @click="toggleState">
{{ isPatchified ? '还原图片 (Restore)' : '切分图片 (Patchify)' }}
</button>
<div class="info">
<span>Resolution: 224x224</span>
<span>Patch Size: 16x16</span>
<span>Total Patches: {{ 14 * 14 }}</span>
</div>
</div>
</div>
<div class="visual-area">
  <!--
    Original / patchified image view.
    `n` from `v-for="n in 196"` is 1-based, so the row and column used for the
    hue are derived from (n - 1). Using `n` directly shifted every cell by one
    and made the last column of each row take the next row's first-column hue,
    breaking the diagonal gradient.
  -->
  <div class="image-container" :class="{ 'is-patchified': isPatchified }">
    <div
      v-for="n in 196"
      :key="n"
      class="patch"
      :style="{
        '--delay': `${n * 0.005}s`,
        '--hue': `${((n - 1) % 14 + Math.floor((n - 1) / 14)) * 20}`
      }"
    >
      <span class="patch-id" v-if="isPatchified">{{ n }}</span>
    </div>
  </div>
  <div class="arrow" v-if="isPatchified"></div>
  <!-- Flattened sequence view; uses the same hue formula so each token matches its patch. -->
  <div class="sequence-container" v-if="isPatchified">
    <div class="sequence-label">Flattened Sequence (Token Input)</div>
    <div class="token-stream">
      <div
        v-for="n in 196"
        :key="n"
        class="mini-patch"
        :style="{ '--hue': `${((n - 1) % 14 + Math.floor((n - 1) / 14)) * 20}` }"
      ></div>
    </div>
  </div>
</div>
<div class="explanation">
<p>
<span class="icon">💡</span>
计算机将图片切成 <strong>14x14 = 196</strong> 个小方块 (Patch)，
然后把这些方块拉直成一长串序列，就像把一段话里的单词排成一排一样。
这就是 <strong>Visual Tokenization</strong>。
</p>
</div>
</div>
</template>
<script setup>
import { ref } from 'vue'
const isPatchified = ref(false)
// Switch between the intact image and the patchified (split) view.
const toggleState = () => {
  const nextState = !isPatchified.value
  isPatchified.value = nextState
}
</script>
<style scoped>
.patchify-demo {
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
padding: 20px;
background: var(--vp-c-bg-soft);
margin: 20px 0;
}
.control-panel {
margin-bottom: 20px;
display: flex;
justify-content: center;
}
.controls {
display: flex;
gap: 20px;
align-items: center;
}
.action-btn {
background: var(--vp-c-brand);
color: white;
border: none;
padding: 8px 16px;
border-radius: 4px;
cursor: pointer;
font-weight: 600;
transition: opacity 0.2s;
}
.action-btn:hover {
opacity: 0.9;
}
.info {
display: flex;
gap: 15px;
font-size: 0.9em;
color: var(--vp-c-text-2);
}
.visual-area {
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
min-height: 300px;
}
.image-container {
display: grid;
grid-template-columns: repeat(14, 1fr);
width: 280px;
height: 280px;
gap: 0;
background: #333;
transition: all 0.5s ease;
border: 2px solid var(--vp-c-text-1);
}
.image-container.is-patchified {
gap: 2px;
background: transparent;
border-color: transparent;
}
.patch {
background-color: hsl(var(--hue), 70%, 60%);
display: flex;
align-items: center;
justify-content: center;
font-size: 8px;
color: rgba(0,0,0,0.5);
transition: all 0.5s ease;
}
.is-patchified .patch {
border-radius: 2px;
transform: scale(0.9);
}
.sequence-container {
width: 100%;
background: var(--vp-c-bg);
padding: 15px;
border-radius: 8px;
border: 1px solid var(--vp-c-divider);
animation: fadeIn 0.5s ease;
}
.sequence-label {
font-size: 0.9em;
margin-bottom: 10px;
font-weight: 600;
color: var(--vp-c-text-2);
}
.token-stream {
display: flex;
flex-wrap: wrap;
gap: 2px;
}
.mini-patch {
width: 10px;
height: 10px;
background-color: hsl(var(--hue), 70%, 60%);
border-radius: 1px;
}
.explanation {
margin-top: 20px;
padding: 12px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
font-size: 0.9em;
line-height: 1.6;
}
.arrow {
font-size: 24px;
color: var(--vp-c-text-2);
animation: bounce 1s infinite;
}
@keyframes bounce {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(5px); }
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
</style>
@@ -0,0 +1,126 @@
<template>
<div class="pos-demo">
<div class="demo-row">
<!-- Input Feature -->
<div class="grid-wrapper">
<div class="grid-title">Feature Vectors</div>
<div class="grid-box feature-grid">
<div v-for="n in 9" :key="'f'+n" class="cell feature-cell">V</div>
</div>
</div>
<div class="op">+</div>
<!-- Positional Embedding -->
<div class="grid-wrapper">
<div class="grid-title">Position Embeddings</div>
<div class="grid-box pos-grid">
<div v-for="n in 9" :key="'p'+n" class="cell pos-cell">{{ n }}</div>
</div>
</div>
<div class="op">=</div>
<!-- Result -->
<div class="grid-wrapper">
<div class="grid-title">Input to Transformer</div>
<div class="grid-box result-grid">
<div v-for="n in 9" :key="'r'+n" class="cell result-cell">
<span class="v">V</span><span class="plus">+</span><span class="p">{{ n }}</span>
</div>
</div>
</div>
</div>
<div class="caption">
位置编码 (Position Embedding) 是一组可学习的向量，直接<b>加</b>在图像特征上。
</div>
</div>
</template>
<style scoped>
.pos-demo {
padding: 20px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
margin: 20px 0;
overflow-x: auto;
}
.demo-row {
display: flex;
align-items: center;
justify-content: center;
gap: 20px;
min-width: 500px;
}
.grid-wrapper {
display: flex;
flex-direction: column;
align-items: center;
gap: 8px;
}
.grid-title {
font-size: 0.85em;
font-weight: bold;
color: var(--vp-c-text-2);
}
.grid-box {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 4px;
padding: 4px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 6px;
}
.cell {
width: 40px;
height: 40px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 4px;
font-size: 0.9em;
font-family: monospace;
}
.feature-cell {
background-color: var(--vp-c-brand-soft);
color: var(--vp-c-brand-dark);
}
.pos-grid .pos-cell {
background-color: var(--vp-c-yellow-soft);
color: var(--vp-c-yellow-darker);
}
.result-cell {
background-color: var(--vp-c-green-soft);
color: var(--vp-c-green-darker);
font-size: 0.7em;
display: flex;
gap: 1px;
}
.op {
font-size: 2em;
color: var(--vp-c-text-3);
font-weight: bold;
}
.caption {
text-align: center;
margin-top: 15px;
font-size: 0.9em;
color: var(--vp-c-text-2);
}
.plus {
color: var(--vp-c-text-3);
font-weight: normal;
}
</style>
@@ -0,0 +1,241 @@
<!--
ProjectorDemo.vue
投射器Projector原理演示
-->
<template>
<div class="projector-demo">
<!--
  Projector type selector. aria-pressed exposes which option is selected so
  the state is not conveyed by colour alone; type="button" keeps the buttons
  from acting as submit buttons if the component is ever placed inside a form.
-->
<div class="mode-switch">
  <button
    type="button"
    :class="{ active: mode === 'linear' }"
    :aria-pressed="mode === 'linear'"
    @click="mode = 'linear'"
  >
    Linear (LLaVA)
  </button>
  <button
    type="button"
    :class="{ active: mode === 'qformer' }"
    :aria-pressed="mode === 'qformer'"
    @click="mode = 'qformer'"
  >
    Q-Former (BLIP-2)
  </button>
</div>
<div class="pipeline">
<!-- Input: visual tokens from the ViT. The count is the same in both modes,
     so it is rendered as static text rather than a ternary with identical branches. -->
<div class="stage">
  <div class="label">Visual Tokens (ViT)</div>
  <div class="token-container input">
    <div v-for="n in 16" :key="n" class="token visual"></div>
  </div>
  <div class="count">256 Tokens</div>
</div>
<!-- Process: The Projector -->
<div class="stage connector">
<div class="arrow-line"></div>
<div class="projector-box" :class="mode">
<div class="title">{{ mode === 'linear' ? 'Linear Layer' : 'Q-Former' }}</div>
<div class="desc">
{{ mode === 'linear' ? '直接映射 (1:1)' : '查询提取 (N:M)' }}
</div>
<div class="animation-dots" v-if="mode === 'qformer'">
<div class="dot"></div>
<div class="dot"></div>
<div class="dot"></div>
</div>
</div>
<div class="arrow-line"></div>
</div>
<!-- Output: LLM Tokens -->
<div class="stage">
<div class="label">LLM Tokens</div>
<div class="token-container output">
<div
v-for="n in (mode === 'linear' ? 16 : 4)"
:key="n"
class="token llm"
></div>
</div>
<div class="count">
{{ mode === 'linear' ? '256 Tokens (保留全部细节)' : '32 Tokens (只保留关键信息)' }}
</div>
</div>
</div>
<div class="explanation">
<div v-if="mode === 'linear'">
<strong>Linear Projector:</strong>
简单高效。它像一个直译器，保留了所有的视觉信息。虽然 Token 数量多、计算量大，但对细节的把控更好。
</div>
<div v-else>
<strong>Q-Former:</strong>
精细优雅。它使用一组查询向量，主动去图像中提取与文本相关的信息，大大压缩了 Token 数量，让 LLM 跑得更快。
</div>
</div>
</div>
</template>
<script setup>
import { ref } from 'vue'
const mode = ref('linear')
</script>
<style scoped>
.projector-demo {
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
padding: 20px;
background: var(--vp-c-bg-soft);
margin: 20px 0;
}
.mode-switch {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 30px;
}
.mode-switch button {
padding: 6px 16px;
border-radius: 20px;
border: 1px solid var(--vp-c-divider);
background: var(--vp-c-bg);
cursor: pointer;
transition: all 0.2s;
}
.mode-switch button.active {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
}
.pipeline {
display: flex;
align-items: center;
justify-content: space-between;
gap: 10px;
}
.stage {
display: flex;
flex-direction: column;
align-items: center;
gap: 8px;
flex: 1;
}
.label {
font-size: 0.8em;
color: var(--vp-c-text-2);
font-weight: 600;
}
.token-container {
display: grid;
gap: 4px;
padding: 10px;
background: var(--vp-c-bg);
border-radius: 6px;
border: 1px solid var(--vp-c-divider);
}
.token-container.input {
grid-template-columns: repeat(4, 1fr);
}
.token-container.output {
grid-template-columns: repeat(4, 1fr);
}
.token {
width: 12px;
height: 12px;
border-radius: 2px;
}
.token.visual {
background-color: #3b82f6;
}
.token.llm {
background-color: #10b981;
}
.connector {
flex: 0.5;
display: flex;
flex-direction: row;
align-items: center;
}
.projector-box {
background: var(--vp-c-bg-mute);
border: 2px solid var(--vp-c-brand);
border-radius: 8px;
padding: 10px;
text-align: center;
min-width: 100px;
transition: all 0.3s;
}
.projector-box.qformer {
border-color: #8b5cf6;
background: rgba(139, 92, 246, 0.1);
}
.title {
font-weight: bold;
font-size: 0.9em;
}
.desc {
font-size: 0.7em;
color: var(--vp-c-text-2);
}
.count {
font-size: 0.8em;
color: var(--vp-c-text-3);
}
.explanation {
margin-top: 20px;
padding: 12px;
background: var(--vp-c-bg-mute);
border-radius: 6px;
font-size: 0.9em;
line-height: 1.6;
}
.arrow-line {
height: 2px;
background: var(--vp-c-divider);
flex-grow: 1;
}
.animation-dots {
display: flex;
justify-content: center;
gap: 4px;
margin-top: 4px;
}
.dot {
width: 4px;
height: 4px;
border-radius: 50%;
background: #8b5cf6;
animation: pulse 1s infinite;
}
.dot:nth-child(2) { animation-delay: 0.2s; }
.dot:nth-child(3) { animation-delay: 0.4s; }
@keyframes pulse {
0%, 100% { opacity: 0.3; }
50% { opacity: 1; }
}
</style>
@@ -0,0 +1,210 @@
<template>
  <!--
    Two-stage training pipeline demo.
    `stage` (1 or 2) selects which components are drawn as frozen vs. trainable
    and which loss / data example is displayed.
  -->
  <div class="pipeline-demo">
    <!-- Stage selector; aria-pressed mirrors the visual "active" state -->
    <div class="stage-switch">
      <button
        type="button"
        :class="{ active: stage === 1 }"
        :aria-pressed="stage === 1"
        @click="stage = 1"
      >
        阶段一特征对齐
      </button>
      <button
        type="button"
        :class="{ active: stage === 2 }"
        :aria-pressed="stage === 2"
        @click="stage = 2"
      >
        阶段二指令微调
      </button>
    </div>
    <div class="pipeline-visual">
      <!-- Image Input -->
      <div class="component-box image-input">
        <div class="icon">🖼</div>
        <div class="name">Image</div>
      </div>
      <!-- Connectors are purely decorative, so they are hidden from assistive tech -->
      <div class="arrow" aria-hidden="true">→</div>
      <!-- Vision Encoder: frozen in both stages, hence a static class -->
      <div class="component-box encoder frozen">
        <!-- Same badge text as the LLM box uses for its frozen state -->
        <div class="status-badge">❄️ Frozen</div>
        <div class="name">ViT</div>
        <div class="desc">Vision Encoder</div>
      </div>
      <div class="arrow" aria-hidden="true">→</div>
      <!-- Projector: trained in both stages, hence a static class -->
      <div class="component-box projector training">
        <div class="status-badge fire">🔥 Train</div>
        <div class="name">Projector</div>
        <div class="desc">Adapter</div>
      </div>
      <div class="arrow" aria-hidden="true">→</div>
      <!-- LLM: frozen in stage 1, trained in stage 2 -->
      <div class="component-box llm" :class="{ frozen: stage === 1, training: stage === 2 }">
        <div class="status-badge">{{ stage === 1 ? '❄️ Frozen' : '🔥 Train' }}</div>
        <div class="name">LLM</div>
        <div class="desc">Language Model</div>
      </div>
      <div class="arrow" aria-hidden="true">→</div>
      <!-- Output / Loss -->
      <div class="component-box output">
        <div class="name" v-if="stage === 1">Loss Calculation</div>
        <div class="name" v-else>Text Generation</div>
        <div class="desc" v-if="stage === 1">Contrastive Loss</div>
        <div class="desc" v-else>Next Token Prediction</div>
      </div>
    </div>
    <!-- Example of the training data used in the selected stage -->
    <div class="data-example">
      <div class="data-title">当前训练数据示例</div>
      <div class="data-content" v-if="stage === 1">
        <code>&lt;Image: 🐱&gt;, &lt;Text: "一只猫"&gt;</code>
        <p>任务让图像向量与文本向量距离变近</p>
      </div>
      <div class="data-content" v-else>
        <code>User: &lt;Image: 🐱&gt; 这只猫在干嘛<br/>Assistant: 它在睡觉</code>
        <p>任务根据图像和问题生成回答</p>
      </div>
    </div>
  </div>
</template>
<script setup>
import { ref } from 'vue'
// Currently selected training stage (1 or 2). The template uses it to switch
// the LLM box between frozen/training and to pick the loss and data example.
const stage = ref(1)
</script>
<style scoped>
.pipeline-demo {
padding: 20px;
background: var(--vp-c-bg-soft);
border-radius: 8px;
margin: 20px 0;
}
.stage-switch {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 30px;
}
.stage-switch button {
padding: 8px 16px;
border-radius: 20px;
border: 1px solid var(--vp-c-divider);
background: var(--vp-c-bg);
cursor: pointer;
transition: all 0.2s;
font-weight: bold;
color: var(--vp-c-text-2);
}
.stage-switch button.active {
background: var(--vp-c-brand);
color: white;
border-color: var(--vp-c-brand);
transform: scale(1.05);
}
.pipeline-visual {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 20px;
overflow-x: auto;
padding: 10px 0;
}
.component-box {
border: 2px solid var(--vp-c-divider);
border-radius: 8px;
padding: 15px;
text-align: center;
min-width: 100px;
background: var(--vp-c-bg);
position: relative;
transition: all 0.3s;
}
.component-box.frozen {
background: var(--vp-c-bg-mute);
border-color: var(--vp-c-divider);
opacity: 0.8;
}
/*
 * Highlight for components that are being trained.
 * NOTE(review): --vp-c-brand-dimm and --vp-c-brand-rgb do not appear to be
 * defined by current VitePress default themes (confirm against the theme in
 * use). A var() without a fallback that resolves to nothing invalidates the
 * whole declaration, so each one gets a fallback: the soft brand tint used by
 * the sibling chat demo, then a literal indigo approximation.
 */
.component-box.training {
  border-color: var(--vp-c-brand);
  background: var(--vp-c-brand-dimm, var(--vp-c-brand-soft, rgba(100, 108, 255, 0.08)));
  box-shadow: 0 0 10px rgba(var(--vp-c-brand-rgb, 100, 108, 255), 0.2);
}
.status-badge {
position: absolute;
top: -10px;
left: 50%;
transform: translateX(-50%);
font-size: 0.7em;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
padding: 2px 6px;
border-radius: 10px;
white-space: nowrap;
}
.fire {
color: #f43f5e;
border-color: #f43f5e;
}
.name {
font-weight: bold;
margin-bottom: 4px;
}
.desc {
font-size: 0.8em;
color: var(--vp-c-text-2);
}
.arrow {
font-size: 1.5em;
color: var(--vp-c-text-3);
font-weight: bold;
}
.data-example {
background: var(--vp-c-bg);
padding: 15px;
border-radius: 8px;
border: 1px solid var(--vp-c-divider);
}
.data-title {
font-size: 0.9em;
font-weight: bold;
margin-bottom: 8px;
color: var(--vp-c-text-2);
}
.data-content code {
display: block;
background: var(--vp-c-bg-mute);
padding: 8px;
border-radius: 4px;
margin-bottom: 8px;
font-family: monospace;
}
.data-content p {
margin: 0;
font-size: 0.9em;
color: var(--vp-c-text-2);
}
</style>
@@ -0,0 +1,198 @@
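<!--
VLMInferenceDemo.vue
Multimodal inference demo: a mock chat in which the assistant "observes" the
image, "thinks", and then types out a canned answer.
-->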
<template>
  <!--
    Mock chat window. `step` drives the assistant bubble:
    0 = hidden, 1 = "observing", 2 = "thinking", anything else = typed answer.
  -->
  <div class="vlm-chat-demo">
    <div class="chat-window">
      <!-- Chat History; polite live region so the staged reply is announced -->
      <div class="messages" aria-live="polite">
        <!-- User Message -->
        <div class="message user">
          <div class="avatar">👤</div>
          <div class="bubble">
            <div class="image-upload">
              <div class="placeholder-img">
                🐱
              </div>
            </div>
            <div class="text">这只猫在做什么</div>
          </div>
        </div>
        <!-- Assistant Message -->
        <div class="message assistant" v-if="step > 0">
          <div class="avatar">🤖</div>
          <div class="bubble">
            <div v-if="step === 1" class="thinking">
              <span class="icon">👁</span> 正在观察图片...
            </div>
            <div v-else-if="step === 2" class="thinking">
              <span class="icon">🧠</span> 正在思考...
            </div>
            <div v-else class="content type-writer">
              <!-- The caret is decorative and must not be read out as "vertical bar" -->
              {{ typedText }}<span class="cursor" aria-hidden="true">|</span>
            </div>
          </div>
        </div>
      </div>
    </div>
    <div class="controls">
      <!-- Disabled only while the two pre-answer stages are running -->
      <button
        type="button"
        class="send-btn"
        :disabled="step > 0 && step < 3"
        @click="startInference"
      >
        {{ step === 0 || step === 3 ? '发送 (Send)' : '生成中...' }}
      </button>
    </div>
  </div>
</template>
<script setup>
import { ref, watch } from 'vue'
// Playback stage of the simulated inference, as consumed by the template:
// 0 = idle (no assistant bubble), 1 = "observing image", 2 = "thinking",
// 3 = answer being typed / finished.
const step = ref(0)
// Canned assistant reply that is revealed one character at a time.
const fullText = "它正趴在窗台上晒太阳,看起来非常惬意。"
// Portion of fullText revealed so far; rendered inside the assistant bubble.
const typedText = ref("")
// Handles of the stage timeouts and of the typing interval. They are kept so
// that a new run can cancel whatever a previous run still has scheduled.
let pendingStageTimers = []
let typingInterval = null

// Cancels every timeout/interval left over from an earlier run.
const cancelScheduledWork = () => {
  pendingStageTimers.forEach((id) => clearTimeout(id))
  pendingStageTimers = []
  if (typingInterval !== null) {
    clearInterval(typingInterval)
    typingInterval = null
  }
}

/**
 * Starts (or restarts) the simulated inference.
 *
 * Walks `step` through 1 -> 2 -> 3 with a 1.5 s pause between stages and then
 * hands over to typeText(). The Send button is enabled again once step === 3,
 * which can be while the previous answer is still being typed, so any
 * outstanding timers are cancelled first. Without that, the old interval keeps
 * appending to `typedText` and the next answer comes out garbled.
 */
const startInference = () => {
  cancelScheduledWork()
  step.value = 1
  typedText.value = ""
  // Step 1: Vision Encoding
  const toThinking = setTimeout(() => {
    step.value = 2
    // Step 2: Thinking
    const toTyping = setTimeout(() => {
      step.value = 3
      typeText()
    }, 1500)
    pendingStageTimers.push(toTyping)
  }, 1500)
  pendingStageTimers.push(toThinking)
}

/**
 * Reveals `fullText` in `typedText` one character every 100 ms.
 *
 * Always starts from an empty string and replaces any interval that is still
 * running, so at most one typing loop writes to `typedText` at a time.
 */
const typeText = () => {
  if (typingInterval !== null) {
    clearInterval(typingInterval)
  }
  typedText.value = ""
  let i = 0
  typingInterval = setInterval(() => {
    if (i < fullText.length) {
      typedText.value += fullText[i]
      i++
    } else {
      clearInterval(typingInterval)
      typingInterval = null
    }
  }, 100)
}
</script>
<style scoped>
.vlm-chat-demo {
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
background: var(--vp-c-bg);
overflow: hidden;
max-width: 500px;
margin: 20px auto;
box-shadow: 0 4px 12px rgba(0,0,0,0.05);
}
.chat-window {
padding: 20px;
background: var(--vp-c-bg-soft);
min-height: 300px;
}
.message {
display: flex;
gap: 12px;
margin-bottom: 20px;
}
.message.user {
flex-direction: row-reverse;
}
.avatar {
width: 36px;
height: 36px;
border-radius: 50%;
background: var(--vp-c-bg-mute);
display: flex;
align-items: center;
justify-content: center;
font-size: 20px;
border: 1px solid var(--vp-c-divider);
}
.bubble {
background: var(--vp-c-bg);
padding: 12px;
border-radius: 12px;
border: 1px solid var(--vp-c-divider);
max-width: 80%;
box-shadow: 0 2px 4px rgba(0,0,0,0.02);
}
/*
 * User bubble tint.
 * NOTE(review): --vp-c-brand-light looks like a legacy VitePress variable while
 * --vp-c-brand-soft is from the newer palette; a theme may not define both
 * (confirm against the theme in use). The fallback chain keeps the border on a
 * brand colour instead of letting the declaration be dropped.
 */
.message.user .bubble {
  background: var(--vp-c-brand-soft);
  border-color: var(--vp-c-brand-light, var(--vp-c-brand-2, var(--vp-c-brand)));
}
.image-upload {
margin-bottom: 8px;
}
.placeholder-img {
width: 100px;
height: 100px;
background: #e2e8f0;
border-radius: 8px;
display: flex;
align-items: center;
justify-content: center;
font-size: 40px;
}
.controls {
padding: 15px;
border-top: 1px solid var(--vp-c-divider);
display: flex;
justify-content: flex-end;
}
.send-btn {
background: var(--vp-c-brand);
color: white;
border: none;
padding: 8px 20px;
border-radius: 20px;
font-weight: 600;
cursor: pointer;
transition: opacity 0.2s;
}
.send-btn:disabled {
opacity: 0.6;
cursor: not-allowed;
}
.thinking {
color: var(--vp-c-text-2);
font-style: italic;
display: flex;
align-items: center;
gap: 6px;
}
.cursor {
display: inline-block;
width: 2px;
background: currentColor;
animation: blink 1s infinite;
}
@keyframes blink {
0%, 100% { opacity: 1; }
50% { opacity: 0; }
}
</style>
@@ -0,0 +1,348 @@
<template>
  <!--
    ViT output demo. Hovering or focusing a patch (stage 1) or its vector
    (stage 2) sets `activeIndex`, which selects the entry of `items` shown in
    the semantic panel.
  -->
  <div class="vit-output-demo">
    <div class="pipeline">
      <!-- 1. Transformer Output Grid -->
      <div class="stage">
        <div class="stage-label">1. Processed Patches (Grid)</div>
        <div class="grid-container">
          <!-- tabindex + focus give keyboard users the same access as hover -->
          <div
            v-for="(item, index) in items"
            :key="index"
            class="grid-item"
            :class="{ active: activeIndex === index }"
            tabindex="0"
            @mouseenter="activeIndex = index"
            @focus="activeIndex = index"
          >
            <span class="icon">{{ item.icon }}</span>
          </div>
        </div>
      </div>
      <div class="arrow-section">
        <div class="arrow-line"></div>
        <div class="arrow-text">Flatten &amp; Output</div>
      </div>
      <!-- 2. Feature Vector Sequence -->
      <div class="stage">
        <div class="stage-label">2. Feature Vector Sequence (The "Image Sentence")</div>
        <div class="vector-sequence">
          <div
            v-for="(item, index) in items"
            :key="index"
            class="vector-wrapper"
            :class="{ active: activeIndex === index }"
            tabindex="0"
            @mouseenter="activeIndex = index"
            @focus="activeIndex = index"
          >
            <div class="vector-col">
              <!-- Simulated vector dimensions -->
              <div class="v-cell" :style="{ opacity: 0.9, background: item.color }"></div>
              <div class="v-cell" :style="{ opacity: 0.7, background: item.color }"></div>
              <div class="v-cell" :style="{ opacity: 0.5, background: item.color }"></div>
              <div class="v-cell" :style="{ opacity: 0.8, background: item.color }"></div>
              <div class="v-cell" :style="{ opacity: 0.6, background: item.color }"></div>
            </div>
            <div class="vector-idx">{{ index + 1 }}</div>
          </div>
        </div>
      </div>
    </div>
    <!-- 3. Semantic Panel -->
    <div class="semantic-panel">
      <div v-if="activeIndex !== -1" class="semantic-content">
        <div class="header" :style="{ borderColor: items[activeIndex].color }">
          <span class="large-icon">{{ items[activeIndex].icon }}</span>
          <div class="title-group">
            <span class="title">Token #{{ activeIndex + 1 }}: {{ items[activeIndex].label }}</span>
            <span class="subtitle">Type: {{ items[activeIndex].type }}</span>
          </div>
        </div>
        <div class="desc">
          <div class="vector-repr">
            <span class="label">Vector Value:</span>
            <!--
              Mock values derived from the token index rather than Math.random(),
              so a given token always shows the same vector and the fractional
              part is always two digits.
            -->
            <span class="code" :style="{ color: items[activeIndex].color }">
              [0.{{ String((activeIndex * 37 + 12) % 100).padStart(2, '0') }}, -0.{{ String((activeIndex * 53 + 7) % 100).padStart(2, '0') }}, 1.{{ String((activeIndex * 71 + 45) % 100).padStart(2, '0') }}, ...]
            </span>
          </div>
          <div class="meaning">
            <strong>🤖 What ViT sees (Semantic):</strong>
            <p>{{ items[activeIndex].desc }}</p>
          </div>
        </div>
      </div>
      <div v-else class="placeholder">
        <span class="hint-icon">👆</span>
        <span class="hint-text">悬停在上方方块或向量上查看 ViT 输出的语义特征</span>
      </div>
    </div>
  </div>
</template>
<script setup>
import { ref } from 'vue'
// Index into `items` of the patch/vector currently hovered; -1 means nothing
// is selected yet and the template shows the placeholder hint instead.
const activeIndex = ref(-1)
// The nine image patches, listed in the order the template renders them
// (the grid CSS lays them out three per row). Each entry carries:
//   icon  - emoji drawn in the grid cell and panel header
//   label - token name shown in the panel title
//   type  - category shown in the panel subtitle
//   color - colour of the vector cells and panel accents
//   desc  - explanatory text shown in the semantic panel
const items = [
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Recognized as outdoor nature elements (Trees/Greenery). Low relevance to main subject.' },
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Redundant background info. Contextualizes the scene as "Outdoors".' },
{ icon: '☁️', label: 'Sky', type: 'Environment', color: '#2196f3', desc: 'Spatial context: Upper region, open area.' },
{ icon: '👂', label: 'Cat Ear', type: 'Subject Part', color: '#ff9800', desc: 'High Importance. Identified as "Feline Feature". Strongly linked to "Cat Face".' },
{ icon: '😼', label: 'Cat Face', type: 'Subject Core', color: '#ff5722', desc: 'Global Focus Center. Contains "Eyes", "Whiskers". Aggregates info from surrounding patches.' },
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Background noise.' },
{ icon: '🐾', label: 'Cat Paw', type: 'Subject Part', color: '#ff9800', desc: 'Action component. Suggests "Standing" or "Walking" posture.' },
{ icon: '🧶', label: 'Yarn', type: 'Object', color: '#e91e63', desc: 'Interacting Object. Semantically linked to "Play" or "Toy".' },
{ icon: '🌱', label: 'Grass', type: 'Environment', color: '#8bc34a', desc: 'Ground context. Confirms "Ground level" view.' }
]
</script>
<style scoped>
.vit-output-demo {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 12px;
padding: 24px;
font-family: system-ui, -apple-system, sans-serif;
max-width: 700px;
margin: 20px auto;
}
.dark .vit-output-demo {
background: #1e1e20;
border-color: #2d2d30;
color: #e0e0e0;
}
.pipeline {
display: flex;
flex-direction: column;
gap: 16px;
align-items: center;
}
.stage {
width: 100%;
display: flex;
flex-direction: column;
align-items: center;
}
.stage-label {
font-size: 12px;
text-transform: uppercase;
color: #868e96;
margin-bottom: 8px;
font-weight: 600;
}
/* Grid Stage */
.grid-container {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 6px;
background: #fff;
padding: 8px;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.05);
}
.dark .grid-container {
background: #252529;
}
.grid-item {
width: 40px;
height: 40px;
display: flex;
align-items: center;
justify-content: center;
background: #f1f3f5;
border-radius: 4px;
cursor: pointer;
transition: all 0.2s;
font-size: 20px;
}
.dark .grid-item {
background: #343a40;
}
.grid-item:hover, .grid-item.active {
background: #e7f5ff;
transform: scale(1.1);
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.dark .grid-item:hover, .dark .grid-item.active {
background: #1c7ed6;
}
/* Arrow */
.arrow-section {
display: flex;
align-items: center;
gap: 8px;
color: #adb5bd;
}
.arrow-line {
width: 2px;
height: 20px;
background: #dee2e6;
}
/* Vector Sequence Stage */
.vector-sequence {
display: flex;
gap: 4px;
padding: 10px;
background: #fff;
border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,0.05);
overflow-x: auto;
max-width: 100%;
}
.dark .vector-sequence {
background: #252529;
}
.vector-wrapper {
display: flex;
flex-direction: column;
align-items: center;
gap: 4px;
cursor: pointer;
padding: 4px;
border-radius: 4px;
transition: background 0.2s;
}
.vector-wrapper:hover, .vector-wrapper.active {
background: rgba(0,0,0,0.05);
}
.dark .vector-wrapper:hover, .dark .vector-wrapper.active {
background: rgba(255,255,255,0.1);
}
.vector-col {
display: flex;
flex-direction: column;
gap: 1px;
}
.v-cell {
width: 12px;
height: 6px;
border-radius: 1px;
}
.vector-idx {
font-size: 10px;
color: #adb5bd;
}
/* Semantic Panel */
.semantic-panel {
margin-top: 24px;
background: #fff;
border: 1px solid #e9ecef;
border-radius: 8px;
padding: 16px;
min-height: 120px;
display: flex;
align-items: center;
justify-content: center;
}
.dark .semantic-panel {
background: #252529;
border-color: #343a40;
}
.placeholder {
color: #868e96;
font-size: 14px;
display: flex;
align-items: center;
gap: 8px;
}
.semantic-content {
width: 100%;
text-align: left;
}
.header {
display: flex;
align-items: center;
gap: 12px;
margin-bottom: 12px;
padding-bottom: 12px;
border-bottom: 2px solid #eee;
}
.large-icon {
font-size: 32px;
background: #f8f9fa;
width: 48px;
height: 48px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 8px;
}
.dark .large-icon {
background: #343a40;
}
.title-group {
display: flex;
flex-direction: column;
}
.title {
font-weight: bold;
font-size: 16px;
color: #343a40;
}
.dark .title {
color: #f8f9fa;
}
.subtitle {
font-size: 12px;
color: #868e96;
}
.desc {
font-size: 14px;
color: #495057;
}
.dark .desc {
color: #ced4da;
}
.vector-repr {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 8px;
font-family: 'Menlo', monospace;
font-size: 12px;
background: #f1f3f5;
padding: 4px 8px;
border-radius: 4px;
width: fit-content;
}
.dark .vector-repr {
background: #343a40;
}
.label {
color: #868e96;
}
.meaning strong {
display: block;
margin-bottom: 4px;
color: #212529;
}
.dark .meaning strong {
color: #f8f9fa;
}
</style>
@@ -0,0 +1,758 @@
<template>
  <!--
    VLM quick-start demo: pick a scenario, "upload" the mock image, then click
    a preset question to stream a canned answer into the chat pane.
  -->
  <div class="vlm-quick-start">
    <div class="header">
      <div class="title">👁 VLM 初体验不只是看图说话</div>
      <div class="subtitle">选择不同场景体验多模态模型的多种能力</div>
    </div>
    <!-- Scenario tabs; aria-pressed mirrors the visual "active" state -->
    <div class="scenario-tabs">
      <button
        v-for="s in scenarios"
        :key="s.id"
        type="button"
        class="tab-btn"
        :class="{ active: currentScenario === s.id }"
        :aria-pressed="currentScenario === s.id"
        @click="switchScenario(s.id)"
      >
        {{ s.name }}
      </button>
    </div>
    <div class="demo-container">
      <!-- Image Area -->
      <div class="image-area">
        <div class="image-placeholder" :class="{ loaded: hasImage, 'receipt-bg': currentScenario === 'ocr' }">
          <div v-if="!hasImage" class="upload-prompt">
            <div class="icon">🖼</div>
            <button type="button" class="upload-btn" @click="loadImage">
              上传图片 (模拟)
            </button>
          </div>
          <div v-else class="image-content">
            <!-- Chat: Landscape -->
            <div v-if="currentScenario === 'chat'" class="real-image-container landscape">
              <div class="real-image">🏔</div>
              <!-- The weather answer refers to this sun, so it has to be drawn -->
              <div class="sun">☀️</div>
              <div class="tree">🌲</div>
            </div>
            <!-- Detection: Fruits -->
            <div v-else-if="currentScenario === 'detection'" class="real-image-container fruits">
              <div class="real-image">
                <span class="fruit apple">🍎</span>
                <span class="fruit banana">🍌</span>
                <span class="fruit grape">🍇</span>
              </div>
              <!-- Boxes appear once an answer with the 'showBox' action has finished -->
              <div v-if="showBoundingBox" class="bounding-box apple-box" title="Apple">
                <span class="box-label">apple: 0.98</span>
              </div>
              <div v-if="showBoundingBox" class="bounding-box banana-box" title="Banana">
                <span class="box-label">banana: 0.95</span>
              </div>
            </div>
            <!-- Analysis: Factory Safety -->
            <div v-else-if="currentScenario === 'analysis'" class="factory-image">
              <div class="safety-sign">⚠️ 安全生产</div>
              <div class="worker-container">
                <span class="worker">👷</span>
                <!-- The safety answer says the worker wears this helmet -->
                <span class="helmet">⛑️</span>
              </div>
              <div class="machinery">⚙️</div>
            </div>
            <!-- OCR: Receipt -->
            <div v-else class="receipt-image">
              <div class="receipt-header">🧾 RECEIPT</div>
              <div class="receipt-body">
                <div class="line"><span>Coffee</span><span>$4.50</span></div>
                <div class="line"><span>Bagel</span><span>$3.00</span></div>
                <div class="line total"><span>TOTAL</span><span>$7.50</span></div>
                <div class="line date"><span>2023-10-24</span></div>
              </div>
            </div>
            <div class="image-label">
              {{ getImageLabel() }}
            </div>
          </div>
        </div>
      </div>
      <!-- Chat Area -->
      <div class="chat-area">
        <!-- Polite live region so streamed answers are announced -->
        <div class="messages" ref="messagesRef" aria-live="polite">
          <div v-if="messages.length === 0" class="empty-text">
            {{ hasImage ? '图片已就绪请选择指令' : '请先上传图片' }}
          </div>
          <div v-for="(msg, index) in messages" :key="index" class="message" :class="msg.role">
            <div class="content">
              <div v-if="msg.isJson" class="json-content">
                <pre>{{ msg.content }}</pre>
              </div>
              <span v-else>{{ msg.content }}</span>
              <!-- Decorative typing caret on the answer that is still streaming -->
              <span v-if="msg.role === 'assistant' && isGenerating && index === messages.length - 1" class="cursor" aria-hidden="true">|</span>
            </div>
          </div>
        </div>
        <div class="input-area">
          <div class="quick-actions" v-if="hasImage && !isGenerating">
            <button v-for="q in currentQuestions" :key="q" type="button" @click="ask(q)" class="action-btn">
              {{ q }}
            </button>
          </div>
          <div class="status-text" v-else-if="isGenerating">
            AI 正在观察图片并思考...
          </div>
          <div class="status-text" v-else>
            等待图片上传...
          </div>
        </div>
      </div>
    </div>
  </div>
</template>
<script setup>
import { ref, computed, nextTick } from 'vue'
// Tabs offered by the demo. `id` keys into questionsMap / answersMap and is
// compared in the template to pick the mock image; `name` is the tab caption.
const scenarios = [
{ id: 'chat', name: '通用对话' },
{ id: 'detection', name: '目标检测' },
{ id: 'ocr', name: 'OCR 提取' },
{ id: 'analysis', name: '业务风控' }
]
// Id of the scenario whose tab is selected.
const currentScenario = ref('chat')
// True once the mock image has been "uploaded" for the current scenario.
const hasImage = ref(false)
// True while ask() is producing an answer; hides the quick-action buttons.
const isGenerating = ref(false)
// Whether the detection boxes are drawn over the fruit image.
const showBoundingBox = ref(false)
// Chat history as { role, content, isJson? } objects, rendered in order.
const messages = ref([])
// Template ref to the scrollable message list (used by scrollToBottom).
const messagesRef = ref(null)
// Preset questions per scenario id. Each string is rendered as a quick-action
// button and is also the lookup key into answersMap for that scenario, so the
// text here must match the answersMap keys exactly.
const questionsMap = {
chat: [
"这里是哪里?",
"描述一下天气",
"写首关于这座山的诗"
],
detection: [
"检测图中的水果",
"数数有几个苹果",
"输出检测框坐标"
],
ocr: [
"提取所有文字",
"总金额是多少?",
"消费日期是哪天?"
],
analysis: [
"工人是否佩戴安全帽?",
"检测现场安全隐患",
"输出风险评估报告"
]
}
// Canned answers, keyed by scenario id and then by the exact question text.
// A value is either:
//   - a plain string, streamed as normal text, or
//   - an object { type, text, action? } where type 'json' makes ask() render
//     `text` in a <pre> block and action 'showBox' makes ask() turn on the
//     detection bounding boxes after the answer has been streamed.
const answersMap = {
chat: {
"这里是哪里?": "这是一张高山风景照。远处是覆盖着皑皑白雪的山峰,可能是阿尔卑斯山或喜马拉雅山脉。山脚下有郁郁葱葱的松树林。",
"描述一下天气": "天气看起来非常晴朗,阳光明媚(☀️),能见度很高。蓝天白云,是一个适合登山或滑雪的好天气。",
"写首关于这座山的诗": "🏔️ 雪岭插云天,\n🌲 松涛响翠烟。\n☀️ 金阳融冷色,\n🏞️ 壮丽入心田。"
},
detection: {
"检测图中的水果": {
type: 'json',
text: JSON.stringify({ objects: ['apple', 'banana', 'grape'], count: 3 }, null, 2),
action: 'showBox'
},
"数数有几个苹果": "图中检测到 1 个苹果(🍎)。",
"输出检测框坐标": {
type: 'json',
text: JSON.stringify({
objects: [
{ label: 'apple', box: [15, 15, 85, 85] },
{ label: 'banana', box: [95, 15, 165, 85] }
]
}, null, 2),
action: 'showBox'
}
},
ocr: {
"提取所有文字": {
type: 'json',
text: JSON.stringify({
lines: [
"RECEIPT",
"Coffee $4.50",
"Bagel $3.00",
"TOTAL $7.50",
"2023-10-24"
]
}, null, 2)
},
"总金额是多少?": "这张小票的总金额是 $7.50。",
"消费日期是哪天?": "消费日期是 2023年10月24日。"
},
analysis: {
"工人是否佩戴安全帽?": "检测到画面中有一名工人(👷),已正确佩戴红色安全帽(⛑️)。",
"检测现场安全隐患": {
type: 'json',
text: JSON.stringify({ hazards: [], safety_score: 100, status: "SAFE" }, null, 2)
},
"输出风险评估报告": "✅ **安全合规**\n- 人员:1人\n- 防护装备:齐全\n- 机械设备:正常运行中\n- 风险等级:低"
}
}
// Caption shown under the mock image: the pretend file name for the selected
// scenario, or undefined when the scenario id has no entry.
const getImageLabel = () => ({
  chat: '已上传:雪山风景.jpg',
  detection: '已上传:水果果盘.jpg',
  ocr: '已上传:购物小票.jpg',
  analysis: '已上传:车间监控.jpg'
})[currentScenario.value]
// Preset questions for the selected scenario; an empty list when the scenario
// has no entry in questionsMap.
const currentQuestions = computed(() => {
  const presets = questionsMap[currentScenario.value]
  return presets || []
})
/**
 * Activates the scenario with the given id and resets all per-scenario state.
 *
 * The tabs stay clickable while an answer is being generated, so the
 * "generating" flag is cleared here as well. Otherwise a switch made
 * mid-generation could leave the input area stuck on the generating status
 * with no quick actions.
 *
 * @param {string} id - one of the ids listed in `scenarios`
 */
const switchScenario = (id) => {
  currentScenario.value = id
  hasImage.value = false
  messages.value = []
  showBoundingBox.value = false
  isGenerating.value = false
}
// Simulated upload: shows the scenario's mock image and begins with an empty
// conversation and no detection boxes.
const loadImage = () => {
  showBoundingBox.value = false
  messages.value = [] // Clear history
  hasImage.value = true
}
// Sequence number of the most recent ask() call. An interrupted run uses it to
// tell whether a newer run now owns the `isGenerating` flag.
let latestAskRun = 0

/**
 * Simulates asking the model `question` about the current image.
 *
 * Appends the user message, waits 800 ms, then streams the canned answer from
 * answersMap into a new assistant message (5 characters per tick for JSON
 * answers, 1 otherwise, 20 ms between ticks). If the answer carries the
 * 'showBox' action, the bounding boxes are switched on once streaming is done.
 *
 * The scenario tabs stay clickable during generation, and switching replaces
 * `messages.value` with a new array. The run therefore keeps a reference to
 * the conversation it started in and stops quietly once that conversation has
 * been discarded, instead of indexing into the new, empty array. `finally`
 * guarantees the generating flag is released on every exit path.
 *
 * @param {string} question - text of the clicked quick-action button
 * @returns {Promise<void>}
 */
const ask = async (question) => {
  const runId = ++latestAskRun
  const thread = messages.value
  const scenarioId = currentScenario.value
  // True once messages.value has been replaced, i.e. this chat was reset.
  const isStale = () => messages.value !== thread

  thread.push({ role: 'user', content: question })
  isGenerating.value = true
  try {
    await wait(800) // Simulate vision encoding time
    if (isStale()) return

    const scenarioAnswers = answersMap[scenarioId] || {}
    const rawAnswer = scenarioAnswers[question] || "我还在学习这个任务..."
    let content = ''
    let isJson = false
    let action = null
    if (typeof rawAnswer === 'object') {
      content = rawAnswer.text
      isJson = rawAnswer.type === 'json'
      action = rawAnswer.action
    } else {
      content = rawAnswer
    }

    thread.push({ role: 'assistant', content: '', isJson })
    // Hold the message object itself rather than an index into messages.value.
    const reply = thread[thread.length - 1]

    // Streaming effect
    const stepSize = isJson ? 5 : 1 // JSON types faster
    for (let i = 0; i < content.length; i += stepSize) {
      reply.content += content.slice(i, i + stepSize)
      scrollToBottom()
      await wait(20)
      if (isStale()) return
    }

    if (action === 'showBox') {
      showBoundingBox.value = true
    }
  } finally {
    // Only the newest run may clear the flag; an older, cancelled run must not
    // re-enable the quick actions while a later answer is still streaming.
    if (runId === latestAskRun) {
      isGenerating.value = false
    }
  }
}
// Returns a promise that resolves after `ms` milliseconds.
const wait = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms)
})
// After the next DOM update, scrolls the message list to its end so the newest
// text stays visible. Does nothing while the list element is not available.
const scrollToBottom = () => {
  nextTick(() => {
    const pane = messagesRef.value
    if (!pane) return
    pane.scrollTop = pane.scrollHeight
  })
}
</script>
<style scoped>
.vlm-quick-start {
background: var(--vp-c-bg-soft);
border: 1px solid var(--vp-c-divider);
border-radius: 12px;
padding: 20px;
margin: 20px 0;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
}
.header {
text-align: center;
margin-bottom: 20px;
}
.title {
font-size: 18px;
font-weight: bold;
margin-bottom: 5px;
}
.subtitle {
font-size: 13px;
color: var(--vp-c-text-2);
}
.scenario-tabs {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 20px;
flex-wrap: wrap;
}
.tab-btn {
padding: 6px 16px;
border-radius: 20px;
border: 1px solid transparent;
background: var(--vp-c-bg);
color: var(--vp-c-text-2);
font-size: 13px;
cursor: pointer;
transition: all 0.2s;
}
.tab-btn.active {
background: var(--vp-c-brand);
color: white;
font-weight: bold;
}
.tab-btn:hover:not(.active) {
background: var(--vp-c-bg-mute);
}
.demo-container {
display: flex;
gap: 20px;
height: 340px;
}
/* Image Area */
.image-area {
flex: 1;
background: var(--vp-c-bg);
border: 1px dashed var(--vp-c-divider);
border-radius: 8px;
overflow: hidden;
position: relative;
}
.image-placeholder {
width: 100%;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
transition: all 0.3s;
}
.image-placeholder.loaded {
background: #fff4e6;
border: none;
}
.image-placeholder.receipt-bg {
background: #f0f0f0;
}
.upload-prompt .icon {
font-size: 48px;
margin-bottom: 10px;
text-align: center;
}
.upload-btn {
background: var(--vp-c-brand);
color: white;
border: none;
padding: 8px 16px;
border-radius: 6px;
cursor: pointer;
font-size: 13px;
transition: opacity 0.2s;
}
.upload-btn:hover {
opacity: 0.9;
}
.image-content {
text-align: center;
position: relative;
width: 100%;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
}
.real-image-container {
position: relative;
display: inline-block;
}
/* Landscape Style */
.real-image-container.landscape {
width: 100%;
height: 100%;
display: flex;
align-items: center;
justify-content: center;
background: linear-gradient(to bottom, #87CEEB 50%, #e0e0e0 50%);
border-radius: 8px;
overflow: hidden;
position: absolute;
top: 0;
left: 0;
}
.landscape .real-image {
font-size: 80px;
z-index: 2;
margin-top: 20px;
}
.landscape .sun {
position: absolute;
top: 20px;
right: 20px;
font-size: 40px;
animation: spin 10s linear infinite;
}
.landscape .tree {
position: absolute;
bottom: 20px;
left: 20px;
font-size: 40px;
z-index: 3;
}
/* Fruits Style */
.real-image-container.fruits {
padding: 20px;
}
.real-image-container.fruits .real-image {
display: flex;
gap: 20px;
}
.real-image-container.fruits .fruit {
font-size: 60px;
display: inline-block;
animation: popIn 0.5s ease;
}
.bounding-box.apple-box {
left: 15px;
top: 15px;
width: 70px;
height: 75px;
right: auto;
bottom: auto;
}
.bounding-box.banana-box {
left: 95px;
top: 15px;
width: 70px;
height: 75px;
right: auto;
bottom: auto;
}
/* Factory Style */
.factory-image {
background: #f8f9fa;
border: 2px solid #e9ecef;
border-radius: 8px;
padding: 20px;
width: 260px;
height: 180px;
position: relative;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
animation: slideUp 0.5s ease;
}
.safety-sign {
position: absolute;
top: 10px;
left: 10px;
font-size: 12px;
background: #ffeb3b;
color: #000;
padding: 2px 6px;
border-radius: 4px;
border: 1px solid #fbc02d;
font-weight: bold;
}
.worker-container {
font-size: 80px;
position: relative;
z-index: 2;
}
.worker-container .helmet {
position: absolute;
top: -15px;
left: 15px;
font-size: 40px;
z-index: 3;
}
.machinery {
position: absolute;
bottom: 10px;
right: 10px;
font-size: 50px;
opacity: 0.8;
animation: spin 5s linear infinite;
}
.real-image {
font-size: 80px;
margin-bottom: 10px;
animation: popIn 0.5s cubic-bezier(0.175, 0.885, 0.32, 1.275);
}
.bounding-box {
position: absolute;
top: -10px;
left: -10px;
right: -10px;
bottom: 0px;
border: 2px solid #ef4444;
background: rgba(239, 68, 68, 0.1);
border-radius: 4px;
animation: fadeIn 0.3s ease;
}
.box-label {
position: absolute;
top: -20px;
left: -2px;
background: #ef4444;
color: white;
font-size: 10px;
padding: 2px 4px;
border-radius: 2px;
}
/* Receipt Style */
.receipt-image {
background: white;
padding: 15px;
width: 160px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
font-family: 'Courier New', Courier, monospace;
font-size: 11px;
text-align: left;
margin-bottom: 10px;
animation: slideUp 0.5s ease;
}
.receipt-header {
text-align: center;
font-weight: bold;
border-bottom: 1px dashed #ccc;
padding-bottom: 8px;
margin-bottom: 8px;
}
.receipt-body .line {
display: flex;
justify-content: space-between;
margin-bottom: 4px;
}
.receipt-body .total {
border-top: 1px dashed #ccc;
padding-top: 4px;
margin-top: 4px;
font-weight: bold;
}
.receipt-body .date {
margin-top: 8px;
justify-content: center;
color: #888;
font-size: 10px;
}
.image-label {
font-size: 12px;
color: #666;
background: rgba(255,255,255,0.8);
padding: 4px 8px;
border-radius: 4px;
position: absolute;
bottom: 10px;
}
/* Chat Area */
.chat-area {
flex: 1.2;
display: flex;
flex-direction: column;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 8px;
}
.messages {
flex: 1;
padding: 15px;
overflow-y: auto;
display: flex;
flex-direction: column;
gap: 12px;
}
.empty-text {
text-align: center;
color: var(--vp-c-text-3);
margin-top: 40px;
font-size: 13px;
}
.message {
max-width: 90%;
padding: 10px;
border-radius: 10px;
font-size: 13px;
line-height: 1.5;
}
.message.user {
align-self: flex-end;
background: var(--vp-c-brand);
color: white;
border-bottom-right-radius: 2px;
}
.message.assistant {
align-self: flex-start;
background: var(--vp-c-bg-mute);
color: var(--vp-c-text-1);
border-bottom-left-radius: 2px;
}
.json-content pre {
margin: 0;
white-space: pre-wrap;
font-family: monospace;
font-size: 11px;
}
.input-area {
padding: 15px;
border-top: 1px solid var(--vp-c-divider);
min-height: 60px;
display: flex;
align-items: center;
justify-content: center;
}
.quick-actions {
display: flex;
flex-wrap: wrap;
gap: 8px;
justify-content: center;
}
.action-btn {
padding: 6px 12px;
background: var(--vp-c-bg);
border: 1px solid var(--vp-c-divider);
border-radius: 16px;
cursor: pointer;
font-size: 12px;
transition: all 0.2s;
}
.action-btn:hover {
border-color: var(--vp-c-brand);
color: var(--vp-c-brand);
background: var(--vp-c-bg-mute);
}
.status-text {
font-size: 12px;
color: var(--vp-c-text-3);
}
.cursor {
display: inline-block;
width: 2px;
height: 14px;
background: currentColor;
animation: blink 1s infinite;
vertical-align: middle;
}
@keyframes popIn {
from { transform: scale(0); opacity: 0; }
to { transform: scale(1); opacity: 1; }
}
@keyframes slideUp {
from { transform: translateY(20px); opacity: 0; }
to { transform: translateY(0); opacity: 1; }
}
@keyframes fadeIn {
from { opacity: 0; }
to { opacity: 1; }
}
@keyframes blink {
0%, 100% { opacity: 1; }
50% { opacity: 0; }
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
@media (max-width: 600px) {
.demo-container {
flex-direction: column;
height: auto;
}
.image-area {
height: 200px;
}
.chat-area {
height: 300px;
}
.scenario-tabs {
overflow-x: auto;
justify-content: flex-start;
padding-bottom: 5px;
}
.tab-btn {
white-space: nowrap;
}
}
</style>