feat: comprehensive documentation and demo updates

- Update READMEs and docs across multiple languages
- Enhance interactive demos for Agent, LLM, VLM, Audio, Image Gen, Terminal, and Web Basics
- Add new appendix sections for Database and IDE intros
- Update VitePress config, theme, and utility scripts
- Clean up unused assets and components
This commit is contained in:
sanbuphy
2026-01-16 19:10:21 +08:00
parent c8567ce23f
commit 73f4788d7e
150 changed files with 19530 additions and 13401 deletions
@@ -3,11 +3,11 @@
<div class="controls">
<span class="hint">🖱 把鼠标悬停在方块上查看它的注意力分配</span>
</div>
<div class="visual-area">
<div class="image-grid" @mouseleave="hoverIndex = -1">
<div
v-for="(item, index) in items"
<div
v-for="(item, index) in items"
:key="index"
class="grid-cell"
:class="{ active: hoverIndex === index }"
@@ -16,16 +16,16 @@
{{ item.icon }}
<div class="cell-label">{{ item.label }}</div>
</div>
<!-- SVG Overlay for lines -->
<svg class="connections" v-if="hoverIndex !== -1">
<line
v-for="(target, tIndex) in items"
<line
v-for="(target, tIndex) in items"
:key="tIndex"
v-if="tIndex !== hoverIndex"
:x1="getCenter(hoverIndex).x"
:x1="getCenter(hoverIndex).x"
:y1="getCenter(hoverIndex).y"
:x2="getCenter(tIndex).x"
:x2="getCenter(tIndex).x"
:y2="getCenter(tIndex).y"
:stroke="getAttentionColor(hoverIndex, tIndex)"
:stroke-width="getAttentionWidth(hoverIndex, tIndex)"
@@ -33,16 +33,22 @@
/>
</svg>
</div>
<div class="info-panel" :class="{ visible: hoverIndex !== -1 }">
<div class="info-title">Patch: {{ items[hoverIndex]?.label }}</div>
<div class="info-desc">正在关注</div>
<ul class="attn-list" v-if="hoverIndex !== -1">
<li v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)" :key="targetIdx">
<li
v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)"
:key="targetIdx"
>
<span class="target-icon">{{ items[targetIdx].icon }}</span>
<span class="target-name">{{ items[targetIdx].label }}</span>
<div class="bar-bg">
<div class="bar-fill" :style="{ width: (weight * 100) + '%' }"></div>
<div
class="bar-fill"
:style="{ width: weight * 100 + '%' }"
></div>
</div>
</li>
</ul>
@@ -57,9 +63,15 @@ import { ref } from 'vue'
const hoverIndex = ref(-1)
const items = [
{ icon: '🌲', label: '背景' }, { icon: '🌲', label: '背景' }, { icon: '☁️', label: '天空' },
{ icon: '👂', label: '猫耳' }, { icon: '😼', label: '猫脸' }, { icon: '🌲', label: '背景' },
{ icon: '🐾', label: '猫爪' }, { icon: '🧶', label: '毛线' }, { icon: '🌱', label: '草地' }
{ icon: '🌲', label: '背景' },
{ icon: '🌲', label: '背景' },
{ icon: '☁️', label: '天空' },
{ icon: '👂', label: '猫耳' },
{ icon: '😼', label: '猫脸' },
{ icon: '🌲', label: '背景' },
{ icon: '🐾', label: '猫爪' },
{ icon: '🧶', label: '毛线' },
{ icon: '🌱', label: '草地' }
]
// 3x3 Grid
@@ -79,14 +91,14 @@ const getCenter = (index) => {
// Mock attention weights
const getAttentionWeight = (source, target) => {
// Self attention is ignored for visualization clarity usually, but let's say:
// Cat parts (3, 4, 6) attend strongly to each other
const catParts = [3, 4, 6]
const isSourceCat = catParts.includes(source)
const isTargetCat = catParts.includes(target)
if (isSourceCat && isTargetCat) return 0.9 // Strong connection between cat parts
// Cat interacts with Yarn (7)
if (isSourceCat && target === 7) return 0.7
if (source === 7 && isTargetCat) return 0.7
@@ -94,7 +106,7 @@ const getAttentionWeight = (source, target) => {
// Background parts attend to each other
const bgParts = [0, 1, 2, 5, 8]
if (bgParts.includes(source) && bgParts.includes(target)) return 0.5
return 0.1 // Weak attention otherwise
}
@@ -175,10 +187,11 @@ const getTopAttentions = (source) => {
position: relative;
}
.grid-cell:hover, .grid-cell.active {
.grid-cell:hover,
.grid-cell.active {
border-color: var(--vp-c-brand);
transform: scale(1.05);
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
background: var(--vp-c-bg-mute);
}
@@ -1,10 +1,12 @@
<template>
<div class="feature-alignment-demo">
<div class="header">
<div class="title">阶段一特征对齐 (Feature Alignment / Pre-training)</div>
<div class="title">
阶段一特征对齐 (Feature Alignment / Pre-training)
</div>
<div class="desc">
目标：让 Projector 学会翻译图像语言。
<br>做法：冻结 ViT 和 LLM，只训练 Projector。
<br />做法：冻结 ViT 和 LLM，只训练 Projector。
</div>
</div>
@@ -13,11 +15,11 @@
<div class="data-column">
<div class="data-item image-data">
<div class="data-icon">🖼</div>
<div class="data-label">图片<br>()</div>
<div class="data-label">图片<br />()</div>
</div>
<div class="data-item text-data">
<div class="data-icon">📝</div>
<div class="data-label">标题<br>("一只猫")</div>
<div class="data-label">标题<br />("一只猫")</div>
</div>
</div>
@@ -35,7 +37,7 @@
<div class="block-icon">👁</div>
<div class="block-name">ViT</div>
</div>
<div class="arrow-small"></div>
<div class="model-block training">
@@ -64,7 +66,7 @@
<div class="vector-icon">🟢</div>
<div class="vector-label">向量 V</div>
</div>
<div class="loss-connection">
<div class="loss-line"></div>
<div class="loss-box" :class="{ active: isCalculatingLoss }">
@@ -107,21 +109,31 @@ const nextStep = () => {
const buttonText = computed(() => {
switch (step.value) {
case 0: return '开始训练演示'
case 1: return '下一步:计算 Loss'
case 2: return '下一步:反向传播'
case 3: return '完成并重置'
default: return '开始'
case 0:
return '开始训练演示'
case 1:
return '下一步:计算 Loss'
case 2:
return '下一步:反向传播'
case 3:
return '完成并重置'
default:
return '开始'
}
})
const currentStepDesc = computed(() => {
switch (step.value) {
case 0: return '准备就绪。点击按钮开始模拟一次训练迭代。'
case 1: return '前向传播:图片经过 ViT (冻结) 和 Projector (训练) 得到向量 V;文本经过 LLM (冻结) 得到向量 T。'
case 2: return '计算 Loss:比较向量 V 和向量 T 的相似度。目标是让它们尽可能接近。'
case 3: return '反向传播：根据 Loss 更新 Projector 的参数。注意 ViT 和 LLM 不会更新！'
default: return ''
case 0:
return '准备就绪。点击按钮开始模拟一次训练迭代。'
case 1:
return '前向传播：图片经过 ViT (冻结) 和 Projector (训练) 得到向量 V；文本经过 LLM (冻结) 得到向量 T。'
case 2:
return '计算 Loss:比较向量 V 和向量 T 的相似度。目标是让它们尽可能接近。'
case 3:
return '反向传播:根据 Loss 更新 Projector 的参数。注意 ViT 和 LLM 不会更新!'
default:
return ''
}
})
@@ -135,7 +147,8 @@ const isCalculatingLoss = computed(() => step.value === 2)
border-radius: 12px;
padding: 20px;
margin: 20px 0;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
font-family:
-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.header {
@@ -184,8 +197,14 @@ const isCalculatingLoss = computed(() => step.value === 2)
width: 60px;
}
.data-icon { font-size: 24px; }
.data-label { font-size: 10px; text-align: center; margin-top: 4px; }
.data-icon {
font-size: 24px;
}
.data-label {
font-size: 10px;
text-align: center;
margin-top: 4px;
}
/* Arrow Column */
.arrow-column {
@@ -200,9 +219,9 @@ const isCalculatingLoss = computed(() => step.value === 2)
.model-column {
display: grid;
grid-template-columns: auto auto auto;
grid-template-areas:
"vit arrow proj"
"llm llm llm";
grid-template-areas:
'vit arrow proj'
'llm llm llm';
gap: 10px;
row-gap: 30px;
align-items: center;
@@ -262,8 +281,14 @@ const isCalculatingLoss = computed(() => step.value === 2)
width: 100%;
}
.block-icon { font-size: 20px; margin-bottom: 4px; }
.block-name { font-size: 12px; font-weight: bold; }
.block-icon {
font-size: 20px;
margin-bottom: 4px;
}
.block-name {
font-size: 12px;
font-weight: bold;
}
.arrow-small {
grid-area: arrow;
@@ -316,8 +341,15 @@ const isCalculatingLoss = computed(() => step.value === 2)
box-shadow: 0 0 10px rgba(255, 0, 0, 0.2);
}
.loss-label { font-size: 12px; font-weight: bold; color: var(--vp-c-danger); }
.loss-desc { font-size: 10px; color: var(--vp-c-text-2); }
.loss-label {
font-size: 12px;
font-weight: bold;
color: var(--vp-c-danger);
}
.loss-desc {
font-size: 10px;
color: var(--vp-c-text-2);
}
/* Controls */
.controls {
@@ -356,9 +388,15 @@ const isCalculatingLoss = computed(() => step.value === 2)
}
@keyframes pulse {
0% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0.4); }
70% { box-shadow: 0 0 0 10px rgba(var(--vp-c-brand-rgb), 0); }
100% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0); }
0% {
box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0.4);
}
70% {
box-shadow: 0 0 0 10px rgba(var(--vp-c-brand-rgb), 0);
}
100% {
box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0);
}
}
@media (max-width: 600px) {
@@ -388,4 +426,4 @@ const isCalculatingLoss = computed(() => step.value === 2)
height: 1px;
}
}
</style>
</style>
@@ -5,7 +5,12 @@
<div class="step-box">
<div class="label">1. Patch (4x4)</div>
<div class="grid-patch">
<div v-for="n in 16" :key="n" class="pixel" :style="{ backgroundColor: getPixelColor(n) }"></div>
<div
v-for="n in 16"
:key="n"
class="pixel"
:style="{ backgroundColor: getPixelColor(n) }"
></div>
</div>
<div class="desc">768 像素点</div>
</div>
@@ -16,7 +21,12 @@
<div class="step-box">
<div class="label">2. Flatten</div>
<div class="vector-container">
<div v-for="n in 16" :key="n" class="vector-cell" :style="{ backgroundColor: getPixelColor(n) }"></div>
<div
v-for="n in 16"
:key="n"
class="vector-cell"
:style="{ backgroundColor: getPixelColor(n) }"
></div>
</div>
<div class="desc">拉平成向量</div>
</div>
@@ -38,8 +48,8 @@
<script setup>
const getPixelColor = (n) => {
// Generate a gradient of colors
const hue = (n * 20) % 360;
return `hsl(${hue}, 70%, 60%)`;
const hue = (n * 20) % 360
return `hsl(${hue}, 70%, 60%)`
}
</script>
@@ -14,15 +14,15 @@
</div>
</div>
<div class="status-desc">
{{ isVLM
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
{{
isVLM
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
}}
</div>
</div>
<div class="diagram-stage" :class="{ 'vlm-mode': isVLM }">
<!-- Vision Pipeline (Only visible in VLM mode) -->
<div class="pipeline vision-pipeline">
<div class="node-group">
@@ -31,12 +31,18 @@
<span class="label">Image</span>
</div>
<div class="flow-arrow"></div>
<div class="node process-node vit-node" title="Vision Transformer: The Eye">
<div
class="node process-node vit-node"
title="Vision Transformer: The Eye"
>
<span class="icon"></span>
<span class="label">ViT</span>
</div>
<div class="flow-arrow"></div>
<div class="node adapter-node projector-node" title="Projector: The Translator">
<div
class="node adapter-node projector-node"
title="Projector: The Translator"
>
<span class="icon">🔌</span>
<span class="label">Projector</span>
</div>
@@ -56,7 +62,7 @@
<span class="icon"></span>
<span class="label">Embed</span>
</div>
<!-- Merge Point Visualization -->
<div class="merge-point" :class="{ active: isVLM }">
<div class="plus-icon">+</div>
@@ -80,20 +86,33 @@
</div>
</div>
</div>
</div>
<div class="interactive-info">
<div class="info-card" v-if="!isVLM">
<h3>Standard LLM Flow</h3>
<p>Text is converted into vectors (Embeddings) and processed by the Transformer to predict the next word.</p>
<p>
Text is converted into vectors (Embeddings) and processed by the
Transformer to predict the next word.
</p>
</div>
<div class="info-card vlm-info" v-else>
<h3>VLM = LLM + Vision Encoder</h3>
<ul>
<li><strong>ViT (The Eye):</strong> Slices image into patches and extracts features.</li>
<li><strong>Projector (The Translator):</strong> Converts visual features into the same "language" (vector dimension) as text embeddings.</li>
<li><strong>Concatenation:</strong> The translated visual tokens are pasted <em>before</em> the text tokens. The LLM sees them as "foreign words" it learned to understand.</li>
<li>
<strong>ViT (The Eye):</strong> Slices image into patches and
extracts features.
</li>
<li>
<strong>Projector (The Translator):</strong> Converts visual
features into the same "language" (vector dimension) as text
embeddings.
</li>
<li>
<strong>Concatenation:</strong> The translated visual tokens are
pasted <em>before</em> the text tokens. The LLM sees them as
"foreign words" it learned to understand.
</li>
</ul>
</div>
</div>
@@ -173,7 +192,7 @@ const toggleMode = () => {
justify-content: center;
font-size: 14px;
transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
}
.toggle-track.active .toggle-thumb {
@@ -271,29 +290,41 @@ const toggleMode = () => {
flex-direction: column;
align-items: center;
min-width: 70px;
box-shadow: 0 4px 6px rgba(0,0,0,0.05);
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
position: relative;
z-index: 2;
}
.icon { font-size: 20px; margin-bottom: 4px; }
.label { font-size: 11px; font-weight: bold; }
.icon {
font-size: 20px;
margin-bottom: 4px;
}
.label {
font-size: 11px;
font-weight: bold;
}
.input-node { border-color: #aaa; }
.process-node { border-color: var(--vp-c-brand-dimm); }
.core-node {
border-color: var(--vp-c-brand);
.input-node {
border-color: #aaa;
}
.process-node {
border-color: var(--vp-c-brand-dimm);
}
.core-node {
border-color: var(--vp-c-brand);
background: var(--vp-c-brand-dimm);
min-width: 100px;
}
.output-node { border-color: var(--vp-c-brand); }
.output-node {
border-color: var(--vp-c-brand);
}
.vit-node {
border-color: var(--vp-c-yellow);
.vit-node {
border-color: var(--vp-c-yellow);
background: rgba(255, 197, 23, 0.05);
}
.projector-node {
border-color: var(--vp-c-yellow);
border-color: var(--vp-c-yellow);
background: var(--vp-c-yellow-dimm);
}
@@ -355,13 +386,26 @@ const toggleMode = () => {
animation: pulse 1s infinite alternate;
}
.t1 { animation-delay: 0s; }
.t2 { animation-delay: 0.2s; }
.v1 { background: var(--vp-c-yellow); animation-delay: 0.4s; }
.t1 {
animation-delay: 0s;
}
.t2 {
animation-delay: 0.2s;
}
.v1 {
background: var(--vp-c-yellow);
animation-delay: 0.4s;
}
@keyframes pulse {
from { opacity: 0.3; transform: scale(0.8); }
to { opacity: 1; transform: scale(1.1); }
from {
opacity: 0.3;
transform: scale(0.8);
}
to {
opacity: 1;
transform: scale(1.1);
}
}
/* Interactive Info */
@@ -383,7 +427,8 @@ const toggleMode = () => {
color: var(--vp-c-text-1);
}
.info-card p, .info-card li {
.info-card p,
.info-card li {
font-size: 13px;
color: var(--vp-c-text-2);
line-height: 1.6;
@@ -395,8 +440,14 @@ const toggleMode = () => {
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(5px); }
to { opacity: 1; transform: translateY(0); }
from {
opacity: 0;
transform: translateY(5px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Mobile Adjustments */
@@ -404,15 +455,15 @@ const toggleMode = () => {
.diagram-stage {
height: 300px;
}
.text-pipeline {
flex-wrap: wrap;
gap: 10px;
width: 90%;
}
.vision-pipeline {
left: 10%;
}
}
</style>
</style>
@@ -20,11 +20,11 @@
<div class="visual-area">
<!-- 原始/切分视图容器 -->
<div class="image-container" :class="{ 'is-patchified': isPatchified }">
<div
v-for="n in 196"
:key="n"
<div
v-for="n in 196"
:key="n"
class="patch"
:style="{
:style="{
'--delay': `${n * 0.005}s`,
'--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}`
}"
@@ -32,16 +32,16 @@
<span class="patch-id" v-if="isPatchified">{{ n }}</span>
</div>
</div>
<div class="arrow" v-if="isPatchified"></div>
<!-- 线性序列视图 -->
<div class="sequence-container" v-if="isPatchified">
<div class="sequence-label">Flattened Sequence (Token Input)</div>
<div class="token-stream">
<div
v-for="n in 196"
:key="n"
<div
v-for="n in 196"
:key="n"
class="mini-patch"
:style="{ '--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}` }"
></div>
@@ -144,7 +144,7 @@ const toggleState = () => {
align-items: center;
justify-content: center;
font-size: 8px;
color: rgba(0,0,0,0.5);
color: rgba(0, 0, 0, 0.5);
transition: all 0.5s ease;
}
@@ -198,12 +198,23 @@ const toggleState = () => {
}
@keyframes bounce {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(5px); }
0%,
100% {
transform: translateY(0);
}
50% {
transform: translateY(5px);
}
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
from {
opacity: 0;
transform: translateY(10px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
</style>
@@ -5,7 +5,7 @@
<div class="grid-wrapper">
<div class="grid-title">Feature Vectors</div>
<div class="grid-box feature-grid">
<div v-for="n in 9" :key="'f'+n" class="cell feature-cell">V</div>
<div v-for="n in 9" :key="'f' + n" class="cell feature-cell">V</div>
</div>
</div>
@@ -15,7 +15,7 @@
<div class="grid-wrapper">
<div class="grid-title">Position Embeddings</div>
<div class="grid-box pos-grid">
<div v-for="n in 9" :key="'p'+n" class="cell pos-cell">{{ n }}</div>
<div v-for="n in 9" :key="'p' + n" class="cell pos-cell">{{ n }}</div>
</div>
</div>
@@ -25,14 +25,16 @@
<div class="grid-wrapper">
<div class="grid-title">Input to Transformer</div>
<div class="grid-box result-grid">
<div v-for="n in 9" :key="'r'+n" class="cell result-cell">
<span class="v">V</span><span class="plus">+</span><span class="p">{{ n }}</span>
<div v-for="n in 9" :key="'r' + n" class="cell result-cell">
<span class="v">V</span><span class="plus">+</span
><span class="p">{{ n }}</span>
</div>
</div>
</div>
</div>
<div class="caption">
位置编码 (Position Embedding) 是一组可学习的向量，直接<b>加</b>在图像特征上。
位置编码 (Position Embedding)
是一组可学习的向量，直接<b>加</b>在图像特征上。
</div>
</div>
</template>
@@ -5,16 +5,10 @@
<template>
<div class="projector-demo">
<div class="mode-switch">
<button
:class="{ active: mode === 'linear' }"
@click="mode = 'linear'"
>
<button :class="{ active: mode === 'linear' }" @click="mode = 'linear'">
Linear (LLaVA)
</button>
<button
:class="{ active: mode === 'qformer' }"
@click="mode = 'qformer'"
>
<button :class="{ active: mode === 'qformer' }" @click="mode = 'qformer'">
Q-Former (BLIP-2)
</button>
</div>
@@ -26,14 +20,18 @@
<div class="token-container input">
<div v-for="n in 16" :key="n" class="token visual"></div>
</div>
<div class="count">{{ mode === 'linear' ? '256 Tokens' : '256 Tokens' }}</div>
<div class="count">
{{ mode === 'linear' ? '256 Tokens' : '256 Tokens' }}
</div>
</div>
<!-- Process: The Projector -->
<div class="stage connector">
<div class="arrow-line"></div>
<div class="projector-box" :class="mode">
<div class="title">{{ mode === 'linear' ? 'Linear Layer' : 'Q-Former' }}</div>
<div class="title">
{{ mode === 'linear' ? 'Linear Layer' : 'Q-Former' }}
</div>
<div class="desc">
{{ mode === 'linear' ? '直接映射 (1:1)' : '查询提取 (N:M)' }}
</div>
@@ -50,26 +48,32 @@
<div class="stage">
<div class="label">LLM Tokens</div>
<div class="token-container output">
<div
v-for="n in (mode === 'linear' ? 16 : 4)"
:key="n"
<div
v-for="n in mode === 'linear' ? 16 : 4"
:key="n"
class="token llm"
></div>
</div>
<div class="count">
{{ mode === 'linear' ? '256 Tokens (保留全部细节)' : '32 Tokens (只保留关键信息)' }}
{{
mode === 'linear'
? '256 Tokens (保留全部细节)'
: '32 Tokens (只保留关键信息)'
}}
</div>
</div>
</div>
<div class="explanation">
<div v-if="mode === 'linear'">
<strong>Linear Projector:</strong>
简单高效。它像一个直译器，保留了所有的视觉信息。虽然 Token 数量多，计算量大，但对细节的把控更好。
<strong>Linear Projector:</strong>
简单高效。它像一个直译器，保留了所有的视觉信息。虽然 Token
数量多，计算量大，但对细节的把控更好。
</div>
<div v-else>
<strong>Q-Former:</strong>
精细优雅。它使用一组查询向量，主动去图像中提取与文本相关的信息，大大压缩了 Token 数量，让 LLM 跑得更快。
<strong>Q-Former:</strong>
精细优雅。它使用一组查询向量，主动去图像中提取与文本相关的信息，大大压缩了
Token 数量，让 LLM 跑得更快。
</div>
</div>
</div>
@@ -231,11 +235,20 @@ const mode = ref('linear')
animation: pulse 1s infinite;
}
.dot:nth-child(2) { animation-delay: 0.2s; }
.dot:nth-child(3) { animation-delay: 0.4s; }
.dot:nth-child(2) {
animation-delay: 0.2s;
}
.dot:nth-child(3) {
animation-delay: 0.4s;
}
@keyframes pulse {
0%, 100% { opacity: 0.3; }
50% { opacity: 1; }
0%,
100% {
opacity: 0.3;
}
50% {
opacity: 1;
}
}
</style>
@@ -1,16 +1,10 @@
<template>
<div class="pipeline-demo">
<div class="stage-switch">
<button
:class="{ active: stage === 1 }"
@click="stage = 1"
>
<button :class="{ active: stage === 1 }" @click="stage = 1">
阶段一特征对齐
</button>
<button
:class="{ active: stage === 2 }"
@click="stage = 2"
>
<button :class="{ active: stage === 2 }" @click="stage = 2">
阶段二指令微调
</button>
</div>
@@ -43,8 +37,13 @@
<div class="arrow"></div>
<!-- LLM -->
<div class="component-box llm" :class="{ frozen: stage === 1, training: stage === 2 }">
<div class="status-badge">{{ stage === 1 ? '❄️ Frozen' : '🔥 Train' }}</div>
<div
class="component-box llm"
:class="{ frozen: stage === 1, training: stage === 2 }"
>
<div class="status-badge">
{{ stage === 1 ? '❄️ Frozen' : '🔥 Train' }}
</div>
<div class="name">LLM</div>
<div class="desc">Language Model</div>
</div>
@@ -67,7 +66,10 @@
<p>任务让图像向量与文本向量距离变近</p>
</div>
<div class="data-content" v-else>
<code>User: &lt;Image: 🐱&gt; 这只猫在干嘛<br/>Assistant: 它在睡觉</code>
<code
>User: &lt;Image: 🐱&gt; 这只猫在干嘛<br />Assistant:
它在睡觉</code
>
<p>任务根据图像和问题生成回答</p>
</div>
</div>
@@ -12,9 +12,7 @@
<div class="avatar">👤</div>
<div class="bubble">
<div class="image-upload">
<div class="placeholder-img">
🐱
</div>
<div class="placeholder-img">🐱</div>
</div>
<div class="text">这只猫在做什么</div>
</div>
@@ -39,8 +37,8 @@
</div>
<div class="controls">
<button
class="send-btn"
<button
class="send-btn"
:disabled="step > 0 && step < 3"
@click="startInference"
>
@@ -54,13 +52,13 @@
import { ref, watch } from 'vue'
const step = ref(0)
const fullText = "它正趴在窗台上晒太阳,看起来非常惬意。"
const typedText = ref("")
const fullText = '它正趴在窗台上晒太阳,看起来非常惬意。'
const typedText = ref('')
const startInference = () => {
step.value = 1
typedText.value = ""
typedText.value = ''
// Step 1: Vision Encoding
setTimeout(() => {
step.value = 2
@@ -93,7 +91,7 @@ const typeText = () => {
overflow: hidden;
max-width: 500px;
margin: 20px auto;
box-shadow: 0 4px 12px rgba(0,0,0,0.05);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
}
.chat-window {
@@ -130,7 +128,7 @@ const typeText = () => {
border-radius: 12px;
border: 1px solid var(--vp-c-divider);
max-width: 80%;
box-shadow: 0 2px 4px rgba(0,0,0,0.02);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.02);
}
.message.user .bubble {
@@ -192,7 +190,12 @@ const typeText = () => {
}
@keyframes blink {
0%, 100% { opacity: 1; }
50% { opacity: 0; }
0%,
100% {
opacity: 1;
}
50% {
opacity: 0;
}
}
</style>
@@ -5,8 +5,8 @@
<div class="stage">
<div class="stage-label">1. Processed Patches (Grid)</div>
<div class="grid-container">
<div
v-for="(item, index) in items"
<div
v-for="(item, index) in items"
:key="index"
class="grid-item"
:class="{ active: activeIndex === index }"
@@ -24,10 +24,12 @@
<!-- 2. Feature Vector Sequence -->
<div class="stage">
<div class="stage-label">2. Feature Vector Sequence (The "Image Sentence")</div>
<div class="stage-label">
2. Feature Vector Sequence (The "Image Sentence")
</div>
<div class="vector-sequence">
<div
v-for="(item, index) in items"
<div
v-for="(item, index) in items"
:key="index"
class="vector-wrapper"
:class="{ active: activeIndex === index }"
@@ -35,11 +37,26 @@
>
<div class="vector-col">
<!-- Simulated vector dimensions -->
<div class="v-cell" :style="{ opacity: 0.9, background: item.color }"></div>
<div class="v-cell" :style="{ opacity: 0.7, background: item.color }"></div>
<div class="v-cell" :style="{ opacity: 0.5, background: item.color }"></div>
<div class="v-cell" :style="{ opacity: 0.8, background: item.color }"></div>
<div class="v-cell" :style="{ opacity: 0.6, background: item.color }"></div>
<div
class="v-cell"
:style="{ opacity: 0.9, background: item.color }"
></div>
<div
class="v-cell"
:style="{ opacity: 0.7, background: item.color }"
></div>
<div
class="v-cell"
:style="{ opacity: 0.5, background: item.color }"
></div>
<div
class="v-cell"
:style="{ opacity: 0.8, background: item.color }"
></div>
<div
class="v-cell"
:style="{ opacity: 0.6, background: item.color }"
></div>
</div>
<div class="vector-idx">{{ index + 1 }}</div>
</div>
@@ -53,7 +70,10 @@
<div class="header" :style="{ borderColor: items[activeIndex].color }">
<span class="large-icon">{{ items[activeIndex].icon }}</span>
<div class="title-group">
<span class="title">Token #{{ activeIndex + 1 }}: {{ items[activeIndex].label }}</span>
<span class="title"
>Token #{{ activeIndex + 1 }}:
{{ items[activeIndex].label }}</span
>
<span class="subtitle">Type: {{ items[activeIndex].type }}</span>
</div>
</div>
@@ -61,7 +81,9 @@
<div class="vector-repr">
<span class="label">Vector Value:</span>
<span class="code" :style="{ color: items[activeIndex].color }">
[0.{{ (Math.random()*99).toFixed(0) }}, -0.{{ (Math.random()*99).toFixed(0) }}, 1.{{ (Math.random()*99).toFixed(0) }}, ...]
[0.{{ (Math.random() * 99).toFixed(0) }}, -0.{{
(Math.random() * 99).toFixed(0)
}}, 1.{{ (Math.random() * 99).toFixed(0) }}, ...]
</span>
</div>
<div class="meaning">
@@ -71,8 +93,10 @@
</div>
</div>
<div v-else class="placeholder">
<span class="hint-icon">👆</span>
<span class="hint-text">悬停在上方方块或向量上查看 ViT 输出的语义特征</span>
<span class="hint-icon">👆</span>
<span class="hint-text"
>悬停在上方方块或向量上查看 ViT 输出的语义特征</span
>
</div>
</div>
</div>
@@ -84,15 +108,69 @@ import { ref } from 'vue'
const activeIndex = ref(-1)
const items = [
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Recognized as outdoor nature elements (Trees/Greenery). Low relevance to main subject.' },
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Redundant background info. Contextualizes the scene as "Outdoors".' },
{ icon: '☁️', label: 'Sky', type: 'Environment', color: '#2196f3', desc: 'Spatial context: Upper region, open area.' },
{ icon: '👂', label: 'Cat Ear', type: 'Subject Part', color: '#ff9800', desc: 'High Importance. Identified as "Feline Feature". Strongly linked to "Cat Face".' },
{ icon: '😼', label: 'Cat Face', type: 'Subject Core', color: '#ff5722', desc: 'Global Focus Center. Contains "Eyes", "Whiskers". Aggregates info from surrounding patches.' },
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Background noise.' },
{ icon: '🐾', label: 'Cat Paw', type: 'Subject Part', color: '#ff9800', desc: 'Action component. Suggests "Standing" or "Walking" posture.' },
{ icon: '🧶', label: 'Yarn', type: 'Object', color: '#e91e63', desc: 'Interacting Object. Semantically linked to "Play" or "Toy".' },
{ icon: '🌱', label: 'Grass', type: 'Environment', color: '#8bc34a', desc: 'Ground context. Confirms "Ground level" view.' }
{
icon: '🌲',
label: 'Background',
type: 'Environment',
color: '#4caf50',
desc: 'Recognized as outdoor nature elements (Trees/Greenery). Low relevance to main subject.'
},
{
icon: '🌲',
label: 'Background',
type: 'Environment',
color: '#4caf50',
desc: 'Redundant background info. Contextualizes the scene as "Outdoors".'
},
{
icon: '☁️',
label: 'Sky',
type: 'Environment',
color: '#2196f3',
desc: 'Spatial context: Upper region, open area.'
},
{
icon: '👂',
label: 'Cat Ear',
type: 'Subject Part',
color: '#ff9800',
desc: 'High Importance. Identified as "Feline Feature". Strongly linked to "Cat Face".'
},
{
icon: '😼',
label: 'Cat Face',
type: 'Subject Core',
color: '#ff5722',
desc: 'Global Focus Center. Contains "Eyes", "Whiskers". Aggregates info from surrounding patches.'
},
{
icon: '🌲',
label: 'Background',
type: 'Environment',
color: '#4caf50',
desc: 'Background noise.'
},
{
icon: '🐾',
label: 'Cat Paw',
type: 'Subject Part',
color: '#ff9800',
desc: 'Action component. Suggests "Standing" or "Walking" posture.'
},
{
icon: '🧶',
label: 'Yarn',
type: 'Object',
color: '#e91e63',
desc: 'Interacting Object. Semantically linked to "Play" or "Toy".'
},
{
icon: '🌱',
label: 'Grass',
type: 'Environment',
color: '#8bc34a',
desc: 'Ground context. Confirms "Ground level" view.'
}
]
</script>
@@ -102,7 +180,10 @@ const items = [
border: 1px solid #e9ecef;
border-radius: 12px;
padding: 24px;
font-family: system-ui, -apple-system, sans-serif;
font-family:
system-ui,
-apple-system,
sans-serif;
max-width: 700px;
margin: 20px auto;
}
@@ -143,7 +224,7 @@ const items = [
background: #fff;
padding: 8px;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.05);
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
}
.dark .grid-container {
background: #252529;
@@ -165,12 +246,14 @@ const items = [
background: #343a40;
}
.grid-item:hover, .grid-item.active {
.grid-item:hover,
.grid-item.active {
background: #e7f5ff;
transform: scale(1.1);
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
}
.dark .grid-item:hover, .dark .grid-item.active {
.dark .grid-item:hover,
.dark .grid-item.active {
background: #1c7ed6;
}
@@ -194,7 +277,7 @@ const items = [
padding: 10px;
background: #fff;
border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,0.05);
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.05);
overflow-x: auto;
max-width: 100%;
}
@@ -213,11 +296,13 @@ const items = [
transition: background 0.2s;
}
.vector-wrapper:hover, .vector-wrapper.active {
background: rgba(0,0,0,0.05);
.vector-wrapper:hover,
.vector-wrapper.active {
background: rgba(0, 0, 0, 0.05);
}
.dark .vector-wrapper:hover, .dark .vector-wrapper.active {
background: rgba(255,255,255,0.1);
.dark .vector-wrapper:hover,
.dark .vector-wrapper.active {
background: rgba(255, 255, 255, 0.1);
}
.vector-col {
@@ -6,8 +6,8 @@
</div>
<div class="scenario-tabs">
<button
v-for="s in scenarios"
<button
v-for="s in scenarios"
:key="s.id"
class="tab-btn"
:class="{ active: currentScenario === s.id }"
@@ -20,39 +20,59 @@
<div class="demo-container">
<!-- Image Area -->
<div class="image-area">
<div class="image-placeholder" :class="{ loaded: hasImage, 'receipt-bg': currentScenario === 'ocr' }">
<div
class="image-placeholder"
:class="{ loaded: hasImage, 'receipt-bg': currentScenario === 'ocr' }"
>
<div v-if="!hasImage" class="upload-prompt">
<div class="icon">🖼</div>
<button class="upload-btn" @click="loadImage">
上传图片 (模拟)
</button>
</div>
<div v-else class="image-content">
<!-- Chat: Landscape -->
<div v-if="currentScenario === 'chat'" class="real-image-container landscape">
<div
v-if="currentScenario === 'chat'"
class="real-image-container landscape"
>
<div class="real-image">🏔</div>
<div class="sun"></div>
<div class="tree">🌲</div>
</div>
<!-- Detection: Fruits -->
<div v-else-if="currentScenario === 'detection'" class="real-image-container fruits">
<div
v-else-if="currentScenario === 'detection'"
class="real-image-container fruits"
>
<div class="real-image">
<span class="fruit apple">🍎</span>
<span class="fruit banana">🍌</span>
<span class="fruit grape">🍇</span>
</div>
<div v-if="showBoundingBox" class="bounding-box apple-box" title="Apple">
<div
v-if="showBoundingBox"
class="bounding-box apple-box"
title="Apple"
>
<span class="box-label">apple: 0.98</span>
</div>
<div v-if="showBoundingBox" class="bounding-box banana-box" title="Banana">
<div
v-if="showBoundingBox"
class="bounding-box banana-box"
title="Banana"
>
<span class="box-label">banana: 0.95</span>
</div>
</div>
<!-- Analysis: Factory Safety -->
<div v-else-if="currentScenario === 'analysis'" class="factory-image">
<div
v-else-if="currentScenario === 'analysis'"
class="factory-image"
>
<div class="safety-sign"> 安全生产</div>
<div class="worker-container">
<span class="worker">👷</span>
@@ -67,7 +87,9 @@
<div class="receipt-body">
<div class="line"><span>Coffee</span><span>$4.50</span></div>
<div class="line"><span>Bagel</span><span>$3.00</span></div>
<div class="line total"><span>TOTAL</span><span>$7.50</span></div>
<div class="line total">
<span>TOTAL</span><span>$7.50</span>
</div>
<div class="line date"><span>2023-10-24</span></div>
</div>
</div>
@@ -85,29 +107,45 @@
<div v-if="messages.length === 0" class="empty-text">
{{ hasImage ? '图片已就绪请选择指令' : '请先上传图片' }}
</div>
<div v-for="(msg, index) in messages" :key="index" class="message" :class="msg.role">
<div
v-for="(msg, index) in messages"
:key="index"
class="message"
:class="msg.role"
>
<div class="content">
<div v-if="msg.isJson" class="json-content">
<pre>{{ msg.content }}</pre>
</div>
<span v-else>{{ msg.content }}</span>
<span v-if="msg.role === 'assistant' && isGenerating && index === messages.length - 1" class="cursor">|</span>
<span
v-if="
msg.role === 'assistant' &&
isGenerating &&
index === messages.length - 1
"
class="cursor"
>|</span
>
</div>
</div>
</div>
<div class="input-area">
<div class="quick-actions" v-if="hasImage && !isGenerating">
<button v-for="q in currentQuestions" :key="q" @click="ask(q)" class="action-btn">
<button
v-for="q in currentQuestions"
:key="q"
@click="ask(q)"
class="action-btn"
>
{{ q }}
</button>
</div>
<div class="status-text" v-else-if="isGenerating">
AI 正在观察图片并思考...
</div>
<div class="status-text" v-else>
等待图片上传...
</div>
<div class="status-text" v-else>等待图片上传...</div>
</div>
</div>
</div>
@@ -132,75 +170,80 @@ const messages = ref([])
const messagesRef = ref(null)
const questionsMap = {
chat: [
"这里是哪里?",
"描述一下天气",
"写首关于这座山的诗"
],
detection: [
"检测图中的水果",
"数数有几个苹果",
"输出检测框坐标"
],
ocr: [
"提取所有文字",
"总金额是多少?",
"消费日期是哪天?"
],
analysis: [
"工人是否佩戴安全帽?",
"检测现场安全隐患",
"输出风险评估报告"
]
chat: ['这里是哪里?', '描述一下天气', '写首关于这座山的诗'],
detection: ['检测图中的水果', '数数有几个苹果', '输出检测框坐标'],
ocr: ['提取所有文字', '总金额是多少?', '消费日期是哪天?'],
analysis: ['工人是否佩戴安全帽?', '检测现场安全隐患', '输出风险评估报告']
}
const answersMap = {
chat: {
"这里是哪里?": "这是一张高山风景照。远处是覆盖着皑皑白雪的山峰,可能是阿尔卑斯山或喜马拉雅山脉。山脚下有郁郁葱葱的松树林。",
"描述一下天气": "天气看起来非常晴朗,阳光明媚(☀️),能见度很高。蓝天白云,是一个适合登山或滑雪的好天气。",
"写首关于这座山的诗": "🏔️ 雪岭插云天,\n🌲 松涛响翠烟。\n☀️ 金阳融冷色,\n🏞️ 壮丽入心田。"
'这里是哪里?':
'这是一张高山风景照。远处是覆盖着皑皑白雪的山峰,可能是阿尔卑斯山或喜马拉雅山脉。山脚下有郁郁葱葱的松树林。',
描述一下天气:
'天气看起来非常晴朗,阳光明媚(☀️),能见度很高。蓝天白云,是一个适合登山或滑雪的好天气。',
写首关于这座山的诗:
'🏔️ 雪岭插云天,\n🌲 松涛响翠烟。\n☀️ 金阳融冷色,\n🏞️ 壮丽入心田。'
},
detection: {
"检测图中的水果": {
type: 'json',
text: JSON.stringify({ objects: ['apple', 'banana', 'grape'], count: 3 }, null, 2),
检测图中的水果: {
type: 'json',
text: JSON.stringify(
{ objects: ['apple', 'banana', 'grape'], count: 3 },
null,
2
),
action: 'showBox'
},
"数数有几个苹果": "图中检测到 1 个苹果(🍎)。",
"输出检测框坐标": {
数数有几个苹果: '图中检测到 1 个苹果(🍎)。',
输出检测框坐标: {
type: 'json',
text: JSON.stringify({
objects: [
{ label: 'apple', box: [15, 15, 85, 85] },
{ label: 'banana', box: [95, 15, 165, 85] }
]
}, null, 2),
text: JSON.stringify(
{
objects: [
{ label: 'apple', box: [15, 15, 85, 85] },
{ label: 'banana', box: [95, 15, 165, 85] }
]
},
null,
2
),
action: 'showBox'
}
},
ocr: {
"提取所有文字": {
提取所有文字: {
type: 'json',
text: JSON.stringify({
lines: [
"RECEIPT",
"Coffee $4.50",
"Bagel $3.00",
"TOTAL $7.50",
"2023-10-24"
]
}, null, 2)
text: JSON.stringify(
{
lines: [
'RECEIPT',
'Coffee $4.50',
'Bagel $3.00',
'TOTAL $7.50',
'2023-10-24'
]
},
null,
2
)
},
"总金额是多少?": "这张小票的总金额是 $7.50。",
"消费日期是哪天?": "消费日期是 2023年10月24日。"
'总金额是多少?': '这张小票的总金额是 $7.50。',
'消费日期是哪天?': '消费日期是 2023年10月24日。'
},
analysis: {
"工人是否佩戴安全帽?": "检测到画面中有一名工人(👷),已正确佩戴红色安全帽(⛑️)。",
"检测现场安全隐患": {
'工人是否佩戴安全帽?':
'检测到画面中有一名工人(👷),已正确佩戴红色安全帽(⛑️)。',
检测现场安全隐患: {
type: 'json',
text: JSON.stringify({ hazards: [], safety_score: 100, status: "SAFE" }, null, 2)
text: JSON.stringify(
{ hazards: [], safety_score: 100, status: 'SAFE' },
null,
2
)
},
"输出风险评估报告": "✅ **安全合规**\n- 人员:1人\n- 防护装备:齐全\n- 机械设备:正常运行中\n- 风险等级:低"
输出风险评估报告:
'✅ **安全合规**\n- 人员:1人\n- 防护装备:齐全\n- 机械设备:正常运行中\n- 风险等级:低'
}
}
@@ -214,7 +257,9 @@ const getImageLabel = () => {
return map[currentScenario.value]
}
const currentQuestions = computed(() => questionsMap[currentScenario.value] || [])
const currentQuestions = computed(
() => questionsMap[currentScenario.value] || []
)
const switchScenario = (id) => {
currentScenario.value = id
@@ -232,16 +277,16 @@ const loadImage = () => {
const ask = async (question) => {
messages.value.push({ role: 'user', content: question })
isGenerating.value = true
await wait(800) // Simulate vision encoding time
const scenarioAnswers = answersMap[currentScenario.value]
const rawAnswer = scenarioAnswers[question] || "我还在学习这个任务..."
const rawAnswer = scenarioAnswers[question] || '我还在学习这个任务...'
let content = ''
let isJson = false
let action = null
if (typeof rawAnswer === 'object') {
content = rawAnswer.text
isJson = rawAnswer.type === 'json'
@@ -249,10 +294,10 @@ const ask = async (question) => {
} else {
content = rawAnswer
}
messages.value.push({ role: 'assistant', content: '', isJson })
const answerIdx = messages.value.length - 1
// Streaming effect
const stepSize = isJson ? 5 : 1 // JSON types faster
for (let i = 0; i < content.length; i += stepSize) {
@@ -260,15 +305,15 @@ const ask = async (question) => {
scrollToBottom()
await wait(20)
}
if (action === 'showBox') {
showBoundingBox.value = true
}
isGenerating.value = false
}
const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms))
const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
const scrollToBottom = () => {
nextTick(() => {
@@ -286,7 +331,8 @@ const scrollToBottom = () => {
border-radius: 12px;
padding: 20px;
margin: 20px 0;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
font-family:
-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
.header {
@@ -413,7 +459,7 @@ const scrollToBottom = () => {
display: flex;
align-items: center;
justify-content: center;
background: linear-gradient(to bottom, #87CEEB 50%, #e0e0e0 50%);
background: linear-gradient(to bottom, #87ceeb 50%, #e0e0e0 50%);
border-radius: 8px;
overflow: hidden;
position: absolute;
@@ -563,7 +609,7 @@ const scrollToBottom = () => {
background: white;
padding: 15px;
width: 160px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
font-family: 'Courier New', Courier, monospace;
font-size: 11px;
text-align: left;
@@ -602,7 +648,7 @@ const scrollToBottom = () => {
.image-label {
font-size: 12px;
color: #666;
background: rgba(255,255,255,0.8);
background: rgba(255, 255, 255, 0.8);
padding: 4px 8px;
border-radius: 4px;
position: absolute;
@@ -711,28 +757,53 @@ const scrollToBottom = () => {
}
@keyframes popIn {
from { transform: scale(0); opacity: 0; }
to { transform: scale(1); opacity: 1; }
from {
transform: scale(0);
opacity: 0;
}
to {
transform: scale(1);
opacity: 1;
}
}
@keyframes slideUp {
from { transform: translateY(20px); opacity: 0; }
to { transform: translateY(0); opacity: 1; }
from {
transform: translateY(20px);
opacity: 0;
}
to {
transform: translateY(0);
opacity: 1;
}
}
@keyframes fadeIn {
from { opacity: 0; }
to { opacity: 1; }
from {
opacity: 0;
}
to {
opacity: 1;
}
}
@keyframes blink {
0%, 100% { opacity: 1; }
50% { opacity: 0; }
0%,
100% {
opacity: 1;
}
50% {
opacity: 0;
}
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
@media (max-width: 600px) {
@@ -755,4 +826,4 @@ const scrollToBottom = () => {
white-space: nowrap;
}
}
</style>
</style>