feat: comprehensive documentation and demo updates
- Update READMEs and docs across multiple languages
- Enhance interactive demos for Agent, LLM, VLM, Audio, Image Gen, Terminal, and Web Basics
- Add new appendix sections for Database and IDE intros
- Update VitePress config, theme, and utility scripts
- Clean up unused assets and components
This commit is contained in:
@@ -3,11 +3,11 @@
|
||||
<div class="controls">
|
||||
<span class="hint">🖱️ 把鼠标悬停在方块上,查看它的“注意力”分配</span>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="visual-area">
|
||||
<div class="image-grid" @mouseleave="hoverIndex = -1">
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
:key="index"
|
||||
class="grid-cell"
|
||||
:class="{ active: hoverIndex === index }"
|
||||
@@ -16,16 +16,16 @@
|
||||
{{ item.icon }}
|
||||
<div class="cell-label">{{ item.label }}</div>
|
||||
</div>
|
||||
|
||||
|
||||
<!-- SVG Overlay for lines -->
|
||||
<svg class="connections" v-if="hoverIndex !== -1">
|
||||
<line
|
||||
v-for="(target, tIndex) in items"
|
||||
<line
|
||||
v-for="(target, tIndex) in items"
|
||||
:key="tIndex"
|
||||
v-if="tIndex !== hoverIndex"
|
||||
:x1="getCenter(hoverIndex).x"
|
||||
:x1="getCenter(hoverIndex).x"
|
||||
:y1="getCenter(hoverIndex).y"
|
||||
:x2="getCenter(tIndex).x"
|
||||
:x2="getCenter(tIndex).x"
|
||||
:y2="getCenter(tIndex).y"
|
||||
:stroke="getAttentionColor(hoverIndex, tIndex)"
|
||||
:stroke-width="getAttentionWidth(hoverIndex, tIndex)"
|
||||
@@ -33,16 +33,22 @@
|
||||
/>
|
||||
</svg>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="info-panel" :class="{ visible: hoverIndex !== -1 }">
|
||||
<div class="info-title">Patch: {{ items[hoverIndex]?.label }}</div>
|
||||
<div class="info-desc">正在关注:</div>
|
||||
<ul class="attn-list" v-if="hoverIndex !== -1">
|
||||
<li v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)" :key="targetIdx">
|
||||
<li
|
||||
v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)"
|
||||
:key="targetIdx"
|
||||
>
|
||||
<span class="target-icon">{{ items[targetIdx].icon }}</span>
|
||||
<span class="target-name">{{ items[targetIdx].label }}</span>
|
||||
<div class="bar-bg">
|
||||
<div class="bar-fill" :style="{ width: (weight * 100) + '%' }"></div>
|
||||
<div
|
||||
class="bar-fill"
|
||||
:style="{ width: weight * 100 + '%' }"
|
||||
></div>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
@@ -57,9 +63,15 @@ import { ref } from 'vue'
|
||||
const hoverIndex = ref(-1)
|
||||
|
||||
const items = [
|
||||
{ icon: '🌲', label: '背景' }, { icon: '🌲', label: '背景' }, { icon: '☁️', label: '天空' },
|
||||
{ icon: '👂', label: '猫耳' }, { icon: '😼', label: '猫脸' }, { icon: '🌲', label: '背景' },
|
||||
{ icon: '🐾', label: '猫爪' }, { icon: '🧶', label: '毛线' }, { icon: '🌱', label: '草地' }
|
||||
{ icon: '🌲', label: '背景' },
|
||||
{ icon: '🌲', label: '背景' },
|
||||
{ icon: '☁️', label: '天空' },
|
||||
{ icon: '👂', label: '猫耳' },
|
||||
{ icon: '😼', label: '猫脸' },
|
||||
{ icon: '🌲', label: '背景' },
|
||||
{ icon: '🐾', label: '猫爪' },
|
||||
{ icon: '🧶', label: '毛线' },
|
||||
{ icon: '🌱', label: '草地' }
|
||||
]
|
||||
|
||||
// 3x3 Grid
|
||||
@@ -79,14 +91,14 @@ const getCenter = (index) => {
|
||||
// Mock attention weights
|
||||
const getAttentionWeight = (source, target) => {
|
||||
// Self attention is ignored for visualization clarity usually, but let's say:
|
||||
|
||||
|
||||
// Cat parts (3, 4, 6) attend strongly to each other
|
||||
const catParts = [3, 4, 6]
|
||||
const isSourceCat = catParts.includes(source)
|
||||
const isTargetCat = catParts.includes(target)
|
||||
|
||||
if (isSourceCat && isTargetCat) return 0.9 // Strong connection between cat parts
|
||||
|
||||
|
||||
// Cat interacts with Yarn (7)
|
||||
if (isSourceCat && target === 7) return 0.7
|
||||
if (source === 7 && isTargetCat) return 0.7
|
||||
@@ -94,7 +106,7 @@ const getAttentionWeight = (source, target) => {
|
||||
// Background parts attend to each other
|
||||
const bgParts = [0, 1, 2, 5, 8]
|
||||
if (bgParts.includes(source) && bgParts.includes(target)) return 0.5
|
||||
|
||||
|
||||
return 0.1 // Weak attention otherwise
|
||||
}
|
||||
|
||||
@@ -175,10 +187,11 @@ const getTopAttentions = (source) => {
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.grid-cell:hover, .grid-cell.active {
|
||||
.grid-cell:hover,
|
||||
.grid-cell.active {
|
||||
border-color: var(--vp-c-brand);
|
||||
transform: scale(1.05);
|
||||
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
||||
background: var(--vp-c-bg-mute);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
<template>
|
||||
<div class="feature-alignment-demo">
|
||||
<div class="header">
|
||||
<div class="title">阶段一:特征对齐 (Feature Alignment / Pre-training)</div>
|
||||
<div class="title">
|
||||
阶段一:特征对齐 (Feature Alignment / Pre-training)
|
||||
</div>
|
||||
<div class="desc">
|
||||
目标:让 Projector 学会“翻译”图像语言。
|
||||
<br>做法:冻结 ViT 和 LLM,只训练 Projector。
|
||||
<br />做法:冻结 ViT 和 LLM,只训练 Projector。
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -13,11 +15,11 @@
|
||||
<div class="data-column">
|
||||
<div class="data-item image-data">
|
||||
<div class="data-icon">🖼️</div>
|
||||
<div class="data-label">图片<br>(猫)</div>
|
||||
<div class="data-label">图片<br />(猫)</div>
|
||||
</div>
|
||||
<div class="data-item text-data">
|
||||
<div class="data-icon">📝</div>
|
||||
<div class="data-label">标题<br>("一只猫")</div>
|
||||
<div class="data-label">标题<br />("一只猫")</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -35,7 +37,7 @@
|
||||
<div class="block-icon">👁️</div>
|
||||
<div class="block-name">ViT</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="arrow-small">➜</div>
|
||||
|
||||
<div class="model-block training">
|
||||
@@ -64,7 +66,7 @@
|
||||
<div class="vector-icon">🟢</div>
|
||||
<div class="vector-label">向量 V</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="loss-connection">
|
||||
<div class="loss-line"></div>
|
||||
<div class="loss-box" :class="{ active: isCalculatingLoss }">
|
||||
@@ -107,21 +109,31 @@ const nextStep = () => {
|
||||
|
||||
const buttonText = computed(() => {
|
||||
switch (step.value) {
|
||||
case 0: return '开始训练演示'
|
||||
case 1: return '下一步:计算 Loss'
|
||||
case 2: return '下一步:反向传播'
|
||||
case 3: return '完成并重置'
|
||||
default: return '开始'
|
||||
case 0:
|
||||
return '开始训练演示'
|
||||
case 1:
|
||||
return '下一步:计算 Loss'
|
||||
case 2:
|
||||
return '下一步:反向传播'
|
||||
case 3:
|
||||
return '完成并重置'
|
||||
default:
|
||||
return '开始'
|
||||
}
|
||||
})
|
||||
|
||||
const currentStepDesc = computed(() => {
|
||||
switch (step.value) {
|
||||
case 0: return '准备就绪。点击按钮开始模拟一次训练迭代。'
|
||||
case 1: return '前向传播:图片经过 ViT (冻结) 和 Projector (训练) 得到向量 V;文本经过 LLM (冻结) 得到向量 T。'
|
||||
case 2: return '计算 Loss:比较向量 V 和向量 T 的相似度。目标是让它们尽可能接近。'
|
||||
case 3: return '反向传播:根据 Loss 更新 Projector 的参数。注意 ViT 和 LLM 不会更新!'
|
||||
default: return ''
|
||||
case 0:
|
||||
return '准备就绪。点击按钮开始模拟一次训练迭代。'
|
||||
case 1:
|
||||
return '前向传播:图片经过 ViT (冻结) 和 Projector (训练) 得到向量 V;文本经过 LLM (冻结) 得到向量 T。'
|
||||
case 2:
|
||||
return '计算 Loss:比较向量 V 和向量 T 的相似度。目标是让它们尽可能接近。'
|
||||
case 3:
|
||||
return '反向传播:根据 Loss 更新 Projector 的参数。注意 ViT 和 LLM 不会更新!'
|
||||
default:
|
||||
return ''
|
||||
}
|
||||
})
|
||||
|
||||
@@ -135,7 +147,8 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
border-radius: 12px;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
font-family:
|
||||
-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
}
|
||||
|
||||
.header {
|
||||
@@ -184,8 +197,14 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
width: 60px;
|
||||
}
|
||||
|
||||
.data-icon { font-size: 24px; }
|
||||
.data-label { font-size: 10px; text-align: center; margin-top: 4px; }
|
||||
.data-icon {
|
||||
font-size: 24px;
|
||||
}
|
||||
.data-label {
|
||||
font-size: 10px;
|
||||
text-align: center;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
/* Arrow Column */
|
||||
.arrow-column {
|
||||
@@ -200,9 +219,9 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
.model-column {
|
||||
display: grid;
|
||||
grid-template-columns: auto auto auto;
|
||||
grid-template-areas:
|
||||
"vit arrow proj"
|
||||
"llm llm llm";
|
||||
grid-template-areas:
|
||||
'vit arrow proj'
|
||||
'llm llm llm';
|
||||
gap: 10px;
|
||||
row-gap: 30px;
|
||||
align-items: center;
|
||||
@@ -262,8 +281,14 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.block-icon { font-size: 20px; margin-bottom: 4px; }
|
||||
.block-name { font-size: 12px; font-weight: bold; }
|
||||
.block-icon {
|
||||
font-size: 20px;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.block-name {
|
||||
font-size: 12px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.arrow-small {
|
||||
grid-area: arrow;
|
||||
@@ -316,8 +341,15 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
box-shadow: 0 0 10px rgba(255, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.loss-label { font-size: 12px; font-weight: bold; color: var(--vp-c-danger); }
|
||||
.loss-desc { font-size: 10px; color: var(--vp-c-text-2); }
|
||||
.loss-label {
|
||||
font-size: 12px;
|
||||
font-weight: bold;
|
||||
color: var(--vp-c-danger);
|
||||
}
|
||||
.loss-desc {
|
||||
font-size: 10px;
|
||||
color: var(--vp-c-text-2);
|
||||
}
|
||||
|
||||
/* Controls */
|
||||
.controls {
|
||||
@@ -356,9 +388,15 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0.4); }
|
||||
70% { box-shadow: 0 0 0 10px rgba(var(--vp-c-brand-rgb), 0); }
|
||||
100% { box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0); }
|
||||
0% {
|
||||
box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0.4);
|
||||
}
|
||||
70% {
|
||||
box-shadow: 0 0 0 10px rgba(var(--vp-c-brand-rgb), 0);
|
||||
}
|
||||
100% {
|
||||
box-shadow: 0 0 0 0 rgba(var(--vp-c-brand-rgb), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 600px) {
|
||||
@@ -388,4 +426,4 @@ const isCalculatingLoss = computed(() => step.value === 2)
|
||||
height: 1px;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</style>
|
||||
|
||||
@@ -5,7 +5,12 @@
|
||||
<div class="step-box">
|
||||
<div class="label">1. Patch (4x4)</div>
|
||||
<div class="grid-patch">
|
||||
<div v-for="n in 16" :key="n" class="pixel" :style="{ backgroundColor: getPixelColor(n) }"></div>
|
||||
<div
|
||||
v-for="n in 16"
|
||||
:key="n"
|
||||
class="pixel"
|
||||
:style="{ backgroundColor: getPixelColor(n) }"
|
||||
></div>
|
||||
</div>
|
||||
<div class="desc">768 像素点</div>
|
||||
</div>
|
||||
@@ -16,7 +21,12 @@
|
||||
<div class="step-box">
|
||||
<div class="label">2. Flatten</div>
|
||||
<div class="vector-container">
|
||||
<div v-for="n in 16" :key="n" class="vector-cell" :style="{ backgroundColor: getPixelColor(n) }"></div>
|
||||
<div
|
||||
v-for="n in 16"
|
||||
:key="n"
|
||||
class="vector-cell"
|
||||
:style="{ backgroundColor: getPixelColor(n) }"
|
||||
></div>
|
||||
</div>
|
||||
<div class="desc">拉平成向量</div>
|
||||
</div>
|
||||
@@ -38,8 +48,8 @@
|
||||
<script setup>
|
||||
const getPixelColor = (n) => {
|
||||
// Generate a gradient of colors
|
||||
const hue = (n * 20) % 360;
|
||||
return `hsl(${hue}, 70%, 60%)`;
|
||||
const hue = (n * 20) % 360
|
||||
return `hsl(${hue}, 70%, 60%)`
|
||||
}
|
||||
</script>
|
||||
|
||||
|
||||
+86
-35
@@ -14,15 +14,15 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="status-desc">
|
||||
{{ isVLM
|
||||
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
|
||||
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
|
||||
{{
|
||||
isVLM
|
||||
? '给大脑装上眼睛:视觉信号经过翻译,变成 Token 混入文字流。'
|
||||
: '纯文本大脑:只能听懂 Token 语言,无法感知图像。'
|
||||
}}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="diagram-stage" :class="{ 'vlm-mode': isVLM }">
|
||||
|
||||
<!-- Vision Pipeline (Only visible in VLM mode) -->
|
||||
<div class="pipeline vision-pipeline">
|
||||
<div class="node-group">
|
||||
@@ -31,12 +31,18 @@
|
||||
<span class="label">Image</span>
|
||||
</div>
|
||||
<div class="flow-arrow">⬇</div>
|
||||
<div class="node process-node vit-node" title="Vision Transformer: The Eye">
|
||||
<div
|
||||
class="node process-node vit-node"
|
||||
title="Vision Transformer: The Eye"
|
||||
>
|
||||
<span class="icon">👁️</span>
|
||||
<span class="label">ViT</span>
|
||||
</div>
|
||||
<div class="flow-arrow">⬇</div>
|
||||
<div class="node adapter-node projector-node" title="Projector: The Translator">
|
||||
<div
|
||||
class="node adapter-node projector-node"
|
||||
title="Projector: The Translator"
|
||||
>
|
||||
<span class="icon">🔌</span>
|
||||
<span class="label">Projector</span>
|
||||
</div>
|
||||
@@ -56,7 +62,7 @@
|
||||
<span class="icon">�</span>
|
||||
<span class="label">Embed</span>
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Merge Point Visualization -->
|
||||
<div class="merge-point" :class="{ active: isVLM }">
|
||||
<div class="plus-icon">+</div>
|
||||
@@ -80,20 +86,33 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="interactive-info">
|
||||
<div class="info-card" v-if="!isVLM">
|
||||
<h3>Standard LLM Flow</h3>
|
||||
<p>Text is converted into vectors (Embeddings) and processed by the Transformer to predict the next word.</p>
|
||||
<p>
|
||||
Text is converted into vectors (Embeddings) and processed by the
|
||||
Transformer to predict the next word.
|
||||
</p>
|
||||
</div>
|
||||
<div class="info-card vlm-info" v-else>
|
||||
<h3>VLM = LLM + Vision Encoder</h3>
|
||||
<ul>
|
||||
<li><strong>ViT (The Eye):</strong> Slices image into patches and extracts features.</li>
|
||||
<li><strong>Projector (The Translator):</strong> Converts visual features into the same "language" (vector dimension) as text embeddings.</li>
|
||||
<li><strong>Concatenation:</strong> The translated visual tokens are pasted <em>before</em> the text tokens. The LLM sees them as "foreign words" it learned to understand.</li>
|
||||
<li>
|
||||
<strong>ViT (The Eye):</strong> Slices image into patches and
|
||||
extracts features.
|
||||
</li>
|
||||
<li>
|
||||
<strong>Projector (The Translator):</strong> Converts visual
|
||||
features into the same "language" (vector dimension) as text
|
||||
embeddings.
|
||||
</li>
|
||||
<li>
|
||||
<strong>Concatenation:</strong> The translated visual tokens are
|
||||
pasted <em>before</em> the text tokens. The LLM sees them as
|
||||
"foreign words" it learned to understand.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
@@ -173,7 +192,7 @@ const toggleMode = () => {
|
||||
justify-content: center;
|
||||
font-size: 14px;
|
||||
transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.toggle-track.active .toggle-thumb {
|
||||
@@ -271,29 +290,41 @@ const toggleMode = () => {
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
min-width: 70px;
|
||||
box-shadow: 0 4px 6px rgba(0,0,0,0.05);
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
|
||||
position: relative;
|
||||
z-index: 2;
|
||||
}
|
||||
|
||||
.icon { font-size: 20px; margin-bottom: 4px; }
|
||||
.label { font-size: 11px; font-weight: bold; }
|
||||
.icon {
|
||||
font-size: 20px;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.label {
|
||||
font-size: 11px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.input-node { border-color: #aaa; }
|
||||
.process-node { border-color: var(--vp-c-brand-dimm); }
|
||||
.core-node {
|
||||
border-color: var(--vp-c-brand);
|
||||
.input-node {
|
||||
border-color: #aaa;
|
||||
}
|
||||
.process-node {
|
||||
border-color: var(--vp-c-brand-dimm);
|
||||
}
|
||||
.core-node {
|
||||
border-color: var(--vp-c-brand);
|
||||
background: var(--vp-c-brand-dimm);
|
||||
min-width: 100px;
|
||||
}
|
||||
.output-node { border-color: var(--vp-c-brand); }
|
||||
.output-node {
|
||||
border-color: var(--vp-c-brand);
|
||||
}
|
||||
|
||||
.vit-node {
|
||||
border-color: var(--vp-c-yellow);
|
||||
.vit-node {
|
||||
border-color: var(--vp-c-yellow);
|
||||
background: rgba(255, 197, 23, 0.05);
|
||||
}
|
||||
.projector-node {
|
||||
border-color: var(--vp-c-yellow);
|
||||
border-color: var(--vp-c-yellow);
|
||||
background: var(--vp-c-yellow-dimm);
|
||||
}
|
||||
|
||||
@@ -355,13 +386,26 @@ const toggleMode = () => {
|
||||
animation: pulse 1s infinite alternate;
|
||||
}
|
||||
|
||||
.t1 { animation-delay: 0s; }
|
||||
.t2 { animation-delay: 0.2s; }
|
||||
.v1 { background: var(--vp-c-yellow); animation-delay: 0.4s; }
|
||||
.t1 {
|
||||
animation-delay: 0s;
|
||||
}
|
||||
.t2 {
|
||||
animation-delay: 0.2s;
|
||||
}
|
||||
.v1 {
|
||||
background: var(--vp-c-yellow);
|
||||
animation-delay: 0.4s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
from { opacity: 0.3; transform: scale(0.8); }
|
||||
to { opacity: 1; transform: scale(1.1); }
|
||||
from {
|
||||
opacity: 0.3;
|
||||
transform: scale(0.8);
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: scale(1.1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Interactive Info */
|
||||
@@ -383,7 +427,8 @@ const toggleMode = () => {
|
||||
color: var(--vp-c-text-1);
|
||||
}
|
||||
|
||||
.info-card p, .info-card li {
|
||||
.info-card p,
|
||||
.info-card li {
|
||||
font-size: 13px;
|
||||
color: var(--vp-c-text-2);
|
||||
line-height: 1.6;
|
||||
@@ -395,8 +440,14 @@ const toggleMode = () => {
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from { opacity: 0; transform: translateY(5px); }
|
||||
to { opacity: 1; transform: translateY(0); }
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translateY(5px);
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Mobile Adjustments */
|
||||
@@ -404,15 +455,15 @@ const toggleMode = () => {
|
||||
.diagram-stage {
|
||||
height: 300px;
|
||||
}
|
||||
|
||||
|
||||
.text-pipeline {
|
||||
flex-wrap: wrap;
|
||||
gap: 10px;
|
||||
width: 90%;
|
||||
}
|
||||
|
||||
|
||||
.vision-pipeline {
|
||||
left: 10%;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</style>
|
||||
|
||||
@@ -20,11 +20,11 @@
|
||||
<div class="visual-area">
|
||||
<!-- 原始/切分视图容器 -->
|
||||
<div class="image-container" :class="{ 'is-patchified': isPatchified }">
|
||||
<div
|
||||
v-for="n in 196"
|
||||
:key="n"
|
||||
<div
|
||||
v-for="n in 196"
|
||||
:key="n"
|
||||
class="patch"
|
||||
:style="{
|
||||
:style="{
|
||||
'--delay': `${n * 0.005}s`,
|
||||
'--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}`
|
||||
}"
|
||||
@@ -32,16 +32,16 @@
|
||||
<span class="patch-id" v-if="isPatchified">{{ n }}</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="arrow" v-if="isPatchified">⬇</div>
|
||||
|
||||
|
||||
<!-- 线性序列视图 -->
|
||||
<div class="sequence-container" v-if="isPatchified">
|
||||
<div class="sequence-label">Flattened Sequence (Token Input)</div>
|
||||
<div class="token-stream">
|
||||
<div
|
||||
v-for="n in 196"
|
||||
:key="n"
|
||||
<div
|
||||
v-for="n in 196"
|
||||
:key="n"
|
||||
class="mini-patch"
|
||||
:style="{ '--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}` }"
|
||||
></div>
|
||||
@@ -144,7 +144,7 @@ const toggleState = () => {
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 8px;
|
||||
color: rgba(0,0,0,0.5);
|
||||
color: rgba(0, 0, 0, 0.5);
|
||||
transition: all 0.5s ease;
|
||||
}
|
||||
|
||||
@@ -198,12 +198,23 @@ const toggleState = () => {
|
||||
}
|
||||
|
||||
@keyframes bounce {
|
||||
0%, 100% { transform: translateY(0); }
|
||||
50% { transform: translateY(5px); }
|
||||
0%,
|
||||
100% {
|
||||
transform: translateY(0);
|
||||
}
|
||||
50% {
|
||||
transform: translateY(5px);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from { opacity: 0; transform: translateY(10px); }
|
||||
to { opacity: 1; transform: translateY(0); }
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translateY(10px);
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
<div class="grid-wrapper">
|
||||
<div class="grid-title">Feature Vectors</div>
|
||||
<div class="grid-box feature-grid">
|
||||
<div v-for="n in 9" :key="'f'+n" class="cell feature-cell">V</div>
|
||||
<div v-for="n in 9" :key="'f' + n" class="cell feature-cell">V</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
<div class="grid-wrapper">
|
||||
<div class="grid-title">Position Embeddings</div>
|
||||
<div class="grid-box pos-grid">
|
||||
<div v-for="n in 9" :key="'p'+n" class="cell pos-cell">{{ n }}</div>
|
||||
<div v-for="n in 9" :key="'p' + n" class="cell pos-cell">{{ n }}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -25,14 +25,16 @@
|
||||
<div class="grid-wrapper">
|
||||
<div class="grid-title">Input to Transformer</div>
|
||||
<div class="grid-box result-grid">
|
||||
<div v-for="n in 9" :key="'r'+n" class="cell result-cell">
|
||||
<span class="v">V</span><span class="plus">+</span><span class="p">{{ n }}</span>
|
||||
<div v-for="n in 9" :key="'r' + n" class="cell result-cell">
|
||||
<span class="v">V</span><span class="plus">+</span
|
||||
><span class="p">{{ n }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="caption">
|
||||
位置编码 (Position Embedding) 是一组可学习的向量,直接<b>加</b>在图像特征上。
|
||||
位置编码 (Position Embedding)
|
||||
是一组可学习的向量,直接<b>加</b>在图像特征上。
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
@@ -5,16 +5,10 @@
|
||||
<template>
|
||||
<div class="projector-demo">
|
||||
<div class="mode-switch">
|
||||
<button
|
||||
:class="{ active: mode === 'linear' }"
|
||||
@click="mode = 'linear'"
|
||||
>
|
||||
<button :class="{ active: mode === 'linear' }" @click="mode = 'linear'">
|
||||
Linear (LLaVA)
|
||||
</button>
|
||||
<button
|
||||
:class="{ active: mode === 'qformer' }"
|
||||
@click="mode = 'qformer'"
|
||||
>
|
||||
<button :class="{ active: mode === 'qformer' }" @click="mode = 'qformer'">
|
||||
Q-Former (BLIP-2)
|
||||
</button>
|
||||
</div>
|
||||
@@ -26,14 +20,18 @@
|
||||
<div class="token-container input">
|
||||
<div v-for="n in 16" :key="n" class="token visual"></div>
|
||||
</div>
|
||||
<div class="count">{{ mode === 'linear' ? '256 Tokens' : '256 Tokens' }}</div>
|
||||
<div class="count">
|
||||
{{ mode === 'linear' ? '256 Tokens' : '256 Tokens' }}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Process: The Projector -->
|
||||
<div class="stage connector">
|
||||
<div class="arrow-line"></div>
|
||||
<div class="projector-box" :class="mode">
|
||||
<div class="title">{{ mode === 'linear' ? 'Linear Layer' : 'Q-Former' }}</div>
|
||||
<div class="title">
|
||||
{{ mode === 'linear' ? 'Linear Layer' : 'Q-Former' }}
|
||||
</div>
|
||||
<div class="desc">
|
||||
{{ mode === 'linear' ? '直接映射 (1:1)' : '查询提取 (N:M)' }}
|
||||
</div>
|
||||
@@ -50,26 +48,32 @@
|
||||
<div class="stage">
|
||||
<div class="label">LLM Tokens</div>
|
||||
<div class="token-container output">
|
||||
<div
|
||||
v-for="n in (mode === 'linear' ? 16 : 4)"
|
||||
:key="n"
|
||||
<div
|
||||
v-for="n in mode === 'linear' ? 16 : 4"
|
||||
:key="n"
|
||||
class="token llm"
|
||||
></div>
|
||||
</div>
|
||||
<div class="count">
|
||||
{{ mode === 'linear' ? '256 Tokens (保留全部细节)' : '32 Tokens (只保留关键信息)' }}
|
||||
{{
|
||||
mode === 'linear'
|
||||
? '256 Tokens (保留全部细节)'
|
||||
: '32 Tokens (只保留关键信息)'
|
||||
}}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="explanation">
|
||||
<div v-if="mode === 'linear'">
|
||||
<strong>Linear Projector:</strong>
|
||||
简单高效。它像一个直译器,保留了所有的视觉信息,虽然 Token 数量多(计算量大),但对细节的把控更好。
|
||||
<strong>Linear Projector:</strong>
|
||||
简单高效。它像一个直译器,保留了所有的视觉信息,虽然 Token
|
||||
数量多(计算量大),但对细节的把控更好。
|
||||
</div>
|
||||
<div v-else>
|
||||
<strong>Q-Former:</strong>
|
||||
精细优雅。它使用一组“查询向量”主动去图像中提取与文本相关的信息。大大压缩了 Token 数量,让 LLM 跑得更快。
|
||||
<strong>Q-Former:</strong>
|
||||
精细优雅。它使用一组“查询向量”主动去图像中提取与文本相关的信息。大大压缩了
|
||||
Token 数量,让 LLM 跑得更快。
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -231,11 +235,20 @@ const mode = ref('linear')
|
||||
animation: pulse 1s infinite;
|
||||
}
|
||||
|
||||
.dot:nth-child(2) { animation-delay: 0.2s; }
|
||||
.dot:nth-child(3) { animation-delay: 0.4s; }
|
||||
.dot:nth-child(2) {
|
||||
animation-delay: 0.2s;
|
||||
}
|
||||
.dot:nth-child(3) {
|
||||
animation-delay: 0.4s;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 0.3; }
|
||||
50% { opacity: 1; }
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.3;
|
||||
}
|
||||
50% {
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -1,16 +1,10 @@
|
||||
<template>
|
||||
<div class="pipeline-demo">
|
||||
<div class="stage-switch">
|
||||
<button
|
||||
:class="{ active: stage === 1 }"
|
||||
@click="stage = 1"
|
||||
>
|
||||
<button :class="{ active: stage === 1 }" @click="stage = 1">
|
||||
阶段一:特征对齐
|
||||
</button>
|
||||
<button
|
||||
:class="{ active: stage === 2 }"
|
||||
@click="stage = 2"
|
||||
>
|
||||
<button :class="{ active: stage === 2 }" @click="stage = 2">
|
||||
阶段二:指令微调
|
||||
</button>
|
||||
</div>
|
||||
@@ -43,8 +37,13 @@
|
||||
<div class="arrow">➜</div>
|
||||
|
||||
<!-- LLM -->
|
||||
<div class="component-box llm" :class="{ frozen: stage === 1, training: stage === 2 }">
|
||||
<div class="status-badge">{{ stage === 1 ? '❄️ Frozen' : '🔥 Train' }}</div>
|
||||
<div
|
||||
class="component-box llm"
|
||||
:class="{ frozen: stage === 1, training: stage === 2 }"
|
||||
>
|
||||
<div class="status-badge">
|
||||
{{ stage === 1 ? '❄️ Frozen' : '🔥 Train' }}
|
||||
</div>
|
||||
<div class="name">LLM</div>
|
||||
<div class="desc">Language Model</div>
|
||||
</div>
|
||||
@@ -67,7 +66,10 @@
|
||||
<p>任务:让图像向量与文本向量距离变近。</p>
|
||||
</div>
|
||||
<div class="data-content" v-else>
|
||||
<code>User: <Image: 🐱> 这只猫在干嘛?<br/>Assistant: 它在睡觉。</code>
|
||||
<code
|
||||
>User: <Image: 🐱> 这只猫在干嘛?<br />Assistant:
|
||||
它在睡觉。</code
|
||||
>
|
||||
<p>任务:根据图像和问题生成回答。</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -12,9 +12,7 @@
|
||||
<div class="avatar">👤</div>
|
||||
<div class="bubble">
|
||||
<div class="image-upload">
|
||||
<div class="placeholder-img">
|
||||
🐱
|
||||
</div>
|
||||
<div class="placeholder-img">🐱</div>
|
||||
</div>
|
||||
<div class="text">这只猫在做什么?</div>
|
||||
</div>
|
||||
@@ -39,8 +37,8 @@
|
||||
</div>
|
||||
|
||||
<div class="controls">
|
||||
<button
|
||||
class="send-btn"
|
||||
<button
|
||||
class="send-btn"
|
||||
:disabled="step > 0 && step < 3"
|
||||
@click="startInference"
|
||||
>
|
||||
@@ -54,13 +52,13 @@
|
||||
import { ref, watch } from 'vue'
|
||||
|
||||
const step = ref(0)
|
||||
const fullText = "它正趴在窗台上晒太阳,看起来非常惬意。"
|
||||
const typedText = ref("")
|
||||
const fullText = '它正趴在窗台上晒太阳,看起来非常惬意。'
|
||||
const typedText = ref('')
|
||||
|
||||
const startInference = () => {
|
||||
step.value = 1
|
||||
typedText.value = ""
|
||||
|
||||
typedText.value = ''
|
||||
|
||||
// Step 1: Vision Encoding
|
||||
setTimeout(() => {
|
||||
step.value = 2
|
||||
@@ -93,7 +91,7 @@ const typeText = () => {
|
||||
overflow: hidden;
|
||||
max-width: 500px;
|
||||
margin: 20px auto;
|
||||
box-shadow: 0 4px 12px rgba(0,0,0,0.05);
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
|
||||
}
|
||||
|
||||
.chat-window {
|
||||
@@ -130,7 +128,7 @@ const typeText = () => {
|
||||
border-radius: 12px;
|
||||
border: 1px solid var(--vp-c-divider);
|
||||
max-width: 80%;
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,0.02);
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.02);
|
||||
}
|
||||
|
||||
.message.user .bubble {
|
||||
@@ -192,7 +190,12 @@ const typeText = () => {
|
||||
}
|
||||
|
||||
@keyframes blink {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0; }
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1;
|
||||
}
|
||||
50% {
|
||||
opacity: 0;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
<div class="stage">
|
||||
<div class="stage-label">1. Processed Patches (Grid)</div>
|
||||
<div class="grid-container">
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
:key="index"
|
||||
class="grid-item"
|
||||
:class="{ active: activeIndex === index }"
|
||||
@@ -24,10 +24,12 @@
|
||||
|
||||
<!-- 2. Feature Vector Sequence -->
|
||||
<div class="stage">
|
||||
<div class="stage-label">2. Feature Vector Sequence (The "Image Sentence")</div>
|
||||
<div class="stage-label">
|
||||
2. Feature Vector Sequence (The "Image Sentence")
|
||||
</div>
|
||||
<div class="vector-sequence">
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
<div
|
||||
v-for="(item, index) in items"
|
||||
:key="index"
|
||||
class="vector-wrapper"
|
||||
:class="{ active: activeIndex === index }"
|
||||
@@ -35,11 +37,26 @@
|
||||
>
|
||||
<div class="vector-col">
|
||||
<!-- Simulated vector dimensions -->
|
||||
<div class="v-cell" :style="{ opacity: 0.9, background: item.color }"></div>
|
||||
<div class="v-cell" :style="{ opacity: 0.7, background: item.color }"></div>
|
||||
<div class="v-cell" :style="{ opacity: 0.5, background: item.color }"></div>
|
||||
<div class="v-cell" :style="{ opacity: 0.8, background: item.color }"></div>
|
||||
<div class="v-cell" :style="{ opacity: 0.6, background: item.color }"></div>
|
||||
<div
|
||||
class="v-cell"
|
||||
:style="{ opacity: 0.9, background: item.color }"
|
||||
></div>
|
||||
<div
|
||||
class="v-cell"
|
||||
:style="{ opacity: 0.7, background: item.color }"
|
||||
></div>
|
||||
<div
|
||||
class="v-cell"
|
||||
:style="{ opacity: 0.5, background: item.color }"
|
||||
></div>
|
||||
<div
|
||||
class="v-cell"
|
||||
:style="{ opacity: 0.8, background: item.color }"
|
||||
></div>
|
||||
<div
|
||||
class="v-cell"
|
||||
:style="{ opacity: 0.6, background: item.color }"
|
||||
></div>
|
||||
</div>
|
||||
<div class="vector-idx">{{ index + 1 }}</div>
|
||||
</div>
|
||||
@@ -53,7 +70,10 @@
|
||||
<div class="header" :style="{ borderColor: items[activeIndex].color }">
|
||||
<span class="large-icon">{{ items[activeIndex].icon }}</span>
|
||||
<div class="title-group">
|
||||
<span class="title">Token #{{ activeIndex + 1 }}: {{ items[activeIndex].label }}</span>
|
||||
<span class="title"
|
||||
>Token #{{ activeIndex + 1 }}:
|
||||
{{ items[activeIndex].label }}</span
|
||||
>
|
||||
<span class="subtitle">Type: {{ items[activeIndex].type }}</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -61,7 +81,9 @@
|
||||
<div class="vector-repr">
|
||||
<span class="label">Vector Value:</span>
|
||||
<span class="code" :style="{ color: items[activeIndex].color }">
|
||||
[0.{{ (Math.random()*99).toFixed(0) }}, -0.{{ (Math.random()*99).toFixed(0) }}, 1.{{ (Math.random()*99).toFixed(0) }}, ...]
|
||||
[0.{{ (Math.random() * 99).toFixed(0) }}, -0.{{
|
||||
(Math.random() * 99).toFixed(0)
|
||||
}}, 1.{{ (Math.random() * 99).toFixed(0) }}, ...]
|
||||
</span>
|
||||
</div>
|
||||
<div class="meaning">
|
||||
@@ -71,8 +93,10 @@
|
||||
</div>
|
||||
</div>
|
||||
<div v-else class="placeholder">
|
||||
<span class="hint-icon">👆</span>
|
||||
<span class="hint-text">悬停在上方方块或向量上,查看 ViT 输出的“语义特征”</span>
|
||||
<span class="hint-icon">👆</span>
|
||||
<span class="hint-text"
|
||||
>悬停在上方方块或向量上,查看 ViT 输出的“语义特征”</span
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -84,15 +108,69 @@ import { ref } from 'vue'
|
||||
const activeIndex = ref(-1)
|
||||
|
||||
const items = [
|
||||
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Recognized as outdoor nature elements (Trees/Greenery). Low relevance to main subject.' },
|
||||
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Redundant background info. Contextualizes the scene as "Outdoors".' },
|
||||
{ icon: '☁️', label: 'Sky', type: 'Environment', color: '#2196f3', desc: 'Spatial context: Upper region, open area.' },
|
||||
{ icon: '👂', label: 'Cat Ear', type: 'Subject Part', color: '#ff9800', desc: 'High Importance. Identified as "Feline Feature". Strongly linked to "Cat Face".' },
|
||||
{ icon: '😼', label: 'Cat Face', type: 'Subject Core', color: '#ff5722', desc: 'Global Focus Center. Contains "Eyes", "Whiskers". Aggregates info from surrounding patches.' },
|
||||
{ icon: '🌲', label: 'Background', type: 'Environment', color: '#4caf50', desc: 'Background noise.' },
|
||||
{ icon: '🐾', label: 'Cat Paw', type: 'Subject Part', color: '#ff9800', desc: 'Action component. Suggests "Standing" or "Walking" posture.' },
|
||||
{ icon: '🧶', label: 'Yarn', type: 'Object', color: '#e91e63', desc: 'Interacting Object. Semantically linked to "Play" or "Toy".' },
|
||||
{ icon: '🌱', label: 'Grass', type: 'Environment', color: '#8bc34a', desc: 'Ground context. Confirms "Ground level" view.' }
|
||||
{
|
||||
icon: '🌲',
|
||||
label: 'Background',
|
||||
type: 'Environment',
|
||||
color: '#4caf50',
|
||||
desc: 'Recognized as outdoor nature elements (Trees/Greenery). Low relevance to main subject.'
|
||||
},
|
||||
{
|
||||
icon: '🌲',
|
||||
label: 'Background',
|
||||
type: 'Environment',
|
||||
color: '#4caf50',
|
||||
desc: 'Redundant background info. Contextualizes the scene as "Outdoors".'
|
||||
},
|
||||
{
|
||||
icon: '☁️',
|
||||
label: 'Sky',
|
||||
type: 'Environment',
|
||||
color: '#2196f3',
|
||||
desc: 'Spatial context: Upper region, open area.'
|
||||
},
|
||||
{
|
||||
icon: '👂',
|
||||
label: 'Cat Ear',
|
||||
type: 'Subject Part',
|
||||
color: '#ff9800',
|
||||
desc: 'High Importance. Identified as "Feline Feature". Strongly linked to "Cat Face".'
|
||||
},
|
||||
{
|
||||
icon: '😼',
|
||||
label: 'Cat Face',
|
||||
type: 'Subject Core',
|
||||
color: '#ff5722',
|
||||
desc: 'Global Focus Center. Contains "Eyes", "Whiskers". Aggregates info from surrounding patches.'
|
||||
},
|
||||
{
|
||||
icon: '🌲',
|
||||
label: 'Background',
|
||||
type: 'Environment',
|
||||
color: '#4caf50',
|
||||
desc: 'Background noise.'
|
||||
},
|
||||
{
|
||||
icon: '🐾',
|
||||
label: 'Cat Paw',
|
||||
type: 'Subject Part',
|
||||
color: '#ff9800',
|
||||
desc: 'Action component. Suggests "Standing" or "Walking" posture.'
|
||||
},
|
||||
{
|
||||
icon: '🧶',
|
||||
label: 'Yarn',
|
||||
type: 'Object',
|
||||
color: '#e91e63',
|
||||
desc: 'Interacting Object. Semantically linked to "Play" or "Toy".'
|
||||
},
|
||||
{
|
||||
icon: '🌱',
|
||||
label: 'Grass',
|
||||
type: 'Environment',
|
||||
color: '#8bc34a',
|
||||
desc: 'Ground context. Confirms "Ground level" view.'
|
||||
}
|
||||
]
|
||||
</script>
|
||||
|
||||
@@ -102,7 +180,10 @@ const items = [
|
||||
border: 1px solid #e9ecef;
|
||||
border-radius: 12px;
|
||||
padding: 24px;
|
||||
font-family: system-ui, -apple-system, sans-serif;
|
||||
font-family:
|
||||
system-ui,
|
||||
-apple-system,
|
||||
sans-serif;
|
||||
max-width: 700px;
|
||||
margin: 20px auto;
|
||||
}
|
||||
@@ -143,7 +224,7 @@ const items = [
|
||||
background: #fff;
|
||||
padding: 8px;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.05);
|
||||
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
||||
}
|
||||
.dark .grid-container {
|
||||
background: #252529;
|
||||
@@ -165,12 +246,14 @@ const items = [
|
||||
background: #343a40;
|
||||
}
|
||||
|
||||
.grid-item:hover, .grid-item.active {
|
||||
.grid-item:hover,
|
||||
.grid-item.active {
|
||||
background: #e7f5ff;
|
||||
transform: scale(1.1);
|
||||
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
||||
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
.dark .grid-item:hover, .dark .grid-item.active {
|
||||
.dark .grid-item:hover,
|
||||
.dark .grid-item.active {
|
||||
background: #1c7ed6;
|
||||
}
|
||||
|
||||
@@ -194,7 +277,7 @@ const items = [
|
||||
padding: 10px;
|
||||
background: #fff;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 12px rgba(0,0,0,0.05);
|
||||
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.05);
|
||||
overflow-x: auto;
|
||||
max-width: 100%;
|
||||
}
|
||||
@@ -213,11 +296,13 @@ const items = [
|
||||
transition: background 0.2s;
|
||||
}
|
||||
|
||||
.vector-wrapper:hover, .vector-wrapper.active {
|
||||
background: rgba(0,0,0,0.05);
|
||||
.vector-wrapper:hover,
|
||||
.vector-wrapper.active {
|
||||
background: rgba(0, 0, 0, 0.05);
|
||||
}
|
||||
.dark .vector-wrapper:hover, .dark .vector-wrapper.active {
|
||||
background: rgba(255,255,255,0.1);
|
||||
.dark .vector-wrapper:hover,
|
||||
.dark .vector-wrapper.active {
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
}
|
||||
|
||||
.vector-col {
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
</div>
|
||||
|
||||
<div class="scenario-tabs">
|
||||
<button
|
||||
v-for="s in scenarios"
|
||||
<button
|
||||
v-for="s in scenarios"
|
||||
:key="s.id"
|
||||
class="tab-btn"
|
||||
:class="{ active: currentScenario === s.id }"
|
||||
@@ -20,39 +20,59 @@
|
||||
<div class="demo-container">
|
||||
<!-- Image Area -->
|
||||
<div class="image-area">
|
||||
<div class="image-placeholder" :class="{ loaded: hasImage, 'receipt-bg': currentScenario === 'ocr' }">
|
||||
<div
|
||||
class="image-placeholder"
|
||||
:class="{ loaded: hasImage, 'receipt-bg': currentScenario === 'ocr' }"
|
||||
>
|
||||
<div v-if="!hasImage" class="upload-prompt">
|
||||
<div class="icon">🖼️</div>
|
||||
<button class="upload-btn" @click="loadImage">
|
||||
上传图片 (模拟)
|
||||
</button>
|
||||
</div>
|
||||
|
||||
|
||||
<div v-else class="image-content">
|
||||
<!-- Chat: Landscape -->
|
||||
<div v-if="currentScenario === 'chat'" class="real-image-container landscape">
|
||||
<div
|
||||
v-if="currentScenario === 'chat'"
|
||||
class="real-image-container landscape"
|
||||
>
|
||||
<div class="real-image">🏔️</div>
|
||||
<div class="sun">☀️</div>
|
||||
<div class="tree">🌲</div>
|
||||
</div>
|
||||
|
||||
<!-- Detection: Fruits -->
|
||||
<div v-else-if="currentScenario === 'detection'" class="real-image-container fruits">
|
||||
<div
|
||||
v-else-if="currentScenario === 'detection'"
|
||||
class="real-image-container fruits"
|
||||
>
|
||||
<div class="real-image">
|
||||
<span class="fruit apple">🍎</span>
|
||||
<span class="fruit banana">🍌</span>
|
||||
<span class="fruit grape">🍇</span>
|
||||
</div>
|
||||
<div v-if="showBoundingBox" class="bounding-box apple-box" title="Apple">
|
||||
<div
|
||||
v-if="showBoundingBox"
|
||||
class="bounding-box apple-box"
|
||||
title="Apple"
|
||||
>
|
||||
<span class="box-label">apple: 0.98</span>
|
||||
</div>
|
||||
<div v-if="showBoundingBox" class="bounding-box banana-box" title="Banana">
|
||||
<div
|
||||
v-if="showBoundingBox"
|
||||
class="bounding-box banana-box"
|
||||
title="Banana"
|
||||
>
|
||||
<span class="box-label">banana: 0.95</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Analysis: Factory Safety -->
|
||||
<div v-else-if="currentScenario === 'analysis'" class="factory-image">
|
||||
<div
|
||||
v-else-if="currentScenario === 'analysis'"
|
||||
class="factory-image"
|
||||
>
|
||||
<div class="safety-sign">⚠️ 安全生产</div>
|
||||
<div class="worker-container">
|
||||
<span class="worker">👷</span>
|
||||
@@ -67,7 +87,9 @@
|
||||
<div class="receipt-body">
|
||||
<div class="line"><span>Coffee</span><span>$4.50</span></div>
|
||||
<div class="line"><span>Bagel</span><span>$3.00</span></div>
|
||||
<div class="line total"><span>TOTAL</span><span>$7.50</span></div>
|
||||
<div class="line total">
|
||||
<span>TOTAL</span><span>$7.50</span>
|
||||
</div>
|
||||
<div class="line date"><span>2023-10-24</span></div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -85,29 +107,45 @@
|
||||
<div v-if="messages.length === 0" class="empty-text">
|
||||
{{ hasImage ? '图片已就绪,请选择指令' : '请先上传图片' }}
|
||||
</div>
|
||||
<div v-for="(msg, index) in messages" :key="index" class="message" :class="msg.role">
|
||||
<div
|
||||
v-for="(msg, index) in messages"
|
||||
:key="index"
|
||||
class="message"
|
||||
:class="msg.role"
|
||||
>
|
||||
<div class="content">
|
||||
<div v-if="msg.isJson" class="json-content">
|
||||
<pre>{{ msg.content }}</pre>
|
||||
</div>
|
||||
<span v-else>{{ msg.content }}</span>
|
||||
<span v-if="msg.role === 'assistant' && isGenerating && index === messages.length - 1" class="cursor">|</span>
|
||||
<span
|
||||
v-if="
|
||||
msg.role === 'assistant' &&
|
||||
isGenerating &&
|
||||
index === messages.length - 1
|
||||
"
|
||||
class="cursor"
|
||||
>|</span
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="input-area">
|
||||
<div class="quick-actions" v-if="hasImage && !isGenerating">
|
||||
<button v-for="q in currentQuestions" :key="q" @click="ask(q)" class="action-btn">
|
||||
<button
|
||||
v-for="q in currentQuestions"
|
||||
:key="q"
|
||||
@click="ask(q)"
|
||||
class="action-btn"
|
||||
>
|
||||
{{ q }}
|
||||
</button>
|
||||
</div>
|
||||
<div class="status-text" v-else-if="isGenerating">
|
||||
AI 正在观察图片并思考...
|
||||
</div>
|
||||
<div class="status-text" v-else>
|
||||
等待图片上传...
|
||||
</div>
|
||||
<div class="status-text" v-else>等待图片上传...</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -132,75 +170,80 @@ const messages = ref([])
|
||||
const messagesRef = ref(null)
|
||||
|
||||
const questionsMap = {
|
||||
chat: [
|
||||
"这里是哪里?",
|
||||
"描述一下天气",
|
||||
"写首关于这座山的诗"
|
||||
],
|
||||
detection: [
|
||||
"检测图中的水果",
|
||||
"数数有几个苹果",
|
||||
"输出检测框坐标"
|
||||
],
|
||||
ocr: [
|
||||
"提取所有文字",
|
||||
"总金额是多少?",
|
||||
"消费日期是哪天?"
|
||||
],
|
||||
analysis: [
|
||||
"工人是否佩戴安全帽?",
|
||||
"检测现场安全隐患",
|
||||
"输出风险评估报告"
|
||||
]
|
||||
chat: ['这里是哪里?', '描述一下天气', '写首关于这座山的诗'],
|
||||
detection: ['检测图中的水果', '数数有几个苹果', '输出检测框坐标'],
|
||||
ocr: ['提取所有文字', '总金额是多少?', '消费日期是哪天?'],
|
||||
analysis: ['工人是否佩戴安全帽?', '检测现场安全隐患', '输出风险评估报告']
|
||||
}
|
||||
|
||||
const answersMap = {
|
||||
chat: {
|
||||
"这里是哪里?": "这是一张高山风景照。远处是覆盖着皑皑白雪的山峰,可能是阿尔卑斯山或喜马拉雅山脉。山脚下有郁郁葱葱的松树林。",
|
||||
"描述一下天气": "天气看起来非常晴朗,阳光明媚(☀️),能见度很高。蓝天白云,是一个适合登山或滑雪的好天气。",
|
||||
"写首关于这座山的诗": "🏔️ 雪岭插云天,\n🌲 松涛响翠烟。\n☀️ 金阳融冷色,\n🏞️ 壮丽入心田。"
|
||||
'这里是哪里?':
|
||||
'这是一张高山风景照。远处是覆盖着皑皑白雪的山峰,可能是阿尔卑斯山或喜马拉雅山脉。山脚下有郁郁葱葱的松树林。',
|
||||
描述一下天气:
|
||||
'天气看起来非常晴朗,阳光明媚(☀️),能见度很高。蓝天白云,是一个适合登山或滑雪的好天气。',
|
||||
写首关于这座山的诗:
|
||||
'🏔️ 雪岭插云天,\n🌲 松涛响翠烟。\n☀️ 金阳融冷色,\n🏞️ 壮丽入心田。'
|
||||
},
|
||||
detection: {
|
||||
"检测图中的水果": {
|
||||
type: 'json',
|
||||
text: JSON.stringify({ objects: ['apple', 'banana', 'grape'], count: 3 }, null, 2),
|
||||
检测图中的水果: {
|
||||
type: 'json',
|
||||
text: JSON.stringify(
|
||||
{ objects: ['apple', 'banana', 'grape'], count: 3 },
|
||||
null,
|
||||
2
|
||||
),
|
||||
action: 'showBox'
|
||||
},
|
||||
"数数有几个苹果": "图中检测到 1 个苹果(🍎)。",
|
||||
"输出检测框坐标": {
|
||||
数数有几个苹果: '图中检测到 1 个苹果(🍎)。',
|
||||
输出检测框坐标: {
|
||||
type: 'json',
|
||||
text: JSON.stringify({
|
||||
objects: [
|
||||
{ label: 'apple', box: [15, 15, 85, 85] },
|
||||
{ label: 'banana', box: [95, 15, 165, 85] }
|
||||
]
|
||||
}, null, 2),
|
||||
text: JSON.stringify(
|
||||
{
|
||||
objects: [
|
||||
{ label: 'apple', box: [15, 15, 85, 85] },
|
||||
{ label: 'banana', box: [95, 15, 165, 85] }
|
||||
]
|
||||
},
|
||||
null,
|
||||
2
|
||||
),
|
||||
action: 'showBox'
|
||||
}
|
||||
},
|
||||
ocr: {
|
||||
"提取所有文字": {
|
||||
提取所有文字: {
|
||||
type: 'json',
|
||||
text: JSON.stringify({
|
||||
lines: [
|
||||
"RECEIPT",
|
||||
"Coffee $4.50",
|
||||
"Bagel $3.00",
|
||||
"TOTAL $7.50",
|
||||
"2023-10-24"
|
||||
]
|
||||
}, null, 2)
|
||||
text: JSON.stringify(
|
||||
{
|
||||
lines: [
|
||||
'RECEIPT',
|
||||
'Coffee $4.50',
|
||||
'Bagel $3.00',
|
||||
'TOTAL $7.50',
|
||||
'2023-10-24'
|
||||
]
|
||||
},
|
||||
null,
|
||||
2
|
||||
)
|
||||
},
|
||||
"总金额是多少?": "这张小票的总金额是 $7.50。",
|
||||
"消费日期是哪天?": "消费日期是 2023年10月24日。"
|
||||
'总金额是多少?': '这张小票的总金额是 $7.50。',
|
||||
'消费日期是哪天?': '消费日期是 2023年10月24日。'
|
||||
},
|
||||
analysis: {
|
||||
"工人是否佩戴安全帽?": "检测到画面中有一名工人(👷),已正确佩戴红色安全帽(⛑️)。",
|
||||
"检测现场安全隐患": {
|
||||
'工人是否佩戴安全帽?':
|
||||
'检测到画面中有一名工人(👷),已正确佩戴红色安全帽(⛑️)。',
|
||||
检测现场安全隐患: {
|
||||
type: 'json',
|
||||
text: JSON.stringify({ hazards: [], safety_score: 100, status: "SAFE" }, null, 2)
|
||||
text: JSON.stringify(
|
||||
{ hazards: [], safety_score: 100, status: 'SAFE' },
|
||||
null,
|
||||
2
|
||||
)
|
||||
},
|
||||
"输出风险评估报告": "✅ **安全合规**\n- 人员:1人\n- 防护装备:齐全\n- 机械设备:正常运行中\n- 风险等级:低"
|
||||
输出风险评估报告:
|
||||
'✅ **安全合规**\n- 人员:1人\n- 防护装备:齐全\n- 机械设备:正常运行中\n- 风险等级:低'
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,7 +257,9 @@ const getImageLabel = () => {
|
||||
return map[currentScenario.value]
|
||||
}
|
||||
|
||||
const currentQuestions = computed(() => questionsMap[currentScenario.value] || [])
|
||||
const currentQuestions = computed(
|
||||
() => questionsMap[currentScenario.value] || []
|
||||
)
|
||||
|
||||
const switchScenario = (id) => {
|
||||
currentScenario.value = id
|
||||
@@ -232,16 +277,16 @@ const loadImage = () => {
|
||||
const ask = async (question) => {
|
||||
messages.value.push({ role: 'user', content: question })
|
||||
isGenerating.value = true
|
||||
|
||||
|
||||
await wait(800) // Simulate vision encoding time
|
||||
|
||||
|
||||
const scenarioAnswers = answersMap[currentScenario.value]
|
||||
const rawAnswer = scenarioAnswers[question] || "我还在学习这个任务..."
|
||||
|
||||
const rawAnswer = scenarioAnswers[question] || '我还在学习这个任务...'
|
||||
|
||||
let content = ''
|
||||
let isJson = false
|
||||
let action = null
|
||||
|
||||
|
||||
if (typeof rawAnswer === 'object') {
|
||||
content = rawAnswer.text
|
||||
isJson = rawAnswer.type === 'json'
|
||||
@@ -249,10 +294,10 @@ const ask = async (question) => {
|
||||
} else {
|
||||
content = rawAnswer
|
||||
}
|
||||
|
||||
|
||||
messages.value.push({ role: 'assistant', content: '', isJson })
|
||||
const answerIdx = messages.value.length - 1
|
||||
|
||||
|
||||
// Streaming effect
|
||||
const stepSize = isJson ? 5 : 1 // JSON types faster
|
||||
for (let i = 0; i < content.length; i += stepSize) {
|
||||
@@ -260,15 +305,15 @@ const ask = async (question) => {
|
||||
scrollToBottom()
|
||||
await wait(20)
|
||||
}
|
||||
|
||||
|
||||
if (action === 'showBox') {
|
||||
showBoundingBox.value = true
|
||||
}
|
||||
|
||||
|
||||
isGenerating.value = false
|
||||
}
|
||||
|
||||
const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms))
|
||||
const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
|
||||
|
||||
const scrollToBottom = () => {
|
||||
nextTick(() => {
|
||||
@@ -286,7 +331,8 @@ const scrollToBottom = () => {
|
||||
border-radius: 12px;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||
font-family:
|
||||
-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
}
|
||||
|
||||
.header {
|
||||
@@ -413,7 +459,7 @@ const scrollToBottom = () => {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: linear-gradient(to bottom, #87CEEB 50%, #e0e0e0 50%);
|
||||
background: linear-gradient(to bottom, #87ceeb 50%, #e0e0e0 50%);
|
||||
border-radius: 8px;
|
||||
overflow: hidden;
|
||||
position: absolute;
|
||||
@@ -563,7 +609,7 @@ const scrollToBottom = () => {
|
||||
background: white;
|
||||
padding: 15px;
|
||||
width: 160px;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
font-family: 'Courier New', Courier, monospace;
|
||||
font-size: 11px;
|
||||
text-align: left;
|
||||
@@ -602,7 +648,7 @@ const scrollToBottom = () => {
|
||||
.image-label {
|
||||
font-size: 12px;
|
||||
color: #666;
|
||||
background: rgba(255,255,255,0.8);
|
||||
background: rgba(255, 255, 255, 0.8);
|
||||
padding: 4px 8px;
|
||||
border-radius: 4px;
|
||||
position: absolute;
|
||||
@@ -711,28 +757,53 @@ const scrollToBottom = () => {
|
||||
}
|
||||
|
||||
@keyframes popIn {
|
||||
from { transform: scale(0); opacity: 0; }
|
||||
to { transform: scale(1); opacity: 1; }
|
||||
from {
|
||||
transform: scale(0);
|
||||
opacity: 0;
|
||||
}
|
||||
to {
|
||||
transform: scale(1);
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes slideUp {
|
||||
from { transform: translateY(20px); opacity: 0; }
|
||||
to { transform: translateY(0); opacity: 1; }
|
||||
from {
|
||||
transform: translateY(20px);
|
||||
opacity: 0;
|
||||
}
|
||||
to {
|
||||
transform: translateY(0);
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from { opacity: 0; }
|
||||
to { opacity: 1; }
|
||||
from {
|
||||
opacity: 0;
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes blink {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0; }
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1;
|
||||
}
|
||||
50% {
|
||||
opacity: 0;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
from { transform: rotate(0deg); }
|
||||
to { transform: rotate(360deg); }
|
||||
from {
|
||||
transform: rotate(0deg);
|
||||
}
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 600px) {
|
||||
@@ -755,4 +826,4 @@ const scrollToBottom = () => {
|
||||
white-space: nowrap;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</style>
|
||||
|
||||
Reference in New Issue
Block a user