feat: add AI and Backend evolution history with interactive demos, and refine Frontend evolution demo

2026-01-18 10:24:35 +08:00
parent 82be39a9ac
commit 26ed39e1eb
44 changed files with 9868 additions and 2633 deletions
@@ -1,57 +1,110 @@
 <template>
  <div class="attn-demo">
-    <div class="controls">
-      <span class="hint">🖱️ 把鼠标悬停在方块上，查看它的“注意力”分配</span>
+    <div class="header">
+      <div class="title">Self-Attention Mechanism</div>
+      <div class="subtitle">自注意力机制：全局信息交互</div>
    </div>

-    <div class="visual-area">
-      <div class="image-grid" @mouseleave="hoverIndex = -1">
+    <div class="visual-stage">
+      <!-- Grid Layout -->
+      <div class="grid-container" @mouseleave="hoverIndex = -1">
+        <!-- SVG Layer for Connection Lines -->
+        <svg class="connections-layer">
+          <defs>
+            <marker id="arrowhead" markerWidth="6" markerHeight="4" refX="18" refY="2" orient="auto">
+              <polygon points="0 0, 6 2, 0 4" fill="var(--vp-c-brand)" opacity="0.6"/>
+            </marker>
+          </defs>
+          <!-- Draw lines from hoverIndex to ALL other nodes -->
+          <g v-if="hoverIndex !== -1">
+            <line
+              v-for="(target, tIndex) in items"
+              :key="tIndex"
+              v-show="tIndex !== hoverIndex"
+              :x1="getCenter(hoverIndex).x"
+              :y1="getCenter(hoverIndex).y"
+              :x2="getCenter(tIndex).x"
+              :y2="getCenter(tIndex).y"
+              :stroke="getLineColor(hoverIndex, tIndex)"
+              :stroke-width="getLineWidth(hoverIndex, tIndex)"
+              stroke-linecap="round"
+              :opacity="getLineOpacity(hoverIndex, tIndex)"
+            />
+          </g>
+        </svg>
+
+        <!-- Cells -->
        <div
          v-for="(item, index) in items"
          :key="index"
          class="grid-cell"
-          :class="{ active: hoverIndex === index }"
+          :class="{ 
+            'is-source': hoverIndex === index,
+            'is-target': hoverIndex !== -1 && hoverIndex !== index,
+            'is-strong-attn': hoverIndex !== -1 && getAttentionScore(hoverIndex, index) > 0.5
+          }"
          @mouseenter="hoverIndex = index"
+          :style="{
+            left: getCenter(index).x - 30 + 'px',
+            top: getCenter(index).y - 30 + 'px'
+          }"
        >
-          {{ item.icon }}
-          <div class="cell-label">{{ item.label }}</div>
+          <div class="cell-content">
+            <span class="cell-icon">{{ item.icon }}</span>
+            <span class="cell-label">{{ item.label }}</span>
+          </div>
+          <!-- Attention Score Badge -->
+          <div 
+            class="attn-badge" 
+            v-if="hoverIndex !== -1 && hoverIndex !== index"
+            :style="{ opacity: Math.max(0.3, getAttentionScore(hoverIndex, index)) }"
+          >
+            {{ (getAttentionScore(hoverIndex, index) * 100).toFixed(0) }}%
+          </div>
        </div>
-
-        <!-- SVG Overlay for lines -->
-        <svg class="connections" v-if="hoverIndex !== -1">
-          <line
-            v-for="(target, tIndex) in items"
-            :key="tIndex"
-            v-if="tIndex !== hoverIndex"
-            :x1="getCenter(hoverIndex).x"
-            :y1="getCenter(hoverIndex).y"
-            :x2="getCenter(tIndex).x"
-            :y2="getCenter(tIndex).y"
-            :stroke="getAttentionColor(hoverIndex, tIndex)"
-            :stroke-width="getAttentionWidth(hoverIndex, tIndex)"
-            stroke-linecap="round"
-          />
-        </svg>
      </div>

-      <div class="info-panel" :class="{ visible: hoverIndex !== -1 }">
-        <div class="info-title">Patch: {{ items[hoverIndex]?.label }}</div>
-        <div class="info-desc">正在关注：</div>
-        <ul class="attn-list" v-if="hoverIndex !== -1">
-          <li
-            v-for="(weight, targetIdx) in getTopAttentions(hoverIndex)"
-            :key="targetIdx"
-          >
-            <span class="target-icon">{{ items[targetIdx].icon }}</span>
-            <span class="target-name">{{ items[targetIdx].label }}</span>
-            <div class="bar-bg">
-              <div
-                class="bar-fill"
-                :style="{ width: weight * 100 + '%' }"
-              ></div>
+      <!-- Info Panel -->
+      <div class="info-panel">
+        <div v-if="hoverIndex === -1" class="placeholder-text">
+          <span class="cursor-icon">👆</span>
+          把鼠标悬停在任意方块上，<br>观察它在"关注"谁
+        </div>
+        <div v-else class="active-info">
+          <div class="source-info">
+            <span class="label">当前 Patch:</span>
+            <div class="patch-tag">
+              {{ items[hoverIndex].icon }} {{ items[hoverIndex].label }}
            </div>
-          </li>
-        </ul>
+          </div>
+          
+          <div class="attn-list">
+            <div class="list-header">Attention Weights (注意力权重)</div>
+            <!--
+              getTopAttentions() returns an object keyed by patch index. Objects
+              enumerate integer-like keys in ascending index order, so the entries
+              are sorted by score here to list the strongest attention first.
+            -->
+            <div
+              class="attn-item"
+              v-for="[idx, score] in Object.entries(getTopAttentions(hoverIndex)).sort((a, b) => b[1] - a[1])"
+              :key="idx"
+            >
+              <div class="item-left">
+                <span class="item-icon">{{ items[idx].icon }}</span>
+                <span class="item-name">{{ items[idx].label }}</span>
+              </div>
+              <div class="item-right">
+                <div class="progress-bar">
+                  <div class="progress-fill" :style="{ width: score * 100 + '%' }"></div>
+                </div>
+                <span class="score-text">{{ (score * 100).toFixed(0) }}%</span>
+              </div>
+            </div>
+          </div>
+          
+          <div class="insight-box">
+            <span class="bulb">💡</span>
+            <span class="insight-text">
+              {{ getInsightText(hoverIndex) }}
+            </span>
+          </div>
+        </div>
      </div>
    </div>
  </div>
@@ -62,207 +115,309 @@ import { ref } from 'vue'

 const hoverIndex = ref(-1)

+// 3x3 Grid Data (Cat in grass)
+// One entry per grid cell, in row-major order (three cells per row).
+// The trailing number on each entry is its array index; the attention and
+// insight helpers in this file identify patches by that index.
 const items = [
-  { icon: '🌲', label: '背景' },
-  { icon: '🌲', label: '背景' },
-  { icon: '☁️', label: '天空' },
-  { icon: '👂', label: '猫耳' },
-  { icon: '😼', label: '猫脸' },
-  { icon: '🌲', label: '背景' },
-  { icon: '🐾', label: '猫爪' },
-  { icon: '🧶', label: '毛线' },
-  { icon: '🌱', label: '草地' }
+  { icon: '🌿', label: '草地' }, // 0
+  { icon: '🌿', label: '草地' }, // 1
+  { icon: '🦋', label: '蝴蝶' }, // 2
+  { icon: '🌿', label: '草地' }, // 3
+  { icon: '🐱', label: '猫头' }, // 4
+  { icon: '🌿', label: '草地' }, // 5
+  { icon: '🧶', label: '毛球' }, // 6
+  { icon: '🐾', label: '猫爪' }, // 7
+  { icon: '🌿', label: '草地' }  // 8
 ]

-// 3x3 Grid
+// Layout Logic
+/**
+ * Returns the centre point of a grid cell, in pixels measured from the
+ * top-left corner of the grid container.
+ *
+ * Cells are laid out row-major, three per row. The same coordinates are used
+ * for the SVG line endpoints and (minus half the cell size) for the absolutely
+ * positioned cell elements.
+ *
+ * @param {number} index - Cell index into `items`.
+ * @returns {{ x: number, y: number }} Centre coordinates of the cell.
+ */
 const getCenter = (index) => {
   const row = Math.floor(index / 3)
   const col = index % 3
-  // Assuming 80px cell + 10px gap
-  const cellSize = 80
-  const gap = 10
-  const offset = cellSize / 2
+  // Distance between neighbouring cell centres (a pitch, not the empty space
+  // between cells).
+  const gap = 100
+  // Centre of the first cell; three 100px steps fill the 300px container.
+  const offsetX = 50
+  const offsetY = 50
   return {
-    x: col * (cellSize + gap) + offset,
-    y: row * (cellSize + gap) + offset
+    x: col * gap + offsetX,
+    y: row * gap + offsetY
   }
 }

-// Mock attention weights
-const getAttentionWeight = (source, target) => {
-  // Self attention is ignored for visualization clarity usually, but let's say:
+// Attention Logic
+/**
+ * Hand-authored (mock) attention score from one patch to another.
+ *
+ * The values are illustrative constants, not the output of a model. Callers
+ * in this file treat a score above 0.5 as "strong" attention.
+ *
+ * @param {number} source - Index in `items` of the patch that is attending.
+ * @param {number} target - Index in `items` of the patch being attended to.
+ * @returns {number} Score in the range 0..1; 0 when source and target are the same patch.
+ */
+const getAttentionScore = (source, target) => {
+  // A patch's attention to itself is not visualised.
+  if (source === target) return 0
+
+  // Cat Head (4) attends strongly to:
+  if (source === 4) {
+    if (target === 7) return 0.95 // Paws (Body parts connected)
+    if (target === 2) return 0.8  // Butterfly (Interest)
+    if (target === 6) return 0.6  // Yarn (Toy)
+    return 0.1 // Background
+  }

-  // Cat parts (3, 4, 6) attend strongly to each other
-  const catParts = [3, 4, 6]
-  const isSourceCat = catParts.includes(source)
-  const isTargetCat = catParts.includes(target)
+  // Cat Paws (7) attends strongly to:
+  if (source === 7) {
+    if (target === 4) return 0.95 // Head
+    if (target === 6) return 0.9  // Yarn (Touching)
+    return 0.1
+  }

-  if (isSourceCat && isTargetCat) return 0.9 // Strong connection between cat parts
+  // Butterfly (2)
+  if (source === 2) {
+    if (target === 4) return 0.7 // Danger?
+    return 0.2
+  }
+
+  // Yarn (6) attends strongly to the cat parts playing with it. The values
+  // mirror the scores those parts give the yarn, so hovering the yarn shows
+  // the paw interaction that its insight caption describes.
+  if (source === 6) {
+    if (target === 7) return 0.9 // Paws (Touching)
+    if (target === 4) return 0.6 // Head
+    return 0.1
+  }

-  // Cat interacts with Yarn (7)
-  if (isSourceCat && target === 7) return 0.7
-  if (source === 7 && isTargetCat) return 0.7
+  // Grass (Background)
+  // Background patches attend to each other for texture consistency
+  const bgIndices = [0, 1, 3, 5, 8]
+  if (bgIndices.includes(source)) {
+    if (bgIndices.includes(target)) return 0.6
+    return 0.05
+  }

-  // Background parts attend to each other
-  const bgParts = [0, 1, 2, 5, 8]
-  if (bgParts.includes(source) && bgParts.includes(target)) return 0.5
-
-  return 0.1 // Weak attention otherwise
+  // Default fallback for any source index not handled above
+  return 0.1
 }

-const getAttentionColor = (source, target) => {
-  const weight = getAttentionWeight(source, target)
-  // Green for strong, gray for weak
-  if (weight > 0.6) return `rgba(16, 185, 129, ${weight})`
-  return `rgba(156, 163, 175, ${weight * 0.5})`
+// Stroke colour of the connection line from `source` to `target`: the brand
+// colour when the attention score is above 0.5, a muted text colour otherwise.
+const getLineColor = (source, target) => {
+  if (getAttentionScore(source, target) > 0.5) return 'var(--vp-c-brand)'
+  return 'var(--vp-c-text-3)'
 }

-const getAttentionWidth = (source, target) => {
-  const weight = getAttentionWeight(source, target)
-  return weight * 5
+// Stroke width of the connection line from `source` to `target`: a base
+// width of 1 plus up to 4 more, scaled by the attention score.
+const getLineWidth = (source, target) => 1 + getAttentionScore(source, target) * 4
+
+// Opacity of the connection line from `source` to `target`: never below 0.2,
+// rising linearly with the attention score.
+const getLineOpacity = (source, target) => {
+  const attention = getAttentionScore(source, target)
+  return 0.2 + attention * 0.8
 }

+/**
+ * Collects the three highest attention scores from `source` to the other
+ * patches.
+ *
+ * @param {number} source - Index in `items` of the patch whose attention is listed.
+ * @returns {Object<string, number>} Target index -> score, with at most three entries.
+ *
+ * NOTE(review): the keys are integer-like strings, which JavaScript always
+ * enumerates in ascending numeric order regardless of insertion order. The
+ * sort below therefore decides WHICH three targets are kept, not the order in
+ * which they are iterated; a consumer that needs highest-first order has to
+ * sort the entries itself.
+ */
 const getTopAttentions = (source) => {
-  const weights = {}
+  const scores = {}
   items.forEach((_, idx) => {
     if (idx !== source) {
-      weights[idx] = getAttentionWeight(source, idx)
+      scores[idx] = getAttentionScore(source, idx)
     }
   })
-  // Sort by weight desc
-  return weights
+  // Rank target indices by score, highest first, and keep the first three.
+  const sortedKeys = Object.keys(scores).sort((a, b) => scores[b] - scores[a])
+  const top3 = {}
+  sortedKeys.slice(0, 3).forEach(key => {
+    top3[key] = scores[key]
+  })
+  return top3
+}
+
+// Picks the explanatory caption shown under the attention list for the
+// patch at index `idx`; unknown indices get a generic caption.
+const getInsightText = (idx) => {
+  switch (idx) {
+    case 4:
+      return "猫头最关注猫爪（组成身体）和蝴蝶（捕猎目标）。"
+    case 7:
+      return "猫爪最关注毛球（正在玩耍）和猫头。"
+    case 2:
+      return "蝴蝶关注到了猫，可能是因为它是个威胁。"
+    // All grass patches share one caption.
+    case 0:
+    case 1:
+    case 3:
+    case 5:
+    case 8:
+      return "草地主要关注周围的草地，确认背景纹理。"
+    case 6:
+      return "毛球和猫爪有很强的互动关系。"
+    default:
+      return "Self-Attention 让每个部分找到它的上下文关联。"
+  }
 }
 </script>

 <style scoped>
 .attn-demo {
-  padding: 20px;
  background: var(--vp-c-bg-soft);
-  border-radius: 8px;
+  border: 1px solid var(--vp-c-divider);
+  border-radius: 12px;
+  padding: 24px;
  margin: 20px 0;
  user-select: none;
+  font-family: 'Menlo', 'Monaco', sans-serif;
 }

-.controls {
+.header {
  text-align: center;
-  margin-bottom: 20px;
+  margin-bottom: 30px;
 }

-.hint {
-  font-size: 0.9em;
-  color: var(--vp-c-text-2);
-  background: var(--vp-c-bg);
-  padding: 4px 12px;
-  border-radius: 12px;
-  border: 1px solid var(--vp-c-divider);
+.title {
+  font-size: 16px;
+  font-weight: bold;
+  color: var(--vp-c-text-1);
 }

-.visual-area {
-  display: flex;
-  justify-content: center;
-  gap: 40px;
-  flex-wrap: wrap;
-}
-
-.image-grid {
-  display: grid;
-  grid-template-columns: repeat(3, 80px);
-  gap: 10px;
-  position: relative;
-}
-
-.grid-cell {
-  width: 80px;
-  height: 80px;
-  background: var(--vp-c-bg);
-  border: 2px solid var(--vp-c-divider);
-  border-radius: 8px;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  cursor: pointer;
-  transition: all 0.2s;
-  z-index: 2;
-  position: relative;
-}
-
-.grid-cell:hover,
-.grid-cell.active {
-  border-color: var(--vp-c-brand);
-  transform: scale(1.05);
-  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
-  background: var(--vp-c-bg-mute);
-}
-
-.cell-label {
-  font-size: 0.8em;
+.subtitle {
+  font-size: 12px;
  color: var(--vp-c-text-2);
  margin-top: 4px;
 }

-.connections {
+.visual-stage {
+  display: flex;
+  gap: 40px;
+  justify-content: center;
+  align-items: flex-start;
+  flex-wrap: wrap;
+}
+
+/* Grid Area */
+.grid-container {
+  width: 300px;
+  height: 300px;
+  position: relative;
+  /* background: rgba(0,0,0,0.02); */
+  border-radius: 12px;
+}
+
+.connections-layer {
  position: absolute;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
-  pointer-events: none;
  z-index: 1;
+  pointer-events: none;
 }

+.grid-cell {
+  position: absolute;
+  width: 60px;
+  height: 60px;
+  background: var(--vp-c-bg);
+  border: 2px solid var(--vp-c-divider);
+  border-radius: 12px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  cursor: pointer;
+  z-index: 2;
+  transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
+  box-shadow: 0 4px 6px rgba(0,0,0,0.05);
+}
+
+.cell-content {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+}
+
+.cell-icon {
+  font-size: 24px;
+  line-height: 1.2;
+}
+
+.cell-label {
+  font-size: 10px;
+  color: var(--vp-c-text-2);
+  font-weight: bold;
+}
+
+/* Interaction States */
+.grid-cell:hover, .grid-cell.is-source {
+  z-index: 10;
+  border-color: var(--vp-c-brand);
+  background: var(--vp-c-bg);
+  transform: scale(1.15);
+  box-shadow: 0 8px 20px rgba(0,0,0,0.15);
+}
+
+.grid-cell.is-strong-attn {
+  border-color: var(--vp-c-brand-light);
+  background: var(--vp-c-brand-dimm);
+}
+
+.attn-badge {
+  position: absolute;
+  top: -8px;
+  right: -8px;
+  background: var(--vp-c-brand);
+  color: white;
+  font-size: 9px;
+  padding: 2px 6px;
+  border-radius: 10px;
+  font-weight: bold;
+  box-shadow: 0 2px 4px rgba(0,0,0,0.2);
+}
+
+/* Info Panel */
 .info-panel {
-  width: 200px;
+  width: 280px;
+  min-height: 260px;
  background: var(--vp-c-bg);
  border: 1px solid var(--vp-c-divider);
-  border-radius: 8px;
-  padding: 15px;
-  opacity: 0;
-  transition: opacity 0.2s;
-  pointer-events: none;
+  border-radius: 12px;
+  padding: 20px;
+  display: flex;
+  flex-direction: column;
+  justify-content: center;
 }

-.info-panel.visible {
-  opacity: 1;
-  pointer-events: auto;
+.placeholder-text {
+  text-align: center;
+  color: var(--vp-c-text-3);
+  font-size: 13px;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 10px;
 }

-.info-title {
-  font-weight: bold;
-  margin-bottom: 5px;
-  color: var(--vp-c-brand);
+.cursor-icon {
+  font-size: 32px;
+  animation: bounce 2s infinite;
 }

-.info-desc {
-  font-size: 0.85em;
+.source-info {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  margin-bottom: 20px;
+  padding-bottom: 15px;
+  border-bottom: 1px dashed var(--vp-c-divider);
+}
+
+.label {
+  font-size: 12px;
  color: var(--vp-c-text-2);
+}
+
+.patch-tag {
+  background: var(--vp-c-brand-dimm);
+  color: var(--vp-c-brand-dark);
+  padding: 4px 12px;
+  border-radius: 6px;
+  font-size: 13px;
+  font-weight: bold;
+}
+
+.list-header {
+  font-size: 11px;
+  color: var(--vp-c-text-3);
+  text-transform: uppercase;
  margin-bottom: 10px;
+  letter-spacing: 0.5px;
 }

-.attn-list {
-  list-style: none;
-  padding: 0;
-  margin: 0;
+.attn-item {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  margin-bottom: 12px;
 }

-.attn-list li {
+.item-left {
  display: flex;
  align-items: center;
  gap: 8px;
-  margin-bottom: 6px;
-  font-size: 0.85em;
+  width: 80px;
 }

-.target-icon {
-  width: 20px;
-  text-align: center;
+.item-icon { font-size: 16px; }
+.item-name { font-size: 12px; font-weight: 500; }
+
+.item-right {
+  flex: 1;
+  display: flex;
+  align-items: center;
+  gap: 10px;
 }

-.target-name {
-  width: 40px;
-}
-
-.bar-bg {
+.progress-bar {
  flex: 1;
  height: 6px;
  background: var(--vp-c-bg-soft);
@@ -270,9 +425,50 @@ const getTopAttentions = (source) => {
  overflow: hidden;
 }

-.bar-fill {
+.progress-fill {
  height: 100%;
  background: var(--vp-c-brand);
  border-radius: 3px;
 }
+
+.score-text {
+  font-size: 11px;
+  color: var(--vp-c-text-2);
+  width: 30px;
+  text-align: right;
+  font-family: monospace;
+}
+
+.insight-box {
+  margin-top: 15px;
+  background: var(--vp-c-yellow-dimm);
+  padding: 10px;
+  border-radius: 8px;
+  display: flex;
+  gap: 8px;
+  align-items: flex-start;
+}
+
+.bulb { font-size: 16px; }
+.insight-text {
+  font-size: 12px;
+  color: var(--vp-c-text-1);
+  line-height: 1.4;
+}
+
+@keyframes bounce {
+  0%, 100% { transform: translateY(0); }
+  50% { transform: translateY(-5px); }
+}
+
+@media (max-width: 768px) {
+  .visual-stage {
+    flex-direction: column;
+    align-items: center;
+  }
+  .info-panel {
+    width: 100%;
+    min-height: auto;
+  }
+}
 </style>
@@ -3,16 +3,16 @@
    <div class="demo-container">
      <!-- Step 1: Patch -->
      <div class="step-box">
-        <div class="label">1. Patch (4x4)</div>
+        <div class="label">1. Patch (16×16×3) (示意 / Toy)</div>
        <div class="grid-patch">
          <div
-            v-for="n in 16"
+            v-for="n in patchCellCount"
            :key="n"
            class="pixel"
            :style="{ backgroundColor: getPixelColor(n) }"
          ></div>
        </div>
-        <div class="desc">768 像素点</div>
+        <div class="desc">16×16 像素 × 3 通道 = 768 标量值</div>
      </div>

      <div class="arrow">➜</div>
@@ -22,13 +22,14 @@
        <div class="label">2. Flatten</div>
        <div class="vector-container">
          <div
-            v-for="n in 16"
+            v-for="n in flattenSampleCount"
            :key="n"
            class="vector-cell"
            :style="{ backgroundColor: getPixelColor(n) }"
          ></div>
+          <div class="vector-ellipsis">…</div>
        </div>
-        <div class="desc">拉平成向量</div>
+        <div class="desc">得到 1×768 向量 (Vector)</div>
      </div>

      <div class="arrow">× W</div>
@@ -39,13 +40,16 @@
        <div class="embedding-container">
          <div v-for="n in 8" :key="n" class="embed-cell"></div>
        </div>
-        <div class="desc">压缩特征 (D=8)</div>
+        <div class="desc">映射到 D 维 (示意 D=8；常见 D=768)</div>
      </div>
    </div>
  </div>
 </template>

 <script setup>
+// Number of cells rendered in the Step 1 patch grid: one per pixel of a
+// 16x16 patch (the grid is styled with 16 columns).
+const patchCellCount = 16 * 16
+// Number of cells rendered in the Step 2 vector column. The template follows
+// them with an ellipsis, so this is a sample rather than the full vector.
+const flattenSampleCount = 32
+
 const getPixelColor = (n) => {
  // Generate a gradient of colors
  const hue = (n * 20) % 360
@@ -89,8 +93,8 @@ const getPixelColor = (n) => {

 .grid-patch {
  display: grid;
-  grid-template-columns: repeat(4, 1fr);
-  gap: 2px;
+  grid-template-columns: repeat(16, 1fr);
+  gap: 1px;
  width: 80px;
  height: 80px;
 }
@@ -105,7 +109,7 @@ const getPixelColor = (n) => {
  display: flex;
  flex-direction: column;
  gap: 1px;
-  height: 120px;
+  height: 140px;
  width: 20px;
  justify-content: center;
 }
@@ -115,6 +119,14 @@ const getPixelColor = (n) => {
  flex: 1;
 }

+.vector-ellipsis {
+  font-size: 12px;
+  line-height: 1;
+  color: var(--vp-c-text-3);
+  text-align: center;
+  padding-top: 4px;
+}
+
 .embedding-container {
  display: flex;
  flex-direction: column;
@@ -8,113 +8,135 @@
          </div>
        </div>
        <div class="toggle-label">
-          <span :class="{ active: !isVLM }">Pure LLM</span>
+          <span :class="{ active: !isVLM }">Pure LLM (纯文本)</span>
          <span class="arrow">→</span>
-          <span :class="{ active: isVLM }">Multimodal VLM</span>
+          <span :class="{ active: isVLM }">Multimodal VLM (多模态)</span>
        </div>
      </div>
      <div class="status-desc">
        {{
          isVLM
-            ? '给大脑装上眼睛：视觉信号经过翻译，变成 Token 混入文字流。'
-            : '纯文本大脑：只能听懂 Token 语言，无法感知图像。'
+            ? 'Tokens from vision are translated and placed before text tokens. (视觉信息被翻译成 Token，放在文字 Token 之前。)'
+            : 'Text-only tokens flow into the LLM. (只有文字 Token 流入大模型。)'
        }}
      </div>
    </div>

-    <div class="diagram-stage" :class="{ 'vlm-mode': isVLM }">
-      <!-- Vision Pipeline (Only visible in VLM mode) -->
-      <div class="pipeline vision-pipeline">
-        <div class="node-group">
-          <div class="node input-node image-node">
-            <span class="icon">�️</span>
-            <span class="label">Image</span>
-          </div>
-          <div class="flow-arrow">⬇</div>
-          <div
-            class="node process-node vit-node"
-            title="Vision Transformer: The Eye"
-          >
-            <span class="icon">�️</span>
-            <span class="label">ViT</span>
-          </div>
-          <div class="flow-arrow">⬇</div>
-          <div
-            class="node adapter-node projector-node"
-            title="Projector: The Translator"
-          >
-            <span class="icon">🔌</span>
-            <span class="label">Projector</span>
-          </div>
-          <div class="flow-arrow connector-arrow">⤵</div>
-        </div>
-      </div>
-
-      <!-- Text Pipeline (Always visible) -->
-      <div class="pipeline text-pipeline">
-        <div class="node-group horizontal">
-          <div class="node input-node text-node">
-            <span class="icon">�</span>
-            <span class="label">Prompt</span>
-          </div>
-          <div class="flow-arrow">➜</div>
-          <div class="node process-node embed-node">
-            <span class="icon">�</span>
-            <span class="label">Embed</span>
-          </div>
-
-          <!-- Merge Point Visualization -->
-          <div class="merge-point" :class="{ active: isVLM }">
-            <div class="plus-icon">+</div>
-            <div class="merge-label">Concat</div>
-          </div>
-
-          <div class="flow-arrow">➜</div>
-          <div class="node core-node llm-node">
-            <span class="icon">🧠</span>
-            <span class="label">LLM Backbone</span>
-            <div class="inner-flow">
-              <span class="dot t1"></span>
-              <span class="dot t2"></span>
-              <span class="dot v1" v-if="isVLM"></span>
+    <div class="diagram-stage">
+      <div class="lanes">
+        <div class="lane lane-vision" v-show="isVLM">
+          <div class="lane-title">Vision Path (视觉路径)</div>
+          <div class="lane-flow">
+            <div class="node input-node">
+              <span class="icon">🖼️</span>
+              <span class="label">Image (图片)</span>
+            </div>
+            <span class="mini-arrow">→</span>
+            <div class="node process-node vit-node">
+              <span class="icon">👁️</span>
+              <span class="label">ViT (视觉模型)</span>
+            </div>
+            <span class="mini-arrow">→</span>
+            <div class="node adapter-node">
+              <span class="icon">🔌</span>
+              <span class="label">Projector (投影器)</span>
+            </div>
+            <span class="mini-arrow">→</span>
+            <div class="token-box token-box-vision">
+              <div class="token-box-title">Vision Tokens (视觉 Token)</div>
+              <div class="tokens">
+                <span class="token vision">v1</span>
+                <span class="token vision">v2</span>
+                <span class="token vision">v3</span>
+                <span class="token vision">…</span>
+              </div>
            </div>
          </div>
-          <div class="flow-arrow">➜</div>
-          <div class="node output-node">
-            <span class="icon">💬</span>
-            <span class="label">Response</span>
+        </div>
+
+        <div class="lane lane-text">
+          <div class="lane-title">Text Path (文字路径)</div>
+          <div class="lane-flow">
+            <div class="node input-node">
+              <span class="icon">⌨️</span>
+              <span class="label">Prompt (提示词)</span>
+            </div>
+            <span class="mini-arrow">→</span>
+            <div class="node process-node">
+              <span class="icon">🔤</span>
+              <span class="label">Embed (向量化)</span>
+            </div>
+            <span class="mini-arrow">→</span>
+            <div class="token-box">
+              <div class="token-box-title">Text Tokens (文字 Token)</div>
+              <div class="tokens">
+                <span class="token text">t1</span>
+                <span class="token text">t2</span>
+                <span class="token text">t3</span>
+                <span class="token text">…</span>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <div class="merge-stage">
+          <div class="merge-title">Token Sequence (输入序列)</div>
+          <div class="sequence">
+            <div v-if="isVLM" class="sequence-row">
+              <span class="sequence-tag vision">Vision (视觉)</span>
+              <div class="tokens">
+                <span class="token vision">v1</span>
+                <span class="token vision">v2</span>
+                <span class="token vision">v3</span>
+                <span class="token vision">…</span>
+              </div>
+            </div>
+            <div class="sequence-row">
+              <span class="sequence-tag text">Text (文字)</span>
+              <div class="tokens">
+                <span class="token text">t1</span>
+                <span class="token text">t2</span>
+                <span class="token text">t3</span>
+                <span class="token text">…</span>
+              </div>
+            </div>
+            <div class="sequence-hint">
+              <span v-if="isVLM">Concat: [Vision Tokens] + [Text Tokens] (拼接：视觉在前，文字在后)</span>
+              <span v-else>Only [Text Tokens] (只有文字 Token)</span>
+            </div>
+          </div>
+
+          <div class="core-stage">
+            <span class="big-arrow">→</span>
+            <div class="node core-node">
+              <span class="icon">🧠</span>
+              <span class="label">LLM Backbone (大模型)</span>
+            </div>
+            <span class="big-arrow">→</span>
+            <div class="node output-node">
+              <span class="icon">💬</span>
+              <span class="label">Response (回复)</span>
+            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="interactive-info">
-      <div class="info-card" v-if="!isVLM">
-        <h3>Standard LLM Flow</h3>
-        <p>
-          Text is converted into vectors (Embeddings) and processed by the
-          Transformer to predict the next word.
-        </p>
-      </div>
-      <div class="info-card vlm-info" v-else>
-        <h3>VLM = LLM + Vision Encoder</h3>
-        <ul>
-          <li>
-            <strong>ViT (The Eye):</strong> Slices image into patches and
-            extracts features.
-          </li>
-          <li>
-            <strong>Projector (The Translator):</strong> Converts visual
-            features into the same "language" (vector dimension) as text
-            embeddings.
-          </li>
-          <li>
-            <strong>Concatenation:</strong> The translated visual tokens are
-            pasted <em>before</em> the text tokens. The LLM sees them as
-            "foreign words" it learned to understand.
-          </li>
-        </ul>
-      </div>
+      <transition name="fade" mode="out-in">
+        <div class="info-card" v-if="!isVLM" key="llm">
+          <h3>Standard LLM Flow (标准大模型流程)</h3>
+          <p>Prompt → Embedding → Token Sequence → LLM → Response。</p>
+        </div>
+        <div class="info-card vlm-info" v-else key="vlm">
+          <h3>VLM = LLM + Vision Encoder (视觉大模型原理)</h3>
+          <ul>
+            <li><strong>ViT (The Eye):</strong> 把图片编码成视觉特征。</li>
+            <li><strong>Projector (The Translator):</strong> 把视觉特征映射到 LLM 的 Token 空间。</li>
+            <li><strong>Concatenation (拼接):</strong> 把视觉 Token 放在文字 Token 之前，作为同一条输入序列。</li>
+          </ul>
+        </div>
+      </transition>
    </div>
  </div>
 </template>
@@ -140,12 +162,11 @@ const toggleMode = () => {
  user-select: none;
 }

-/* Controls */
 .controls-header {
  display: flex;
  flex-direction: column;
  align-items: center;
-  margin-bottom: 30px;
+  margin-bottom: 18px;
  gap: 12px;
 }

@@ -216,105 +237,160 @@ const toggleMode = () => {
  font-size: 13px;
  color: var(--vp-c-text-2);
  text-align: center;
-  height: 20px;
+  line-height: 1.5;
+  max-width: 720px;
 }

-/* Diagram Stage */
 .diagram-stage {
-  position: relative;
-  height: 240px;
  background: var(--vp-c-bg);
  border: 1px dashed var(--vp-c-divider);
  border-radius: 8px;
-  overflow: hidden;
+  padding: 18px;
+}
+
+.lanes {
  display: flex;
-  justify-content: center;
-  align-items: center;
-}
-
-/* Pipelines */
-.pipeline {
-  transition: all 0.5s cubic-bezier(0.34, 1.56, 0.64, 1);
-}
-
-.text-pipeline {
-  position: absolute;
-  bottom: 80px; /* Centered vertically in LLM mode */
-  left: 50%;
-  transform: translateX(-50%);
-  width: 100%;
-  display: flex;
-  justify-content: center;
-}
-
-.vlm-mode .text-pipeline {
-  bottom: 40px; /* Move down in VLM mode */
-}
-
-.vision-pipeline {
-  position: absolute;
-  top: 20px;
-  left: 20%; /* Align with input side */
-  opacity: 0;
-  transform: translateY(-20px);
-  pointer-events: none;
-}
-
-.vlm-mode .vision-pipeline {
-  opacity: 1;
-  transform: translateY(0);
-  pointer-events: auto;
-}
-
-.node-group {
-  display: flex;
-  align-items: center;
-  gap: 6px;
-}
-
-.node-group.horizontal {
-  flex-direction: row;
-}
-
-.vision-pipeline .node-group {
  flex-direction: column;
+  gap: 14px;
+}
+
+.lane {
+  background: var(--vp-c-bg-mute);
+  border: 1px solid var(--vp-c-divider);
+  border-radius: 10px;
+  padding: 12px;
+}
+
+.lane-title {
+  font-size: 12px;
+  color: var(--vp-c-text-2);
+  margin-bottom: 10px;
+  font-weight: 700;
+}
+
+.lane-flow {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  flex-wrap: wrap;
+}
+
+.merge-stage {
+  background: var(--vp-c-bg);
+  border: 1px solid var(--vp-c-divider);
+  border-radius: 10px;
+  padding: 12px;
+}
+
+.merge-title {
+  font-size: 12px;
+  color: var(--vp-c-text-2);
+  margin-bottom: 10px;
+  font-weight: 700;
+}
+
+.sequence {
+  border: 1px solid var(--vp-c-divider);
+  background: var(--vp-c-bg-soft);
+  border-radius: 10px;
+  padding: 10px;
+}
+
+.sequence-row {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  margin-bottom: 8px;
+  flex-wrap: wrap;
+}
+
+.sequence-row:last-child {
+  margin-bottom: 0;
+}
+
+.sequence-tag {
+  font-size: 11px;
+  font-weight: 800;
+  padding: 2px 8px;
+  border-radius: 999px;
+  border: 1px solid var(--vp-c-divider);
+  background: var(--vp-c-bg);
+  color: var(--vp-c-text-2);
+}
+
+.sequence-tag.vision {
+  border-color: var(--vp-c-yellow);
+}
+
+.sequence-tag.text {
+  border-color: var(--vp-c-brand);
+}
+
+.sequence-hint {
+  margin-top: 8px;
+  font-size: 11px;
+  color: var(--vp-c-text-2);
+}
+
+.core-stage {
+  margin-top: 14px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  gap: 10px;
+  flex-wrap: wrap;
+}
+
+.big-arrow {
+  font-size: 18px;
+  color: var(--vp-c-text-2);
+  font-weight: 800;
+}
+
+.mini-arrow {
+  font-size: 14px;
+  color: var(--vp-c-text-3);
+  font-weight: 800;
 }

-/* Nodes */
 .node {
  background: var(--vp-c-bg);
  border: 2px solid var(--vp-c-divider);
-  border-radius: 8px;
+  border-radius: 10px;
  padding: 8px 12px;
  display: flex;
  flex-direction: column;
  align-items: center;
-  min-width: 70px;
+  min-width: 110px;
  box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
-  position: relative;
-  z-index: 2;
 }

 .icon {
  font-size: 20px;
  margin-bottom: 4px;
 }
+
 .label {
  font-size: 11px;
-  font-weight: bold;
+  font-weight: 800;
+  text-align: center;
+  line-height: 1.2;
 }

 .input-node {
  border-color: #aaa;
 }
+
 .process-node {
  border-color: var(--vp-c-brand-dimm);
 }
+
 .core-node {
  border-color: var(--vp-c-brand);
  background: var(--vp-c-brand-dimm);
-  min-width: 100px;
+  min-width: 140px;
 }
+
 .output-node {
  border-color: var(--vp-c-brand);
 }
@@ -323,101 +399,64 @@ const toggleMode = () => {
  border-color: var(--vp-c-yellow);
  background: rgba(255, 197, 23, 0.05);
 }
-.projector-node {
+
+.adapter-node {
  border-color: var(--vp-c-yellow);
  background: var(--vp-c-yellow-dimm);
 }

-/* Arrows */
-.flow-arrow {
-  color: var(--vp-c-text-3);
-  font-size: 16px;
+.token-box {
+  background: var(--vp-c-bg);
+  border: 1px solid var(--vp-c-divider);
+  border-radius: 10px;
+  padding: 10px;
+  min-width: 220px;
 }

-.connector-arrow {
-  font-size: 24px;
-  color: var(--vp-c-yellow);
-  margin-top: -10px;
-  margin-bottom: -10px;
-  transform: rotate(-45deg) translateX(10px);
+.token-box-vision {
+  border-color: var(--vp-c-yellow);
 }

-/* Merge Point */
-.merge-point {
-  width: 0;
-  overflow: hidden;
-  transition: all 0.5s;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  opacity: 0;
-}
-
-.merge-point.active {
-  width: 40px;
-  opacity: 1;
-}
-
-.plus-icon {
-  font-weight: bold;
+.token-box-title {
+  font-size: 11px;
+  font-weight: 800;
  color: var(--vp-c-text-2);
-  font-size: 18px;
+  margin-bottom: 8px;
 }

-.merge-label {
-  font-size: 9px;
-  color: var(--vp-c-text-3);
-}
-
-/* Inner Flow Animation inside LLM */
-.inner-flow {
+.tokens {
  display: flex;
-  gap: 4px;
-  margin-top: 4px;
-  height: 6px;
+  gap: 6px;
+  flex-wrap: wrap;
 }

-.dot {
-  width: 6px;
-  height: 6px;
-  border-radius: 50%;
-  background: #fff;
-  opacity: 0.6;
-  animation: pulse 1s infinite alternate;
+.token {
+  font-size: 11px;
+  padding: 2px 8px;
+  border-radius: 999px;
+  border: 1px solid var(--vp-c-divider);
+  background: var(--vp-c-bg-soft);
+  color: var(--vp-c-text-1);
 }

-.t1 {
-  animation-delay: 0s;
-}
-.t2 {
-  animation-delay: 0.2s;
-}
-.v1 {
-  background: var(--vp-c-yellow);
-  animation-delay: 0.4s;
+.token.vision {
+  border-color: var(--vp-c-yellow);
+  background: rgba(255, 197, 23, 0.12);
 }

-@keyframes pulse {
-  from {
-    opacity: 0.3;
-    transform: scale(0.8);
-  }
-  to {
-    opacity: 1;
-    transform: scale(1.1);
-  }
+.token.text {
+  border-color: var(--vp-c-brand);
+  background: rgba(59, 130, 246, 0.12);
 }

-/* Interactive Info */
 .interactive-info {
-  margin-top: 20px;
+  margin-top: 16px;
 }

 .info-card {
  background: var(--vp-c-bg-mute);
  padding: 16px;
  border-radius: 8px;
-  animation: fadeIn 0.3s;
 }

 .info-card h3 {
@@ -439,31 +478,25 @@ const toggleMode = () => {
  margin: 0;
 }

-@keyframes fadeIn {
-  from {
-    opacity: 0;
-    transform: translateY(5px);
-  }
-  to {
-    opacity: 1;
-    transform: translateY(0);
-  }
+.fade-enter-active,
+.fade-leave-active {
+  transition: opacity 0.3s ease;
 }

-/* Mobile Adjustments */
-@media (max-width: 600px) {
+.fade-enter-from,
+.fade-leave-to {
+  opacity: 0;
+}
+
+@media (max-width: 720px) {
  .diagram-stage {
-    height: 300px;
+    padding: 14px;
  }
-
-  .text-pipeline {
-    flex-wrap: wrap;
-    gap: 10px;
-    width: 90%;
+  .node {
+    min-width: 100px;
  }
-
-  .vision-pipeline {
-    left: 10%;
+  .token-box {
+    min-width: 200px;
  }
 }
 </style>
@@ -6,67 +6,137 @@
  <div class="patchify-demo">
    <div class="control-panel">
      <div class="controls">
-        <button class="action-btn" @click="toggleState">
-          {{ isPatchified ? '还原图片 (Restore)' : '切分图片 (Patchify)' }}
+        <button 
+          class="action-btn" 
+          @click="prevStep" 
+          :disabled="currentStep === 0"
+        >
+          ⬅ 上一步 (Prev)
        </button>
-        <div class="info">
-          <span>Resolution: 224x224</span>
-          <span>Patch Size: 16x16</span>
-          <span>Total Patches: {{ 14 * 14 }}</span>
-        </div>
+        <span class="step-indicator">Step {{ currentStep + 1 }} / 4</span>
+        <button 
+          class="action-btn primary" 
+          @click="nextStep"
+          :disabled="currentStep === 3"
+        >
+          {{ currentStep === 3 ? '完成 (Done)' : '下一步 (Next) ➡' }}
+        </button>
+      </div>
+      <div class="step-desc">
+        {{ stepDescriptions[currentStep] }}
      </div>
    </div>

    <div class="visual-area">
      <!-- 原始/切分视图容器 -->
-      <div class="image-container" :class="{ 'is-patchified': isPatchified }">
+      <!-- 
+        Step 0: Show container background, cells hidden
+        Step 1: Show container background, grid overlay visible (cells with border)
+        Step 2+: Container background hidden, cells visible with individual backgrounds
+      -->
+      <div 
+        class="image-container" 
+        :class="{ 
+          'is-pixelated': currentStep >= 1,
+          'is-patchified': currentStep >= 2 
+        }"
+      >
+        <div class="grid-overlay" v-if="currentStep === 1"></div>
        <div
          v-for="n in 196"
          :key="n"
          class="patch"
-          :style="{
-            '--delay': `${n * 0.005}s`,
-            '--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}`
-          }"
+          :style="getPatchStyle(n)"
        >
-          <span class="patch-id" v-if="isPatchified">{{ n }}</span>
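+          <!-- Step 1 only: show a placeholder digit per cell to suggest 'digitization'. Math.random() is re-evaluated on every render, so the digits change whenever this step is rendered again. -->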
+          <span class="pixel-val" v-if="currentStep === 1">{{ Math.floor(Math.random() * 9) }}</span>
+          <!-- Show ID in Patchified stage -->
+          <span class="patch-id" v-if="currentStep >= 2">{{ n }}</span>
        </div>
      </div>

-      <div class="arrow" v-if="isPatchified">⬇</div>
+      <div class="arrow-down" v-if="currentStep >= 3">⬇</div>

      <!-- 线性序列视图 -->
-      <div class="sequence-container" v-if="isPatchified">
-        <div class="sequence-label">Flattened Sequence (Token Input)</div>
+      <div class="sequence-container" v-if="currentStep >= 3">
+        <div class="sequence-label">Token Sequence: 196×D (每个 Token 是 D 维向量)</div>
        <div class="token-stream">
          <div
            v-for="n in 196"
            :key="n"
            class="mini-patch"
-            :style="{ '--hue': `${(n % 14) * 20 + Math.floor(n / 14) * 20}` }"
+            :style="getMiniPatchStyle(n)"
          ></div>
        </div>
      </div>
    </div>
-
-    <div class="explanation">
-      <p>
-        <span class="icon">💡</span>
-        计算机将图片切成 <strong>14x14 = 196</strong> 个小方块（Patch）。
-        然后把这些方块“拉直”成一长串序列，就像把一段话里的单词排成一排一样。
-        这就是 <strong>Visual Tokenization</strong>。
-      </p>
-    </div>
  </div>
 </template>

 <script setup>
-import { ref } from 'vue'
+import { ref, computed } from 'vue'

-const isPatchified = ref(false)
+const currentStep = ref(0)

-const toggleState = () => {
-  isPatchified.value = !isPatchified.value
+const stepDescriptions = [ // captions rendered in .step-desc, indexed by currentStep (0-3)
+  "1. 原始图片 (Original Image): 计算机看到的原始输入。",
+  "2. 数字化 (Digitization): 图片本质上是一个数字矩阵 (H x W x C)。",
+  "3. 切块 (Patchify): 典型设置：224×224 按 16×16 切成 14×14=196 个 Patch（此处等比示意）。",
+  "4. 序列化 (Serialize): 将二维分布的 Patch “拍扁”成一维序列 (Spatial Flatten)。现在它看起来就像一串“视觉单词”，可以被 Transformer 逐个读取。"
+]
+
+const nextStep = () => {
+  if (currentStep.value < 3) currentStep.value++
+}
+
+const prevStep = () => {
+  if (currentStep.value > 0) currentStep.value--
+}
+
+// 模拟一张风景图的 CSS 渐变
+// Sky (Blue) -> Mountains (Green/Grey) -> Sun (Yellow)
+const bgImage = 'linear-gradient(to bottom, #87CEEB 0%, #87CEEB 50%, #228B22 50%, #228B22 100%)'
+// Add a sun using radial gradient
+const complexBg = 'radial-gradient(circle at 70% 20%, #FFD700 0%, #FFD700 10%, transparent 10.5%), linear-gradient(to bottom, #87CEEB 0%, #87CEEB 60%, #4CA1AF 60%, #2C3E50 100%)'
+
+const getPatchStyle = (n) => {
+  const row = Math.floor((n - 1) / 14)
+  const col = (n - 1) % 14
+  
+  // Calculate background position for each patch to match the original image
+  // The container is 280px, each patch is 20px.
+  // 14 cols.
+  const posX = col * -20
+  const posY = row * -20
+  
+  const isPatchified = currentStep.value >= 2
+  
+  return {
+    backgroundImage: complexBg,
+    backgroundPosition: `${posX}px ${posY}px`,
+    backgroundSize: '280px 280px',
+    // In Step 0, patches are hidden to show pure container background
+    // In Step 1, patches are visible but transparent background to show numbers/borders over container background
+    // In Step 2, patches take over with their own background
+    opacity: currentStep.value === 0 ? 0 : 1,
+    // In Step 1, background must be transparent to see container bg
+    backgroundImage: isPatchified ? complexBg : 'none', 
+    transform: isPatchified ? 'scale(0.9)' : 'scale(1)',
+    transition: 'all 0.5s ease',
+  }
+}
+
+const getMiniPatchStyle = (n) => {
+  const row = Math.floor((n - 1) / 14)
+  const col = (n - 1) % 14
+  const posX = col * -20
+  const posY = row * -20
+  
+  return {
+    backgroundImage: complexBg,
+    backgroundPosition: `${posX}px ${posY}px`,
+    backgroundSize: '280px 280px',
+  }
 }
 </script>

@@ -77,40 +147,68 @@ const toggleState = () => {
  padding: 20px;
  background: var(--vp-c-bg-soft);
  margin: 20px 0;
+  user-select: none;
 }

 .control-panel {
  margin-bottom: 20px;
  display: flex;
-  justify-content: center;
+  flex-direction: column;
+  align-items: center;
+  gap: 15px;
 }

 .controls {
  display: flex;
-  gap: 20px;
+  gap: 15px;
  align-items: center;
 }

-.action-btn {
-  background: var(--vp-c-brand);
-  color: white;
-  border: none;
+.step-indicator {
+  font-family: monospace;
+  font-weight: bold;
+  color: var(--vp-c-text-2);
+}
+
+.step-desc {
+  font-size: 0.9em;
+  color: var(--vp-c-text-1);
+  text-align: center;
+  background: var(--vp-c-bg-mute);
  padding: 8px 16px;
  border-radius: 4px;
-  cursor: pointer;
-  font-weight: 600;
-  transition: opacity 0.2s;
-}
-
-.action-btn:hover {
-  opacity: 0.9;
-}
-
-.info {
+  min-height: 40px;
  display: flex;
-  gap: 15px;
+  align-items: center;
+  justify-content: center;
+  width: 100%;
+}
+
+.action-btn {
+  background: var(--vp-c-bg-mute);
+  color: var(--vp-c-text-1);
+  border: 1px solid var(--vp-c-divider);
+  padding: 6px 12px;
+  border-radius: 4px;
+  cursor: pointer;
+  transition: all 0.2s;
  font-size: 0.9em;
-  color: var(--vp-c-text-2);
+}
+
+.action-btn.primary {
+  background: var(--vp-c-brand);
+  color: white;
+  border-color: var(--vp-c-brand);
+}
+
+.action-btn:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.action-btn:not(:disabled):hover {
+  opacity: 0.8;
+  transform: translateY(-1px);
 }

 .visual-area {
@@ -118,7 +216,7 @@ const toggleState = () => {
  flex-direction: column;
  align-items: center;
  gap: 20px;
-  min-height: 300px;
+  min-height: 350px;
 }

 .image-container {
@@ -126,31 +224,55 @@ const toggleState = () => {
  grid-template-columns: repeat(14, 1fr);
  width: 280px;
  height: 280px;
-  gap: 0;
-  background: #333;
+  /* Step 0 & 1 Background */
+  background-image: radial-gradient(circle at 70% 20%, #FFD700 0%, #FFD700 10%, transparent 10.5%), linear-gradient(to bottom, #87CEEB 0%, #87CEEB 60%, #4CA1AF 60%, #2C3E50 100%);
+  position: relative;
  transition: all 0.5s ease;
-  border: 2px solid var(--vp-c-text-1);
+  box-shadow: 0 4px 12px rgba(0,0,0,0.1);
 }

+/* Step 2+: Remove container background, let patches show */
 .image-container.is-patchified {
+  background-image: none;
+  background-color: transparent;
  gap: 2px;
-  background: transparent;
-  border-color: transparent;
 }

 .patch {
-  background-color: hsl(var(--hue), 70%, 60%);
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 8px;
-  color: rgba(0, 0, 0, 0.5);
-  transition: all 0.5s ease;
+  color: rgba(255, 255, 255, 0.8);
+  position: relative;
 }

-.is-patchified .patch {
+/* Step 1: Pixelated Overlay Effect */
+.image-container.is-pixelated:not(.is-patchified) .patch {
+  border: 1px solid rgba(255, 255, 255, 0.1);
+  /* Only the cell border is styled here; patch visibility comes from the inline opacity set by getPatchStyle() */
+}
+
+/* Step 1: Digitization numbers */
+.pixel-val {
+  font-family: monospace;
+  font-size: 8px;
+  color: rgba(0, 0, 0, 0.3);
+  mix-blend-mode: overlay;
+}
+
+.patch-id {
+  background: rgba(0, 0, 0, 0.5);
+  color: white;
+  padding: 1px 2px;
  border-radius: 2px;
-  transform: scale(0.9);
+  font-size: 7px;
+}
+
+.arrow-down {
+  font-size: 24px;
+  color: var(--vp-c-text-2);
+  animation: bounce 1s infinite;
 }

 .sequence-container {
@@ -159,7 +281,7 @@ const toggleState = () => {
  padding: 15px;
  border-radius: 8px;
  border: 1px solid var(--vp-c-divider);
-  animation: fadeIn 0.5s ease;
+  animation: slideUp 0.5s ease;
 }

 .sequence-label {
@@ -171,50 +293,48 @@ const toggleState = () => {

 .token-stream {
  display: flex;
-  flex-wrap: wrap;
-  gap: 2px;
+  flex-wrap: nowrap;
+  gap: 1px;
+  overflow-x: auto;
+  padding: 10px 5px; /* Space for brackets */
+  align-items: center;
+  position: relative;
+}
+
+/* Add Matrix Brackets */
+.token-stream::before,
+.token-stream::after {
+  content: '';
+  display: block;
+  width: 6px;
+  height: 36px; /* Match vector height + padding */
+  border: 2px solid var(--vp-c-text-3);
+  flex-shrink: 0;
+}
+
+.token-stream::before {
+  border-right: none;
+}
+
+.token-stream::after {
+  border-left: none;
 }

 .mini-patch {
-  width: 10px;
-  height: 10px;
-  background-color: hsl(var(--hue), 70%, 60%);
+  width: 6px; /* Thinner to allow more density */
+  height: 32px; /* Taller to represent Vector Dimension D */
  border-radius: 1px;
-}
-
-.explanation {
-  margin-top: 20px;
-  padding: 12px;
-  background: var(--vp-c-bg-mute);
-  border-radius: 6px;
-  font-size: 0.9em;
-  line-height: 1.6;
-}
-
-.arrow {
-  font-size: 24px;
-  color: var(--vp-c-text-2);
-  animation: bounce 1s infinite;
+  flex-shrink: 0;
+  opacity: 0.9;
 }

@keyframes bounce {
-  0%,
-  100% {
-    transform: translateY(0);
-  }
-  50% {
-    transform: translateY(5px);
-  }
+  0%, 100% { transform: translateY(0); }
+  50% { transform: translateY(5px); }
 }

-@keyframes fadeIn {
-  from {
-    opacity: 0;
-    transform: translateY(10px);
-  }
-  to {
-    opacity: 1;
-    transform: translateY(0);
-  }
+@keyframes slideUp {
+  from { opacity: 0; transform: translateY(20px); }
+  to { opacity: 1; transform: translateY(0); }
 }
 </style>
@@ -3,7 +3,7 @@
    <div class="pipeline">
      <!-- 1. Transformer Output Grid -->
      <div class="stage">
-        <div class="stage-label">1. Processed Patches (Grid)</div>
+        <div class="stage-label">1. Patch Tokens (Shown as Grid) (Patch Token 网格示意)</div>
        <div class="grid-container">
          <div
            v-for="(item, index) in items"
@@ -19,13 +19,13 @@

      <div class="arrow-section">
        <div class="arrow-line"></div>
-        <div class="arrow-text">Flatten & Output</div>
+        <div class="arrow-text">Reshape for View: Grid ⇄ Sequence (重排显示：网格⇄序列)</div>
      </div>

      <!-- 2. Feature Vector Sequence -->
      <div class="stage">
        <div class="stage-label">
-          2. Feature Vector Sequence (The "Image Sentence")
+          2. Output Token Sequence (N×D) (输出序列)
        </div>
        <div class="vector-sequence">
          <div