- Recommended {{ currentSelection.interest }} × {{ currentSelection.purpose }} industries:
+
+ {{ recommendationTopics.length }} recommended scenarios for you
+
+ ({{ currentSelection.interest }} + {{ currentSelection.purpose }})
+
-
-
- {{ topic.title }}
-
+
scrollToAnchor(row.industryAnchor)"
+ highlight-current-row
+ >
+
+
+ {{ scope.row.title }}
+ {{ scope.row.desc }}
+
+
+
+
+ {{ scope.row.industryName }}
+
+
+
+
+ 💡 Click any row in the table to jump to the corresponding industry section
-
- Reset Selection
-
+
+
+
+ 💡 Please select both an interest direction and a purpose
+ 💡 Please select an interest direction
+ 💡 Please select a purpose
+
+
+
+ Reset Selection
@@ -411,6 +430,11 @@ Learners can choose directions based on these dimensions:
| 3 | Technical Documentation Auto-Generation System | LLM auto-generates product specifications, operation manuals; supports multi-format export |
| 4 | Production Equipment Inspection Report Auto-Generation | Voice input describes equipment status; structured inspection report auto-generated |
| 5 | Industrial Equipment Fault Diagnosis Q&A | Builds vector knowledge base from historical fault cases; provides intelligent diagnosis suggestions |
+| 6 | LLM Information-Retrieval Data Warehouse | Uses Text-to-SQL to convert natural-language queries into database queries; Superset visualizes results; Doris or ClickHouse as OLAP engine |
+| 7 | Industrial Equipment Fault-Diagnosis Knowledge Q&A Assistant | Builds a vector knowledge base from historical fault cases; LLM provides diagnosis suggestions and solution plans based on fault descriptions |
+| 8 | Production Quality Inspection Report Generation and Defect Classification | OCR identifies defects in inspection photos; LLM generates structured quality reports and classifies defect type and severity |
+| 9 | Inventory Counting Assistant and Inventory Report Generation | Inputs stocktaking data; LLM compares with system inventory and generates discrepancy reports with abnormal-inventory alerts |
+| 10 | Process Optimization Suggestion Intelligent Q&A System | Builds a RAG knowledge base from process documents; LLM provides optimization suggestions based on production issues |
---
@@ -425,6 +449,11 @@ Learners can choose directions based on these dimensions:
| 3 | Enterprise Internal Knowledge Intelligent Q&A | Builds vector knowledge base from internal documents; provides precise Q&A service for employees |
| 4 | Customer Service Conversation Smart Summary | Automatically generates conversation summaries; extracts key information and creates follow-up tickets |
| 5 | Golden Script Recommendation Knowledge Base | Analyzes excellent service cases; extracts golden scripts for team sharing and training |
+| 6 | Customer Service Script Compliance Auto-Check Assistant | Customer-service staff input reply drafts; LLM checks script compliance and sensitive words in real time and provides revision suggestions |
+| 7 | Customer Service Ticket Auto-Summary and Classification Tool | LLM summarizes long conversations and auto-classifies tags; Elasticsearch supports full-text ticket search |
+| 8 | Customer Emotion Monitoring and Abnormality Alert Tool | Real-time analysis of voice tone and text sentiment; LLM identifies abnormal emotions and triggers alerts with WebSocket push |
+| 9 | Golden Script Recommendation Knowledge-Base System for Customer Service | LLM analyzes excellent customer-service conversations, refines high-performing templates, and recommends scripts based on context |
+| 10 | Intelligent Outbound-Call Conversation Analysis and QA Assistant | After outbound-call recording transcription, LLM extracts key information; automatically generates QA reports and improvement suggestions |
---
@@ -439,6 +468,11 @@ Learners can choose directions based on these dimensions:
| 3 | Homework Auto-Grading & Learning Diagnosis | OCR recognizes handwritten answers; AI provides grading and improvement suggestions |
| 4 | Job Competency Model & Learning Map | Analyzes job requirements; generates competency models and corresponding learning paths |
| 5 | Foreign Language Oral Practice with AI | LLM plays role-play partners; simulates various real-life scenarios for speaking practice |
+| 6 | School-Based Curriculum Construction and Courseware Production Tool | LLM analyzes school characteristics and student needs to generate curriculum frameworks; integrates PPT generation APIs for automatic courseware creation |
+| 7 | College-Application Recommendation and Career Planning Platform | LLM analyzes candidate scores, ranking, interests, and other factors, then combines admissions data to recommend schools and majors |
+| 8 | Youth Programming Code Assistant | LLM explains code logic and provides coding guidance; supports switching between block languages and Python |
+| 9 | Knowledge-Point Mind Map Auto-Generation and Learning-Path Recommendation Tool | Input course topics; LLM automatically generates knowledge maps and recommends next-step learning content based on progress |
+| 10 | Chinese/English Essay Auto-Scoring and Correction Engine | LLM scores essays on dimensions such as ideas, structure, language, and diversity, and generates annotations with comparisons to high-quality samples |
---
@@ -453,6 +487,11 @@ Learners can choose directions based on these dimensions:
| 3 | Unit Test Auto-Generation | Analyzes source code structure; generates boundary condition test cases automatically |
| 4 | Code Quality Analysis Tool | Analyzes code complexity, security vulnerabilities; provides optimization recommendations |
| 5 | UI Code Auto-Generation from Design | Uploads design draft images; AI generates responsive HTML/CSS code |
+| 6 | Natural Language to SQL Auto-Generation Tool | LLM converts natural-language data requests to SQL and supports complex multi-table joins and aggregation queries |
+| 7 | API Automated Testing and Documentation Generation Platform | LLM analyzes code comments and API definitions, auto-generates test cases and API docs, and integrates Postman for test execution |
+| 8 | System Log Analysis and Fault Localization | ELK Stack collects log data; LLM extracts key anomaly information and locates root causes, then recommends fixes |
+| 9 | Frontend UI Code Auto-Generation Tool | OCR recognizes layout structures from design images; LLM generates responsive CSS and component code with TailwindCSS integration |
+| 10 | Intelligent Database Schema Design and Modeling Assistant | Input business requirement docs to LLM to auto-generate ER diagrams and schema definitions; supports exporting MySQL/PostgreSQL DDL scripts |
---
@@ -467,6 +506,11 @@ Learners can choose directions based on these dimensions:
| 3 | Clinical Research Data Analysis Platform | Integrates EMR data; assists in generating statistical analysis code for research |
| 4 | Medical Imaging Report Auto-Generation | Describes imaging features; generates structured medical imaging reports |
| 5 | Chronic Disease Medication Reminder | Generates personalized medication plans; supports drug interaction and contraindication checks |
+| 6 | Drug Package-Insert Intelligent Q&A Assistant | Upload package-insert images or input drug names; LLM answers dosage, side effects, and precautions |
+| 7 | Disease Knowledge Popular-Science Article Generator | Input disease name and audience type; LLM generates easy-to-understand educational content and supports multiple versions |
+| 8 | Medical Imaging Report Auto-Generation Tool | Radiologists describe imaging features; LLM auto-generates structured report content and supports common exam templates |
+| 9 | Surgical Record Intelligent Generation and Archiving Assistant | Voice input records key surgical steps; LLM generates structured surgical records and auto-links surgery codes |
+| 10 | Chronic Disease Medication Reminder Intelligent Assistant | Patients input medication lists; LLM generates personalized reminders and supports contraindication checking and interactive Q&A |
---
@@ -481,6 +525,11 @@ Learners can choose directions based on these dimensions:
| 3 | Security Operations Daily Report | Aggregates security logs; automatically extracts and generates daily reports |
| 4 | Penetration Test Report Generation | Inputs vulnerability descriptions; AI generates complete penetration test reports |
| 5 | Threat Intelligence Analysis Assistant | Connects to threat intelligence sources; interprets and analyzes potential threats |
+| 6 | Malicious Code Protection and Privacy Compliance Monitoring | Sandboxes suspicious-file behavior; LLM identifies malicious features and generates signatures; scans sensitive data exposure |
+| 7 | Security Configuration Compliance Checklist Generation Tool | Input target system type; LLM generates configuration checklists supporting standards such as MLPS 2.0 and CIS |
+| 8 | Threat Intelligence Intelligent Query and Analysis Assistant | Connects multi-source threat intelligence (open-source/commercial); LLM interprets intelligence and links it with enterprise assets |
+| 9 | Security Incident Postmortem Report Generation Assistant | After incidents, LLM auto-generates timeline-based postmortem reports with root-cause analysis and remediation suggestions |
+| 10 | Global Threat Intelligence Monitoring and Alert Center | Crawlers collect global security news and vulnerability disclosures; LLM extracts key information, assesses impact, and sends alerts |
---
@@ -495,6 +544,11 @@ Learners can choose directions based on these dimensions:
| 3 | IPO Prospectus Generation & Compliance Check | Uses modular templates; auto-fills business descriptions with compliance verification |
| 4 | Financial Report & Anomaly Warning | Auto-generates financial analysis reports; monitors business anomalies in real-time |
| 5 | Insurance Agent Practice Coach | Simulates customer scenarios; evaluates script compliance and persuasion skills |
+| 6 | Compliance Case Intelligent Retrieval and Q&A Assistant | Builds knowledge bases from regulatory penalty cases; LLM answers compliance questions and provides relevant case references |
+| 7 | Insurance Agent Intelligent Script Practice | LLM plays different customer personas for simulation and evaluates script compliance and persuasion with transcription analysis |
+| 8 | Insurance Product Clause Analysis and Competitor Comparison Platform | Parses clauses structurally; LLM generates feature summaries and key cautions |
+| 9 | Customer Script Emotion Recognition Service | Combines voice-emotion recognition with script-compliance checks and gives real-time coaching suggestions |
+| 10 | Insurance Claim Progress Intelligent Query and Dialogue Assistant | Users input policy or case numbers; LLM queries claim status and answers claim-related questions |
---
@@ -509,6 +563,11 @@ Learners can choose directions based on these dimensions:
| 3 | Marketing Content Auto-Generation | Generates marketing copy, social media posts, and advertising scripts |
| 4 | Competitor Ad Analysis Platform | Collects and analyzes competitor advertising strategies |
| 5 | Hot Topic Analysis & Content Recommendation | Analyzes trending topics; recommends content creation angles |
+| 6 | Resume Intelligent Parsing and Job Matching System | Parses resume PDFs to extract key information; LLM matches suitable roles and generates interview suggestions; integrates with ATS systems |
+| 7 | Employee Onboarding Guidance and Q&A Assistant | Uses RAG retrieval over onboarding docs; LLM answers common new-hire questions |
+| 8 | Employee Performance Feedback and OKR Management Platform | Collects OKR data; LLM analyzes goal completion and generates feedback suggestions with 360-feedback integration |
+| 9 | Intelligent Meeting Minutes and To-Do Management | Transcribes meeting recordings; LLM extracts key points and action items; auto-creates tasks in task systems |
+| 10 | Invoice Recognition and Expense Reimbursement Auto-Processing | OCR recognizes invoice fields and automatically checks authenticity and reimbursement compliance; integrates with finance systems |
---
@@ -523,6 +582,11 @@ Learners can choose directions based on these dimensions:
| 3 | Digital Human Live Streaming System | Creates digital human anchors; generates real-time dialogue for live streaming |
| 4 | Short Video Script & Editing | Generates short video scripts; provides intelligent editing suggestions |
| 5 | Marketing Content Design System | Generates advertising copy and designs marketing materials |
+| 6 | Intelligent Marketing Content Generation and Design System | Input product information; LLM generates marketing copy and selling-point extraction; integrates with template-design tools |
+| 7 | Multi-Platform Ad ROI Real-Time Monitoring and Strategy Optimization System | Connect ad-platform APIs for data collection; LLM analyzes performance and generates optimization suggestions with anomaly alerts |
+| 8 | Search-Engine Keyword and Traffic Analysis | Collects keyword-tool data; LLM analyzes trends and competition and recommends topic directions |
+| 9 | Competitor Ad Placement Analysis Platform | Uses third-party data APIs to collect competitor ads; LLM analyzes placement strategy and creative patterns |
+| 10 | Full-Network Hot Topic Analysis and Content Recommendation System | Collects trending data; LLM analyzes trend shifts and recommends content angles with calendar scheduling |
---
@@ -537,6 +601,11 @@ Learners can choose directions based on these dimensions:
| 3 | Enterprise Policy Matching Platform | Analyzes enterprise profiles; intelligently matches applicable support policies |
| 4 | Approval Materials Pre-Review | OCR recognizes application materials; automatically checks completeness |
| 5 | City Grid Event Management | Identifies event types from reports; intelligently dispatches to responsible departments |
+| 6 | Social Sentiment Big-Data Analysis and Risk Early Warning System | Fuses multiple sources such as hotlines, online sentiment, and field visits; LLM identifies risk hotspots |
+| 7 | Government Archive Digitization Recognition and Intelligent Filing Platform | OCR recognizes archive text; LLM extracts key information and auto-classifies; supports full-text retrieval |
+| 8 | Emergency Command and Rescue Resource Intelligent Dispatch Platform | Collects emergency-event data; LLM generates emergency response plans with resource-dispatch optimization |
+| 9 | Grid-Based Atmospheric Pollution Monitoring and Precision Traceability System | Collects air-quality sensor data; CV identifies pollution sources; LLM analyzes trends and traces causes |
+| 10 | Public-Safety Incident Intelligent Risk Warning Assistant | Integrates historical events and real-time reports; LLM estimates risk levels and outputs warning recommendations |
---
@@ -551,6 +620,11 @@ Learners can choose directions based on these dimensions:
| 3 | Legal Regulation Change Monitoring | Monitors regulatory updates; analyzes impact on business operations |
| 4 | Legal Letter Auto-Drafting | Inputs case facts; AI generates standard legal letters |
| 5 | Legal Terms Plain Language Explanation | Translates complex legal terms into easy-to-understand language |
+| 6 | Courtroom Recording Real-Time Transcription and Dispute-Focus Extraction Recorder | ASR transcribes hearing audio; LLM extracts dispute focuses and key arguments with timestamps |
+| 7 | Full-Network IP Infringement Clue Monitoring and Blockchain Evidence Preservation System | Monitors e-commerce and social media infringement; automatically collects and preserves evidence |
+| 8 | LLM-Based IPO Prospectus Key-Data Consistency Check and Risk Alert Agent | Compares data across prospectus sections; LLM identifies inconsistencies and abnormal values with risk tags |
+| 9 | Complex Legal Clause "Translation" Plugin in Plain Language | Users select legal clauses and LLM outputs understandable explanations |
+| 10 | Case Evidence-Chain Intelligent Structuring and Visualization System | Upload evidence materials; LLM analyzes evidence relationships and timelines |
---
@@ -565,6 +639,11 @@ Learners can choose directions based on these dimensions:
| 3 | Visa Materials Pre-Review | OCR recognizes visa materials; automatically checks for completeness |
| 4 | Real-Time Translation for Travel | Offline voice translation; recognizes and translates menu images abroad |
| 5 | Travel Notes Auto-Generation | Extracts information from travel photos; generates shareable travel journals |
+| 6 | Data-Driven Hotel "Pitfall Avoidance" Analyzer Based on Real Reviews | Collects hotel review data; LLM extracts positive and negative keyword patterns |
+| 7 | Immersive Destination VR Preview and Virtual Room Selection Platform | Collects 360-degree panoramas; VR enables immersive previews and virtual room tours |
+| 8 | Travel Footprint Auto-Generated Travel Notes and Social Copy Assistant | Extracts time/location metadata from photos; LLM generates travel notes with template-based layout |
+| 9 | Enterprise Travel Invoice Aggregation and Compliance Reimbursement Management Platform | Connects travel-platform APIs for automatic invoice collection and compliance checks |
+| 10 | Scenic-Area Crowd Congestion Prediction and Off-Peak Route Navigation | Collects scenic-area crowd data; ML predicts congestion windows and recommends off-peak routes |
---
@@ -579,6 +658,11 @@ Learners can choose directions based on these dimensions:
| 3 | Cognitive Training for Elderly | Provides cognitive games; uses old photos to trigger memory for dementia patients |
| 4 | Social Anxiety Practice Coach | Creates virtual social scenarios; helps practice social interactions |
| 5 | Mood Monitoring & Incentive Assistant | Analyzes mood patterns; generates positive encouragement content |
+| 6 | Generative AI Customized Bedtime Story Machine for Children | Parents input themes/preferences; LLM generates customized stories with background music support |
+| 7 | Deceased Digital-Life Reconstruction and LLM Cross-Time Dialogue System | Trains personalized models from pre-death voice/text data and generates memory-based conversations |
+| 8 | MBTI-Based AI Personality Mirror and Empathetic Chatbot | Inputs MBTI results; LLM outputs personality analysis and empathetic responses with match suggestions |
+| 9 | Privacy-Protected AI Confession Tree-Hole for Teenagers | Anonymous channel for emotional expression; LLM provides listening/suggestions with sensitive-word alerts |
+| 10 | Self-Evolving AI Virtual Pet Growth System | Trains pet personality models and supports interaction-driven growth and virtual customization |
---
@@ -593,6 +677,11 @@ Learners can choose directions based on these dimensions:
| 3 | Interactive Novel Story Generator | Reader choices affect story development |
| 4 | Esports Game Analysis & Commentary | Real-time game analysis with AI-powered commentary |
| 5 | Audiobook Auto-Generation | Converts text to audio with character-specific voices |
+| 6 | Personalized Humor Content Recommendation Algorithm Engine | Builds user-interest profiles and recommends matching humor content |
+| 7 | AI Smart Vocal Tuning and KTV Voice Enhancement Software | Performs denoising and vocal enhancement with AI tuning algorithms |
+| 8 | Film/TV Character-Centric Plot Extraction and Editing Tool | Analyzes video content, extracts character-related clips, and auto-generates edited cuts |
+| 9 | Multi-Role TTS Audiobook Auto-Generation System | Assigns text roles and generates personalized voices with background music/effects |
+| 10 | Board-Game Reinforcement-Learning Review Coach | Analyzes game records, simulates AI opponents, and generates review suggestions |
---
@@ -607,6 +696,11 @@ Learners can choose directions based on these dimensions:
| 3 | Multi-Language Translation | Localizes product descriptions for international markets |
| 4 | Digital Human Live Streaming | AI-powered virtual streamers for 24/7 live commerce |
| 5 | Trend Analysis & Product Selection | Analyzes market trends; suggests trending products to sell |
+| 6 | Full-Network Same-Product AI Price Comparison and Trend Prediction Plugin | Crawls e-commerce prices, displays comparison charts, and predicts price trends |
+| 7 | Buyer-Show Image AI Selection and Short-Video Synthesis Platform | Scores buyer-show images, auto-recommends high-quality content, and synthesizes short videos from templates |
+| 8 | LLM-Based Real-Time Sales Dialogue Voice Analysis and Golden-Script Recommendation | ASR transcribes calls and performs real-time script compliance checks with recommendation output |
+| 9 | Market Trend AI Insight and Best-Seller Prediction Engine | Collects and analyzes social media and e-commerce data; LLM identifies trend hotspots and recommends product choices |
+| 10 | Private-Domain User Profiling AI Clustering and Precision Operations System | Clusters user behavior data, generates profile tags, and triggers automated marketing flows |
---
@@ -621,6 +715,11 @@ Learners can choose directions based on these dimensions:
| 3 | Electricity Price Prediction | ML predicts spot prices; generates trading strategies |
| 4 | Carbon Emission Calculation | Auto-calculates enterprise carbon footprint; generates ESG reports |
| 5 | Grid Load Prediction | Predicts grid load under extreme weather; generates dispatch plans |
+| 6 | Gas-Station Violation AI Video Recognition and Alert Guard | Analyzes surveillance video, detects violations (phone calls, smoking, etc.), and pushes alerts |
+| 7 | Long-Distance Oil/Gas Pipeline Leak Acoustic AI Monitoring and Precision Positioning System | Collects acoustic-sensor data for leak detection and localization algorithms |
+| 8 | Virtual Power Plant Resource Aggregation and AI Power-Trading Decision System | Connects distributed resources for aggregated optimization dispatch and strategy execution |
+| 9 | Mine Personnel AI Position Tracking and Dangerous-Area Intrusion Alarm | Uses UWB/Bluetooth positioning for trajectory tracking and geofenced danger-zone alerts |
+| 10 | Energy-Storage Battery Health AI Assessment and Thermal-Runaway Warning | Monitors battery runtime data, evaluates health status, and triggers thermal-risk alerts |
---
@@ -635,6 +734,11 @@ Learners can choose directions based on these dimensions:
| 3 | Video Restoration & Colorization | 4K super-resolution; AI adds color to black and white footage |
| 4 | Text-to-Speech with Emotion | Generates natural-sounding speech with emotional expression |
| 5 | Meeting Transcription | Multi-speaker voice separation; generates meeting transcripts with action items |
+| 6 | Video Object Removal AI Engine | Uses object tracking and inpainting to remove unwanted objects with frame-level consistency |
+| 7 | Copyright-Safe Background Music AIGC Auto-Composer | Uses music-generation models with controllable emotional style and copyright checks |
+| 8 | Specific-Person Voice Clone and Voice Conversion Software | Trains timbre models from small voice samples and supports voice conversion |
+| 9 | One-Click Script-to-Storyboard and AI Dynamic Preview Video Platform | Parses scripts into storyboards and auto-generates previsualization videos |
+| 10 | Meeting Recording AI Smart Transcription and Core To-Do Extraction Assistant | Performs multi-speaker transcription and LLM-based to-do extraction with timestamps |
---
@@ -649,6 +753,11 @@ Learners can choose directions based on these dimensions:
| 3 | Logo & Brand Design | Generates brand logos; creates complete VI systems |
| 4 | Trend Analysis & Content Ideas | Tracks trending topics; suggests marketing angles |
| 5 | Video Script Generator | Generates short video scripts with shooting suggestions |
+| 6 | Competitor Marketing Strategy Deep Analysis and AI Weekly Report Generator | Collects/analyzes competitor content, extracts strategy insights, and auto-generates weekly reports |
+| 7 | Search-Engine Keyword AI Layout and Traffic Article Batch Writing | Analyzes keywords, generates articles at scale, and gives SEO optimization recommendations |
+| 8 | Personalized Marketing Email AI Writing Expert | Uses user-profile data for personalized content generation with A/B testing |
+| 9 | Brand Reputation Full-Network Monitoring and Crisis AI Alert Radar | Collects network sentiment data, runs sentiment analysis, and pushes crisis alerts |
+| 10 | Short-Video Script Creative AIGC Generation and Storyboard Guidance Assistant | Inputs themes and outputs scripts, storyboards, and practical shooting guidance |
---
@@ -663,3 +772,8 @@ Learners can choose directions based on these dimensions:
| 3 | Data Quality Monitoring | Detects data anomalies; suggests fixes |
| 4 | Report Generator | Creates reports and dashboards through conversation |
| 5 | Metric Q&A Assistant | Answers questions about data metric definitions and calculations |
+| 6 | Intelligent Data-Report Interpretation and Trend Analysis Assistant | Upload report images or input data; VLM interprets chart content and analyzes trends |
+| 7 | Intelligent DB-Schema Interpretation and Query-Example Generation Assistant | Input table names or field descriptions; LLM generates schema explanations and sample SQL |
+| 8 | Enterprise Master-Data Intelligent Alignment and AI Dedup Governance | Matches master data across sources, identifies duplicates, and supports merge-rule configuration |
+| 9 | Data Requirement Doc to Test-Case Intelligent Conversion Tool | Input data requirement descriptions; LLM generates test scenarios and validation test cases |
+| 10 | Data Metric-Definition Intelligent Q&A Assistant | Builds a knowledge base from metric-definition docs; LLM answers definition and calculation logic questions |
diff --git a/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md b/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md
new file mode 100644
index 0000000..acffff5
--- /dev/null
+++ b/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md
@@ -0,0 +1,1069 @@
+# Dify Basics and Knowledge Base Integration
+
+# Review of the Previous Lesson
+
+In the previous lessons, we learned in groups the basics of AI coding, prompt engineering, and AI image generation. These topics helped us build an initial understanding of the boundaries and capabilities of different large language models (LLMs) and generative models.
+
+To help you review the previous lesson, think through these quick questions:
+
+1. What is AI programming? How can you use an AI coding tool (for example, [z.ai](http://z.ai)) to create a webpage?
+2. What is a large language model? What are prompt engineering and context engineering? How should you write a complex prompt?
+3. Across text, AI coding, and image generation, where do you think model strengths and weaknesses show up most clearly?
+4. What is an API? How do you use [z.ai](http://z.ai) to connect to third-party APIs?
+
+If any question still feels unclear, you can revisit the previous lesson docs or ask directly in the WeChat group.
+
+In this lesson, we move from simple AI text/image tools to workflow-building platforms closer to real business deployment. We go from chatbots to AI agents and AI workflows, and then use APIs to turn them into interactive "intelligent" chatbot pages.
+
+During hands-on operation, if any step is hard to understand, do not worry. A recommended approach is to take a screenshot of the page you are on and ask a model directly. Current models can already resolve most common issues.
+
+If you still cannot solve it after asking, keep trying. Do not be afraid of mistakes. Every attempt is part of learning and progress. With more practice, you will become increasingly fluent and confident.
+
+# What You Will Learn in This Lesson
+
+1. Why we need to move from chatbots to agents and workflow orchestration.
+2. What an agent/workflow development platform is, and how to turn AI capability into SOP-style, orchestratable processes.
+3. What Dify is, and how to quickly build applications on this open-source LLM platform, especially a knowledge-base QA chatbot.
+4. How RAG works and why retrieval-augmented generation is needed.
+5. How to learn Dify and AI IDE Trae (`Extra Knowledge 4 - What is AI IDE and Trae`) from 0 to 1, including building agents, workflows, and a frontend chatbot webpage using Dify API.
+
+- Basic Dify principles, agent/workflow building methods, and API invocation.
+- AI IDE usage and AI-assisted coding workflow.
+- A frontend agent program that can chat.
+
+# 1. From Conversation to Agent
+
+In the previous stage, we learned how to use prompts to make models play roles, generate text, or write simple code. But if you think carefully, there is a key issue: a chatbot itself cannot actually do work.
+
+It can answer "how to check an order," but it cannot truly query your database for the order number. It can describe what a weekly report should include, but it cannot automatically collect project data and send the email. This "can say but cannot do" limitation makes pure conversational AI hard to truly embed into business processes.
+
+To upgrade AI from chat companion to digital employee, we need to give it three core capabilities:
+
+1. Proprietary knowledge: let it read and understand your product docs, customer materials, and internal policies.
+2. Tool calling (or plugins): let it operate databases and call APIs.
+3. Structured execution: let it complete tasks step by step with predefined logic, not free improvisation.
+
+This is the prototype of an AI Agent: an automation unit with goals, knowledge, tools, and an execution path.
+
+
+
+> Note: In current industry usage, "simple agents" usually mean enhanced applications built from LLM + tools + knowledge base, not fully autonomous planning agents. Even though these simple agents do not have true long-horizon reasoning and planning, they are already enough for many enterprise automation scenarios. We will introduce truly autonomous agents in later chapters.
+
+## 1.1 The Simplest Agent: Knowledge-Base QA Chatbot
+
+After clarifying the core capabilities of an agent, a natural question follows: can we build a practical basic agent by implementing only one of these capabilities? The answer is yes.
+
+In many real business scenarios, users do not need AI to execute complex operations (such as API orchestration across multiple systems). Their core need is accurate, reliable QA grounded in company-specific materials. This maps exactly to the first core capability: proprietary knowledge service.
+
+That leads to the simplest and most widely used agent form: a knowledge-base QA chatbot.
+
+Although it does not yet include tool calling or autonomous planning, the key breakthrough is this: model answers are no longer generated "from thin air." They become evidence-grounded. How is that achieved? We need to solve one core challenge: when there are thousands of pages of internal docs, how can the model quickly find the most relevant parts for each user question?
+
+One solution is Retrieval-Augmented Generation (RAG).
+
+The core RAG idea is: when a user asks a question, the system first retrieves the most semantically relevant text chunks from enterprise knowledge (for example, one paragraph from a product manual, one policy clause from HR docs), then injects these chunks into model context so the answer is generated based on real source material.
+
+
+
+Image source: [https://www.datacamp.com/blog/what-is-retrieval-augmented-generation-rag](https://www.datacamp.com/blog/what-is-retrieval-augmented-generation-rag)
+
+This means responses no longer rely only on generalized training knowledge. They are anchored to enterprise-authoritative information. The goal of RAG is exactly this dynamic external-knowledge injection, which significantly improves answer truthfulness, accuracy, and consistency. It can even enforce response persona/style, such as customer-support tone or technical-document style.
+
+In real business, this is especially important because models can hallucinate. For example, if you ask for concrete metrics as a CFO or consultant, a model may fabricate dates and events. With RAG, controllability and reliability improve significantly.
+
+
+
+Image source: [https://www.databricks.com/glossary/retrieval-augmented-generation-rag](https://www.databricks.com/glossary/retrieval-augmented-generation-rag)
+
+In this lesson's hands-on section, we will use Dify, a popular AI workflow platform, to build a knowledge-base QA chatbot. You can easily turn many kinds of proprietary materials into a knowledge base, such as product manuals, company policy docs, project docs, research papers, knowledge-base articles, and even personal notes.
+
+After setup, you can test with questions such as:
+
+- "What are the major upgrades in the latest version of Product A?"
+- "According to the employee handbook, how is annual leave policy defined this year?"
+- "In project XX, how did we solve technical challenge 'XXX'?"
+- "What is the core research method described in this paper?"
+
+You will directly feel how RAG transforms static, scattered documents into a precise intelligent knowledge base that supports high-accuracy QA across scenarios.
+
+## 1.2 From Conversational Agent to Workflow
+
+However, even "enhanced agents" with knowledge base and tool calling are still insufficient for more complex business processes.
+
+Imagine this request:
+"What new features were released in our newly launched SaaS product recently? Can you organize them into a client-facing brief?"
+
+This looks simple, but behind the scenes it requires coordinated steps: first retrieve the last month's release notes from internal docs or Notion knowledge base; then filter customer-facing key features; then call an LLM to rewrite technical descriptions into customer-friendly language; and finally send the generated content to the marketing team's email or save it into a Google Docs template.
+
+If we rely only on a single LLM to reason freely, it is hard to execute the entire process in one dialogue. Even if it does, it can miss key details, confuse internal terms with customer language, or fail to output in structured form. More importantly, enterprises need an auditable, reusable, monitorable standardized execution path, not one-off improvisation in each run. Monitoring and reproducibility are crucial for enterprise risk control.
+
+This leads to a higher-level AI application pattern: AI Workflow.
+
+
+
+Workflow means decomposing a complex task into ordered, configurable, automatically executable sub-steps, then orchestrating logic between steps (conditionals, loops, parallelism) visually or via code. Turning AI capability into SOP means solidifying "how AI completes this task" into reusable templates.
+
+This brings multiple benefits: non-technical roles (such as product managers or operators) can build AI apps quickly via drag-and-drop; developers can encapsulate RAG retrieval, LLM calls, API tools as standard nodes for reuse across business scenarios; and the full process can be tracked, debugged, and optimized continuously to satisfy enterprise requirements for stability and compliance.
+
+AI workflow users are broad. Product managers can design full interaction flows without writing code; operations can quickly build customer-service bots, content generators, or notification systems; developers and ML engineers can modularize capabilities for frontend integration; founders and indie developers can validate AI MVPs at low cost and launch prototypes with query + generation + actions in days.
+
+Also note that AI workflows are usually described by an intermediate representation. Platform specifics differ, but most use structured files (JSON, YAML, etc.) to define node types, inputs/outputs, and execution logic, as shown below:
+
+
+
+In short, if agents let AI move from "can chat" to "can do," workflows let AI move from "occasionally complete one task" to "stably, reliably, and at scale complete a class of tasks." In the following practice, we will build a full AI workflow on Dify and experience the full path from idea to runnable app.
+
+## 1.3 Common Agent / Workflow Platforms
+
+As generative AI develops rapidly, many low-code and no-code agent/workflow platforms have emerged to help developers and business users build intelligent processes quickly without falling into low-level coding complexity.
+
+First, clarify what low-code means: development tools that significantly reduce manual coding through drag-and-drop visual components, preset logic templates, and graphical rule configuration. Core idea: replace direct coding with visual node orchestration. This frees technical users from repetitive work and allows non-technical users familiar with business logic to participate in app building. It is essentially a bridge between efficiency and flexibility.
+
+The key value of low-code/no-code AI platforms is reducing development threshold. Work that used to take weeks of cross-functional collaboration (requirements, coding, testing, deployment) can now go from idea to launch in hours for common agent scenarios such as customer QA bots and data-processing assistants.
+
+Mainstream low-code AI workflow platforms include:
+
+| Platform | Features | Typical Scenarios |
+| --------------------------------------------- | -------------------------------------------------- | -------------------------------------- |
+| Dify | Open source; supports knowledge-base RAG, LLM orchestration, API output; Chinese-friendly | Enterprise knowledge QA, custom agents, API services |
+| Coze (ByteDance) | Available in China, integrated with Doubao/Feishu ecosystem, rich plugins | Social bots, domestic mini-program integration |
+| n8n | General automation platform with AI nodes, strong in API orchestration | Cross-system sync, AI + traditional SaaS automation |
+| Baidu Qianfan AppBuilder / Alibaba Bailian / Tencent HunYuan | Cloud-native vendor stacks with in-house models | Enterprise deployment, strict compliance scenarios |
+
+There are many choices in the market. Although AWS, Azure, Alibaba Cloud, and others all provide workflow solutions, Dify, Coze, and n8n are currently among the most widely used due to three major advantages:
+
+1. Extreme usability: visual drag-and-drop UIs make onboarding easy without deep low-level understanding.
+2. High flexibility: custom components and extensible APIs support both lightweight demo/MVP and agile iteration for SMB teams.
+3. Mature ecosystem: detailed docs, responsive support, and active communities with reusable templates.
+
+All three support exposing built agents as standardized APIs, enabling seamless integration with frontend web apps, enterprise ERP systems, and mobile apps, which further lowers deployment threshold.
+
+### 1.3.1 Dify: Enterprise LLMOps and Application Lifecycle Platform
+
+Dify is positioned as an LLM application development and operations platform, focused on full lifecycle management from idea to deployment to optimization. Its core is a low-code platform helping developers and non-technical innovators rapidly build production-grade AI applications.
+
+
+
+Feature-wise, Dify includes visual workflow orchestration, agent building, knowledge-base management, and multi-model support. You can design complex processes by dragging nodes and create intent-based agents. Its knowledge-base capability can process many document formats and support efficient vector retrieval. Dify supports GPT, Claude, and many open-source models, and can publish apps as standard APIs with one click.
+
+
+
+Architecturally, Dify emphasizes open source and private deployment, with flexibility, extensibility, and enterprise compliance. Typical users include developer teams and business innovators. Typical use cases include enterprise knowledge QA/customer support, content automation, vertical AI assistants, and enterprise AI middle platforms.
+
+### 1.3.2 Coze (ByteDance): Popularizing Zero-Code AI Agent Building
+
+Coze is ByteDance's AI agent platform. Its core value is extreme usability, allowing users with no programming background to create, debug, and publish rich AI chatbots.
+
+
+
+Its core interaction is "building blocks." Users can configure bot roles and knowledge bases via UI, and use rich built-in plugin libraries for external capabilities such as news, travel, and image generation. Built bots can be published with one click to Doubao, Feishu, WeChat Official Account, and other channels.
+
+
+
+Its architecture is designed around low-threshold usage, integrating ByteDance models behind cloud services and abstracting complex flow details, with emphasis on multimodal understanding and real-time responses. Private deployment capability is relatively limited. Typical scenarios include personal assistant and entertainment bots, customer QA systems, online learning assistants, and rapid prototyping.
+
+### 1.3.3 n8n: Programmable Backend Workflow Automation Engine
+
+n8n is a general-purpose programmable workflow automation platform. Its core positioning is connecting applications, databases, and APIs to automate data movement and task execution.
+
+It supports hundreds of SaaS services, databases, and protocols through a large integration-node ecosystem, and combines visual design with code: you can drag nodes on canvas while injecting JavaScript/Python for custom logic. n8n is strong in backend, data-intensive workflows such as sync, ETL, and API orchestration.
+
+
+
+Its key technical characteristic is visible source code and self-hosting, allowing full control of data and environment. This is especially attractive for industries with strict data-security requirements. Main users are developers, technical operators, and data analysts. n8n's biggest strength is its powerful community ecosystem: rich online tutorials and shared templates lower learning cost. It also connects to global ecosystems such as YouTube and Instagram, helping users break cross-platform data/service barriers.
+
+### 1.3.4 Other Workflow Platforms
+
+Besides these well-known platforms, major Chinese tech vendors also launched integrated AI platforms. For example, Baidu Qianfan AppBuilder supports end-to-end model selection, RAG building, and agent publishing, deeply integrated with Wenxin models; Alibaba Bailian (Tongyi-based) emphasizes enterprise security and private deployment; Tencent Cloud TI focuses on finance/healthcare vertical templates. These are often deeply integrated with their cloud ecosystems and fit enterprises already in those stacks.
+
+However, in terms of generality, openness, and community ecosystem, Dify, Coze, and n8n are still among the most widely adopted choices due to usability, broad model support, and active developer communities.
+
+Although platform positioning and ecosystems differ, the core logic is similar: visually orchestrate and connect capability modules. Once you master the design and operation of one platform, you can transfer quickly to others. In the following practice, we use Dify as the example.
+
+# 2. Understanding Dify Step by Step
+
+## 2.1 What is Dify
+
+We already covered basic Dify introduction earlier. For more details, visit [https://cloud.dify.ai/apps](https://cloud.dify.ai/apps), and for official information visit https://dify.ai.
+
+Dify is an open-source platform for developing LLM applications. It provides an intuitive interface that combines agent workflows, RAG pipelines, tool capabilities, model management, and observability, helping you move quickly from prototype to production.
+
+
+
+In Dify, you can combine large models and many tools to build a "workflow." A workflow is a business-logic chain that automates operations you would otherwise do manually step by step, such as data retrieval, LLM calls, web search, result filtering, and format organization. Without workflows, you repeatedly copy/paste similar prompts, which is inefficient, error-prone, and hard to reuse in real business.
+
+Building workflows is like assembling blocks/puzzle pieces. You connect LLM nodes (understanding/generation), tool nodes (specific actions such as querying DB, sending email, translating text), and data nodes (read/store info). They then collaborate automatically under your predefined logic without manual repetition. You can also think of it as "low-code programming": by drag-and-drop and input/output configuration, you can implement fairly complex business logic.
+
+For example, if you run an Amazon or Douyin e-commerce store and want an AI customer service system, you can design a workflow like this:
+
+1. Trigger node (`START`): receives user query, for example "How long is the warranty period for this product?"
+2. Question classifier node (`QUESTION CLASSIFIER`): uses an LLM (for example GPT) to classify the query into after-sales (warranty), usage guidance, or other types.
+3. Knowledge retrieval node (`KNOWLEDGE RETRIEVAL`): automatically queries the corresponding knowledge base based on classification. If warranty-related, retrieve precise warranty SOP content.
+4. LLM node: sends user query + retrieved context to model and generates user-friendly response.
+5. Condition node: checks whether response includes clear warranty period terms (for example "1 year" or "3 years"). If yes, continue; if no, return "please provide product model."
+6. Output node (`ANSWER`): returns final answer and logs this consultation into a table automatically.
+
+
+
+In this process, you do not manually browse docs, repeatedly tune outputs, or separately log data. The workflow chains it all automatically. It is also flexible: if later you add a new rule like "when user asks warranty coverage, query another KB," just add one conditional node instead of rebuilding the system.
+
+This is a relatively simple workflow example. Fully mastering all capabilities may still feel hard at this stage. So in this lesson, we start from a more basic knowledge-base agent and gradually move to advanced workflow techniques later.
+
+### 2.1.1 Deploy Your Own Dify (Optional)
+
+This part was originally scheduled for later lessons. Because some learners currently cannot access Dify official cloud due to network constraints, we provide this optional path earlier so you can continue smoothly.
+
+You need to reference this tutorial for basic web deployment platform usage:
+[How to Deploy a Web Application](/en/stage-2/backend/2.5-zeabur-deployment/)
+
+
+
+Learn how to deploy your own Dify on Zeabur. After deployment, register and log in via your deployment URL, then continue with the steps below.
+
+Note: different Dify versions may have small UI/operation differences, but overall logic is similar. If something looks different, do not panic; find equivalent entry points and continue.
+
+## 2.2 Create Your First Dify Chatbot App
+
+Visit Dify home page [https://cloud.dify.ai/apps](https://cloud.dify.ai/apps), register and log in, then choose Studio. You will see an interface similar to:
+
+
+
+Find `CREATE APP` on the left and click `Create from Blank`.
+
+
+
+
+
+In APP Type, choose Chatbot (if not visible at first, click "see more types" and find it in full list). Then fill app name and description and click create.
+
+
+
+After creation, you will see an interface like this:
+
+
+
+The middle "INSTRUCTIONS" area means built-in instructions (default/system prompt).
+
+Below that is the "Knowledge" area where we upload knowledge base later.
+
+The right panel is the debug window where you can test interactions in real time after editing prompts.
+
+You can type your own role prompt in INSTRUCTIONS, or click Generate to let the model draft one.
+
+
+
+Note the top-right model choices: you can switch different models and compare differences in tone, reasoning, and long-context handling to pick what best fits your needs.
+
+
+
+## 2.3 Support Custom Model Providers
+
+To fully leverage Dify flexibility, and because model availability differs by region and business constraints (cost/privacy), we often need custom models. Dify supports three core model types: LLM, Embedding, and Rerank. This section walks through custom configuration.
+
+Dify can connect mainstream providers (OpenAI, Azure, Anthropic) and also supports any self-hosted or third-party model that follows OpenAI API compatibility. You can do this by installing the built-in OpenAI Compatible plugin and vendor-specific plugins.
+
+Detailed steps:
+
+1. Install `OpenAI-API-compatible` and `SiliconFlow` plugins to support most LLM and Embedding models. The first supports OpenAI-compatible APIs; the second is a service hub containing many common high-quality open-source models.
+ 1. https://marketplace.dify.ai/plugins/langgenius/openai_api_compatible
+ 2. https://marketplace.dify.ai/plugins/langgenius/siliconflow
+2. If you self-hosted Dify, go to plugin marketplace in system settings and install there.
+
+
+
+
+
+After entering plugin marketplace, search plugin names directly.
+
+
+
+3. After installation, configure model providers. In settings -> model providers, you can see all currently supported providers:
+ 
+4. Before use, complete model config first. For OpenAI-API-compatible plugin, click "Add Model" and configure any model. In "Model Type," select whether it is LLM or Embedding, and ensure type is correct.
+ You need model name, endpoint URL, and API key to enable it. If this feels cumbersome initially, you can skip to SiliconFlow key setup or install OpenRouter plugin for easier provider support (ensure your provider account has remaining quota).
+
+ 
+
+ For `SiliconFlow`, just click Setup and configure key to use Embedding/Rerank for testing. You can click "Get your API Key from SiliconFlow" to obtain credentials.
+
+ 
+
+5. After configuration, open model list to inspect supported models. Basic model setup is now complete.
+ 
+
+ It supports most common Embedding and Rerank models:
+
+ 
+
+ If you want to modify Dify's default model set, click `System Model Settings` and update defaults.
+
+ 
+
+## 2.4 Create Your First Dify Knowledge Base
+
+At this point, we created a basic agent, but it still lacks a knowledge base. Click `Knowledge` in the top menu to enter knowledge-base creation.
+
+
+
+Then click `Create Knowledge` on the left to create your first knowledge base.
+
+
+
+On this page, you can upload many file types (PDF, TXT, etc.) to build knowledge. You can upload long text or copy Wikipedia content into TXT and upload. In this example we upload an Elon Musk Wikipedia TXT file.
+
+After clicking Next, you enter Knowledge Base Settings. There are many options, so let us walk through step by step.
+
+First, the **General** settings contain the text chunking rules. Because long text must be split into smaller chunks, we define the chunking strategy first. At the entry level, focus only on **maximum chunk length**. Try 512, 2048, or 4096, and click **Preview Chunk** to compare the effects.
+
+You can also adjust **Chunk overlap**. It controls whether adjacent chunks preserve overlapping content. Proper overlap helps avoid splitting critical information across chunks in a way that harms comprehension.
+
+
+
+There is also **Chunk using Q&A format in English**. When enabled, the system uses LLM to convert part of knowledge into Q&A format before storage, which can significantly improve retrieval in some scenarios.
+
+In real business, selecting chunk strategy according to scenario greatly affects retrieval quality and whether returned content matches expectations.
+
+Scroll down for Embedding model settings.
+
+Simple explanation: Embedding models convert unstructured data (text, images, etc.) into machine-understandable numeric vectors. This enables rapid similarity computation and semantic matching, such as retrieving documents/images/products closest in meaning to user input.
+
+Embedding choice significantly affects retrieval quality (accuracy, latency, etc.). Here we recommend starting with Qwen 0.6B Embedding. You can switch to 4B or 8B and compare parameter-scale impact.
+
+
+
+You will also see **Rerank model**, default **Jina-rerank-m0**. (If you are outside campus environment, you may see missing Rerank model errors. In that case configure rerank model in model provider settings first.)
+
+Rerank's purpose is second-stage fine sorting over initial candidates, moving results most aligned with user intent to top positions, improving relevance and UX.
+
+Simple intuition: rerank solves the problem that first-stage retrieval is not refined enough. A search engine may first retrieve 1,000 candidate pages using simple rules, then rerank them and show only the top 10 on page one. Recommenders work similarly: from 500 candidate items, reranking promotes the ones most likely to convert.
+
+
+
+After settings are complete, click **Save & Process** to start vectorization. Embedding models transform chunked text into vectors at this stage.
+
+
+
+After processing finishes, click **Go to document** to inspect processed/stored KB content.
+
+
+
+Click KB name directly to view each chunk detail.
+
+
+
+You can precisely edit or delete unsuitable chunks here.
+
+
+
+In left sidebar, choose **Retrieval Testing** to test recall and verify retrieval quality. Each test returns several highest-similarity chunks.
+
+
+
+If you want more retrieved chunks, click `VECTOR SEARCH` settings:
+
+
+
+
+
+Top K means number of most similar text chunks returned from vector search. Current value 3 means top 3 chunks are returned.
+
+Score Threshold is a minimum score filter: only chunks with similarity score >= threshold (for example 0.5) are returned, filtering low-relevance content for higher precision.
+
+Now the KB setup is complete. Next, click `Studio` in the top menu, find the agent we created earlier, and connect this KB.
+
+
+
+
+
+In each chat round, you can now see cited knowledge sources in the response. Click entries to inspect retrieved text chunks.
+
+
+
+
+
+## 2.5 More Common Dify Operations
+
+After mastering basic chatbot + KB setup, we can go deeper into common Dify operations.
+
+### 2.5.1 Workflow Import and Export
+
+Remember the intermediate representation mentioned earlier? Dify supports importing/exporting workflows in DSL (Domain Specific Language) format. A Dify DSL file is a YAML-based standardized representation that preserves node structure, links, and config parameters. You can easily export/import DSL files to share workflows or study others' designs.
+
+In practice, you can find import entry on workflow workspace:
+
+
+
+For export, click the lower-right corner of a workflow block to find export action:
+
+
+
+Using DSL makes migration/sharing of complex workflows across Dify instances straightforward.
+
+### 2.5.2 Explore More Dify Projects
+
+If your own workflow feels too simple, Dify provides rich sample projects for learning more advanced application construction. These examples cover many business scenarios. Click Explore to view workflows built by others.
+
+
+
+## 2.6 Create Your First Dify Workflow App
+
+After starting with chatbot-style agents, we now build more complex business workflows. Workflow is Dify's core method for visualizing complex business logic. You can directly observe data flow between nodes, where decision logic is placed, where human intervention points are set, and how final business outcomes are produced.
+
+You can create from blank or from templates. Here we demonstrate creating from blank:
+
+
+
+
+
+Here you will see Chatflow and Workflow. How do you choose? Decide based on whether your core need is continuous conversation or task pipeline execution.
+
+Chatflow is designed for dialogue. It simulates a conversational entity with memory and context continuity, ideal for multi-turn interactions and stateful sessions. For customer support, it can handle follow-up questions coherently. Streaming output also feels more natural. If you need an agent that "converses," choose Chatflow.
+
+Workflow focuses on automated process execution. It acts like a predefined pipeline for one-off inputs, multi-step processing, and deterministic outputs. For example daily report generation, batch file processing, or chained API calls. These tasks are usually event-triggered and not real-time conversational. If your need is "automation," choose Workflow.
+
+To avoid mismatched architecture, evaluate with four questions:
+
+1. Does the process require repeated user input/adjustment?
+2. Does output need stepwise/streaming presentation?
+3. Does logic strongly depend on previous interaction history?
+4. Is the task event-triggered and mostly one-shot input/output?
+
+If the answer to the first three questions is yes, Chatflow is ideal (customer support, tutoring, creative collaboration). If the fourth dominates, Workflow is a better fit (data cleaning, report generation, batch processing).
+
+Here we choose Chatflow for demonstration and enter workspace:
+
+
+
+Quick interface tour: the center canvas is where you visually build app logic. A basic workflow usually starts at `START` (input), passes data through links into `LLM`, and outputs through `ANSWER`. Each node is a function module; links determine execution order.
+
+Around the canvas are management controls. Top area includes global actions like `Preview` (test) and `Publish` (release). Canvas corners include zoom/undo and other view controls.
+
+Left panel contains app-management areas. `Orchestrate` is for flow design. After building, use `API Access` for integration credentials. `Logs & Annotations` records execution traces for debugging. `Monitoring` provides runtime status/performance visibility.
+
+You can type simple prompt instructions in Chatflow LLM node SYSTEM, run Preview, and verify behavior changes as expected.
+
+### 2.6.1 Common Node Types
+
+Dify provides many node types. First understand each node's role. For practical usage, test directly, learn from templates, or ask a model with screenshots about parameters and usage. A good beginner tactic: replace nodes in existing templates and infer best practices from known working patterns.
+
+Right-click canvas and choose `Add Node`, or inspect all available nodes from side panel:
+
+
+
+You can also open tool selection panel to view callable tool categories:
+
+
+
+Below is a brief intro to common nodes/tools. You do not need to master all at once. Keep a basic mental map and learn progressively in practice.
+
+1. LLM and reasoning nodes
+
+
+
+
+
+These nodes are core processing components:
+
+- LLM node: core compute unit that calls an LLM. Key focus is prompt engineering and parameter tuning to map business tasks into executable model instructions.
+- Knowledge Retrieval node: retrieves relevant information from configured KBs or external authoritative sources to support LLM and reduce hallucination risk.
+- Answer node: output unit that formats processed content into final business-ready result (response template, formatting spec, etc.).
+- Agent node: advanced decision unit. Beyond model call, it can do multi-step planning and dynamic tool selection, suitable for complex task chains.
+- Question Classifier node: classifies user input by intent/topic and routes to appropriate downstream paths (different prompts/toolchains per category).
+
+2. Logic and flow-control nodes
+
+
+
+These nodes define execution path/rules:
+
+- Condition node (`IF/ELSE`): Boolean-based branching. Key is strict condition design that covers business cases comprehensively.
+- Iteration node: stateless batch-parallel processing, best when sub-tasks have no interdependency (batch translation, parallel review, multi-report generation). It takes input array, slices elements, runs same chain in parallel. Use `{{item}}` for current element and `{{index}}` for index. Outputs aggregate back to array. Configure parallelism to balance speed/load; configure retry/failure handling for reliability.
+- Loop node: stateful recursive iterator, best when each round depends on previous output (parameter tuning loops, iterative content polishing, chained dependent calculations). Core is state variable management: initialize before loop, update each round, and define strict stop conditions (max rounds, quality threshold, external stop signal) plus timeout and exception path to avoid infinite loops.
+
+3. Data operation and integration nodes
+
+
+
+- Code node: executes custom logic for data transform, complex computation, etc. Focus on syntax correctness and runtime compatibility.
+- Template node: fills dynamic data into templates (custom copy/report skeleton). Focus on template syntax and variable mapping.
+- Variable Aggregator node: collects outputs from multiple nodes into a unified dataset. Focus on scope and merge rules.
+- Doc Extractor node: extracts text/tables from PDF/Word and converts into structured processable data.
+- Variable Assigner node: defines/initializes/updates workflow variables for data passing.
+- Parameter Extractor node: extracts structured parameters from user/API inputs (regex/JSON path, etc.).
+- HTTP Request node: sends external API requests (GET/POST, etc.) for system integration.
+- List Operator node: filters/sorts/splits list data to match downstream structure.
+
+### 2.6.2 Common Tools
+
+
+
+In Dify, most tools can be used directly as canvas nodes and connected like other nodes. As long as your input matches expected parameters, the tool runs and outputs results for downstream processing.
+
+From side panels, you can inspect available tool nodes and extend capabilities through plugin marketplace. A few common tool categories:
+
+- Web search tools
+ - Tavily Search is a common representative, providing AI-optimized real-time factual retrieval.
+ - It returns structured results (title/summary/link, etc.), suitable for injecting into LLM prompts for latest-info and evidence-required answers.
+- Data processing tools
+ - For example JSON Process plugin supports querying/filtering/transform/merge on JSON data.
+ - Useful when handling complex API responses and nested data, reducing repeated manual parsing code in Code nodes.
+- Format processing tools
+ - For example Markdown Exporter can export generated content into target formats (Markdown, custom templates, etc.) for display/reporting/system integration.
+
+You can view install counts and descriptions in tool list. At the beginning, prioritize "Featured/Recommended" tools because they cover common scenarios.
+
+Tool usage can still be complex. A practical shortcut is to search official workflow DSL examples for each tool and import directly, which is often much faster than building everything from scratch.
+
+### 2.6.3 Build a Simple Intent Classification Workflow
+
+Now that we understand Dify workflow/tool basics, we need hands-on practice. Without practice, details never become fluent. We need a realistic business scenario.
+
+For example, in real food-ordering chat scenarios, user input is never clean parameters. Some users place orders, some complain, some chat casually, some go off topic. If all these inputs are sent to one shared LLM path, two common issues appear:
+
+1. Unstable response style
+ Same complaint may get an apology in one run but an excuse in another. Same order may trigger missing-info follow-up in one run but hallucinated order details in another.
+2. Uncontrollable business logic
+ You want "complaints must start with apology," but model may not always comply. You want "off-topic queries should be redirected," but model may continue chatting off-domain.
+
+A more engineering approach is standardized pipeline decomposition:
+intent classification first (determine what user wants), then intent-based routing (different prompts/roles per scenario), then unified output packaging from routed branches (for frontend/system integration).
+
+Goal: handle multiple dialogue types in a food-service scenario. Follow once to build familiarity.
+
+First define intents:
+
+- **buy_food**: user shows clear purchase/order intent.
+ - Example: "Give me one fried chicken and one cola."
+- **complain**: user expresses dissatisfaction/anger/complaint.
+ - Example: "Why is it so slow? I've waited for an hour."
+- **chitchat**: user asks open recommendations without explicit order command.
+ - Example: "What should I eat today? Any recommendations?"
+- **other**: irrelevant to food-ordering scenario.
+ - Example: "Help me write a funny social post."
+
+For these four intents, predefine four communication personas via four dedicated LLM nodes:
+
+- **LLM_BuyFood**: professional and efficient. Confirm order details and proactively complete missing information.
+- **LLM_Complain**: empathetic and calm. First soothe user and provide clear resolution steps.
+- **LLM_Chitchat**: relaxed and friendly. Provide personalized recommendations and guide potential conversion.
+- **LLM_Other**: polite and boundary-aware. Redirect off-topic conversations back to core business.
+
+#### Workflow Orchestration Design
+
+Now define node architecture. Beginners often do not know what nodes to use (and even advanced users often ask models for first-pass design because it is fast). Core structure:
+
+- Start: data entry node receiving raw input `user_text`.
+- Question Classifier: "brain + dispatcher." It analyzes `user_text` and outputs one of four intent labels.
+- Condition: "routing valve." It forwards flow based on classifier label to the corresponding handling branch.
+- Four branch LLM nodes (`LLM_BuyFood`, `LLM_Complain`, `LLM_Chitchat`, `LLM_Other`): only the branch selected by routing runs; it receives the original question and responds according to its own SYSTEM prompt persona.
+- Variable Aggregator: after branch processing, aggregate the one activated branch output into unified variable `final_reply` for stable output structure.
+- Output: final structured output (for example JSON) including intent, original query, and reply, suitable for downstream integration/debugging.
+
+#### Workflow Orchestration Implementation
+
+In this tutorial we choose Workflow (not Chatflow). Select User Input:
+
+
+
+Then click Start -> User Input and define a string variable `user_text` as global flow input source.
+
+
+
+Save and click Test Run (top right). You will be prompted to provide test text.
+
+
+
+Next click `+` after input node and add Question Classifier. Configure four labels, each with clear description and examples:
+
+- `buy_food`: user clearly wants to buy/order food.
+- `complain`: user is complaining/angry, usually with dissatisfaction.
+- `chitchat`: user is chatting, discussing what to eat, asking recommendations.
+- `other`: irrelevant to food scenario or hard to classify.
+
+Also set prompt in ADVANCED SETTING for classification behavior. Example prompt:
+
+```text
+Choose the most appropriate label from buy_food / complain / chitchat / other.
+If user both complains and orders, prioritize core emotion: if dissatisfaction is primary, classify as complain.
+If complaint is minor and primary intent is ordering, classify as buy_food.
+If truly hard to determine, use other as fallback.
+```
+
+
+
+After setup, use top-right play icon on this node to test classification.
+
+
+
+
+
+From OUTPUT we can see classification is accurate. Test multiple input types to verify classifier stability.
+
+Next connect classifier to downstream LLM branches. For example, when `label == "buy_food"`, route to `LLM_BuyFood`.
+Create four LLM nodes and set different SYSTEM prompts:
+
+- LLM_BuyFood (ordering assistant):
+
+ You are an ordering assistant. Requirements:
+ 1. Confirm what user wants to order.
+ 2. If info is incomplete, ask follow-up questions politely.
+ 3. Keep tone polite and concise.
+
+- LLM_Complain (support specialist):
+
+ You are a food-service customer support specialist handling complaints. Requirements:
+ 1. Apologize sincerely.
+ 2. Briefly explain likely reasons (no blame shifting).
+ 3. Provide clear next-step resolution.
+
+- LLM_Chitchat (chat companion):
+
+ You are a casual food recommendation assistant. Requirements:
+ 1. Use relaxed friendly tone.
+ 2. Give 1-3 simple recommendations.
+ 3. If no preference, provide options with different styles.
+
+- LLM_Other (polite gatekeeper):
+
+ You are a food-ordering assistant focused only on food topics. For irrelevant user input:
+ 1. Politely explain scope.
+ 2. Guide user back to core scenario.
+
+Important: in each node, after setting SYSTEM prompt, enable USER prompt variable mapping. Click `{x}`, choose `user_text` as user input variable, and prepend `user input:` to indicate source semantics. During response generation, model uses both initial user input and system prompt.
+
+As always, click node-level play icon to test with sample input such as "I want bubble milk tea" and verify behavior.
+
+
+
+Next process parallel branch outputs. In `Variable Aggregator`, find `ASSIGN VARIABLES` and add branch outputs one by one.
+
+
+
+Now aggregate final output including user input, intent, and reply. Because this is Workflow (not Chatflow), there is no Answer node for this exact structure, so we can use Template node for equivalent output packaging. In variable area specify intent result, user input, and aggregator final reply. In CODE, write final JSON template:
+
+- `intent` <- `class_name`
+- `original_text` <- `user_text`
+- `final_reply` <- `variable_aggregator`
+
+```text
+{
+ "intent": "{{ intent }}",
+ "original_text": "{{ original_text }}",
+ "reply": "{{ final_reply }}"
+}
+```
+
+
+
+Finally add Output node and all setup is complete.
+
+
+
+#### Workflow Runtime Testing
+
+Done. Now run this workflow and observe distinct behavior for different inputs:
+
+- Input (ordering): "Give me one spicy chicken burger combo and a large cola."
+ - Path: `buy_food` -> `LLM_BuyFood`
+ - Output reply: "Sure, one spicy chicken burger combo and a large cola are noted. Would you like to swap fries in the combo?"
+- Input (complaint): "Why are you so slow? I've waited more than an hour!"
+ - Path: `complain` -> `LLM_Complain`
+ - Output reply: "We are truly sorry for the long wait. This was our mistake and caused you a poor experience. We are urgently checking your delivery status and will arrange compensation. Thank you for your patience and feedback."
+- Input (chitchat): "What is healthier to eat today?"
+ - Path: `chitchat` -> `LLM_Chitchat`
+ - Output reply: "If you prefer healthier options, you can try our light salad series or grilled chicken breast with vegetables. Do you prefer lighter taste or richer flavor? I can recommend more specifically."
+- Input (irrelevant): "Help me think of a joke for tomorrow's meeting."
+ - Path: `other` -> `LLM_Other`
+ - Output reply: "That sounds fun, but I mainly specialize in food recommendations and ordering. If you want to order something to reward your hard work, I can help anytime."
+
+> Hidden bug: if you encounter strange `aggregation group` issues, it is likely a built-in Dify bug. It can be triggered by certain operation sequences. If AGGREGATION GROUP was enabled and then disabled, residual group config may remain and cause errors (for example involving `any` params) even when switch appears off. Solution: delete this node and recreate it.
+
+After running in Test Run, you can inspect full execution path. It should follow correct branch and output expected final result. Full flow complete.
+
+
+
+## 2.7 Run Your First Template Workflow App
+
+After the simple classification workflow, next learn how to run workflows created by others. Usually you only need small modifications to turn them into your own. Here we use official DeepResearch workflow as example. It builds a deep-search framework using LLM + search engine and returns rich answers with citations and model-generated synthesis.
+
+After importing, first run directly. Then fix each error step by step based on failing node and cause. If stuck, screenshot and ask a model for debugging help.
+
+
+
+At first glance it may feel complex. That is okay. Click `Preview` on top right and run until first error appears:
+
+
+
+
+
+Troubleshoot the failing node. In this case Tavily API token was missing. Tavily Search is an AI-native search API providing real-time accurate factual results. Follow prompt to configure:
+
+
+
+After fixing it, search engine works normally:
+
+
+
+Then fix model-call issues as needed. You should be able to get results like this with model-understood synthesis:
+
+
+
+At the end, you can inspect referenced source links:
+
+
+
+If you want to understand each step deeply, best method is saving each node output into intermediate variables and printing all variables at final output. Another way: open `Process` view at top and inspect detailed per-step execution.
+
+
+
+## 2.8 Use Dify as an API Provider
+
+Next we call the knowledge-base agent via API and turn Dify into a model-hub backend.
+
+Recall how to call model APIs: prepare key + request/response examples from documentation, feed these to an LLM coding assistant, and ask it to generate invocation code and parse desired fields from responses.
+
+This time we use local code editor [Trae](https://www.trae.cn/).
+
+If you are not familiar with IDE concepts, read:
+[Extra Knowledge 4 - What is AI IDE and Trae](https://github.com/datawhalechina/easy-vibe/blob/main/docs/extra/extra4/extra4-what-is-ai-ide-and-trae.md)
+
+If your local environment is not fully configured, do not worry. If you trust your coding assistant (whether [z.ai](http://z.ai) or Trae), you can directly send any issue/errors and it will provide resolution guidance.
+
+
+
+The right panel is Copilot/Agent interaction window. If not visible, click top-right sidebar icon to open.
+
+
+
+After opening sidebar, you will see `Builder` option. This is Agent mode. You can roughly treat "Builder" as the "development mode" of [z.ai](http://z.ai): it can help with local environment operations, dependency installs, opening webpages, etc.
+
+
+
+Inside Builder, there are "Chat" mode and "Builder with MCP" mode.
+Chat mode mainly interacts with current folder and natural-language model chat.
+(Open a folder from Trae top-left `File`, then Builder file operations occur inside that folder.)
+
+Builder with MCP gives Agent more tools (for example connecting to other software, retrieving weather, etc.). You can treat MCP as a capability layer that makes external tool invocation easier for models.
+
+
+
+At the bottom, there is model selection dropdown. You can choose Kimi k2 or GLM. In international Trae, you can select ChatGPT or Claude as well. With fast progress of domestic models, Kimi/Qwen/GLM are now close to Claude 3.5/3.7 for daily dev scenarios.
+
+
+
+That is a brief Trae intro. Next we reuse operational ideas from [z.ai](http://z.ai) inside Trae.
+
+## 2.9 Build a Frontend Chat App Using Dify API
+
+To build a frontend chat app with Dify API, first obtain Dify API docs and endpoint.
+
+Remember the agent we created? Click top-right `Publish`, then `Publish Update`, then `Access API Reference`.
+
+
+
+
+
+In API docs, find `Send Chat Message`, open it, then copy `Request` and `Response` examples on the right.
+
+Why copy these two parts? Because they are core API information. With key + request example + response example, you can ask model to generate invocation code and parse required fields from returned structure.
+
+
+
+
+
+After finding request/response examples, you also need API key. In top-right docs area, find `API key` options.
+
+
+
+Click `Create new Secret key` to create your own key.
+
+
+
+Now everything is ready. Send API key + request example + response example to Trae Builder.
+
+Note: replace `{DIFY_API_URL}` with your actual Dify API URL.
+
+```text
+key:
+app-zKdCHUXXXXXXXX
+
+Please write me a front-end based on the following reference:
+
+curl -X POST 'http://{DIFY_API_URL}/v1/chat-messages' \
+--header 'Authorization: Bearer {api_key}' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+ "inputs": {},
+ "query": "What are the specs of the iPhone 13 Pro Max?",
+ "response_mode": "streaming",
+ "conversation_id": "",
+ "user": "abc-123",
+ "files": [
+ {
+ "type": "image",
+ "transfer_method": "remote_url",
+ "url": "https://cloud.dify.ai/logo/logo-site.png"
+ }
+ ]
+}'
+
+{
+ "event": "message",
+ "task_id": "c3800678-a077-43df-a102-53f23ed20b88",
+ "id": "9da23599-e713-473b-982c-4328d4f5c78a",
+ "message_id": "9da23599-e713-473b-982c-4328d4f5c78a",
+ "conversation_id": "45701982-8118-4bc5-8e9b-64562b4555f2",
+ "mode": "chat",
+ "answer": "iPhone 13 Pro Max specs are listed here:...",
+ "metadata": {
+ "usage": {
+ "prompt_tokens": 1033,
+ "prompt_unit_price": "0.001",
+ "prompt_price_unit": "0.001",
+ "prompt_price": "0.0010330",
+ "completion_tokens": 128,
+ "completion_unit_price": "0.002",
+ "completion_price_unit": "0.001",
+ "completion_price": "0.0002560",
+ "total_tokens": 1161,
+ "total_price": "0.0012890",
+ "currency": "USD",
+ "latency": 0.7682376249867957
+ },
+ "retriever_resources": [
+ {
+ "position": 1,
+ "dataset_id": "101b4c97-fc2e-463c-90b1-5261a4cdcafb",
+ "dataset_name": "iPhone",
+ "document_id": "8dd1ad74-0b5f-4175-b735-7d98bbbb4e00",
+ "document_name": "iPhone List",
+ "segment_id": "ed599c7f-2766-4294-9d1d-e5235a61270a",
+ "score": 0.98457545,
+ "content": "\"Model\",\"Release Date\",\"Display Size\",\"Resolution\",\"Processor\",\"RAM\",\"Storage\",\"Camera\",\"Battery\",\"Operating System\"\n\"iPhone 13 Pro Max\",\"September 24, 2021\",\"6.7 inch\",\"1284 x 2778\",\"Hexa-core (2x3.23 GHz Avalanche + 4x1.82 GHz Blizzard)\",\"6 GB\",\"128, 256, 512 GB, 1TB\",\"12 MP\",\"4352 mAh\",\"iOS 15\""
+ }
+ ]
+ },
+ "created_at": 1705407629
+}
+```
+
+
+
+At this stage, generated code may not run perfectly in one shot. You may see strange errors or no responses. If that happens, switch model or copy full error details and ask model to iterate based on feedback.
+
+This working style is already close to real development. In daily collaboration with models, you often need to provide more context to solve issues. Besides error messages, you can copy more doc context (for example from "Send message" docs section) and send together for higher-quality fixes.
+
+
+
+The browser is embedded inside Trae. Click the compass icon at top to open full screen in external browser.
+
+
+
+If you are lucky, first attempt may already yield a functional interactive frontend page.
+
+
+
+Because LLMs are stochastic, a single round may work while multi-turn chat fails. So always do multi-round testing to verify stability in conversational scenarios.
+
+
+
+At this point, you can build a simple Dify knowledge-base agent and use Trae (instead of [z.ai](http://z.ai)) to build an interactive frontend. From now on, Trae will become our primary prototyping tool, gradually replacing [z.ai](http://z.ai). You can try re-implementing the snake game in Trae and compare the experience. Keep going.
+
+# 3. More Business Workflow References
+
+You can search engines with keywords like `Dify workflow reference`, or find workflow-sharing repositories on GitHub. Quality varies, so compare multiple sources. Remember, workflow is essentially mapping business SOP into executable process. Think about repeated workflows in your daily work or learning that can be solidified.
+
+Below are AI-generated workflow design references (real implementations are often similar; high-quality human-crafted workflows still require skill). If any idea interests you, send it to a model for deeper refinement into concrete Dify node design and configuration details.
+
+## 3.1 Social Media Platform Workflows
+
+1. One-click cross-platform content distribution workflow (complex)
+ 1. Idea: treat one core draft as "raw material," automatically produce platform-adapted variants.
+ 2. Implementation: `Start` article input -> `LLM` polish -> parallel `LLM` nodes for platform experts (for example Xiaohongshu viral copy expert, Zhihu professional answerer) -> `Iteration` for platform format rules -> `Variable Aggregator` merge -> `Answer` output all versions.
+2. Hot-topic planning and first-draft generator (medium)
+ 1. Idea: automatically capture trends and quickly generate topic suggestions and drafts.
+ 2. Implementation: `Start` keyword -> `Tool` search API for trend data -> `LLM` extract 3-5 topics -> `LLM` generate outline/draft.
+3. Comment-section intelligent classification and reply assistant (complex)
+ 1. Idea: classify comment sentiment/intent and generate categorized reply suggestions.
+ 2. Implementation: `HTTP Request` to fetch comments -> `Question Classifier`/`LLM` multi-label classification (positive/question/complaint/spam) -> `Condition` routing -> parallel `LLM` reply drafting -> `Answer`.
+4. Short-video script and storyboard auto generator (complex)
+ 1. Idea: given trend topic/product description, auto-generate script, storyboard, and recommended tags.
+ 2. Implementation: `Start` topic -> `LLM` script ideation -> second `LLM` scene decomposition (visuals/dialogue/duration) -> `Tool` TTS sample generation -> `Variable Aggregator` merge -> `Answer` structured script.
+5. Live-stream interaction QA summarizer (medium)
+ 1. Idea: process live comments in near real time and summarize key questions/audience sentiment.
+ 2. Implementation: `HTTP Request` streaming comments -> `Iteration` windowed batches -> `LLM` per-window trend summary -> `Answer`/`Webhook` output to host.
+
+## 3.2 Workplace Workflows
+
+1. Intelligent meeting minutes and task auto-assignment system (complex)
+ 1. Idea: extract minutes from transcript and auto-create tasks.
+ 2. Implementation: `Start` meeting text -> `LLM` agenda/conclusion summary -> `Parameter Extractor` action items (task/owner/deadline) -> `LLM` format minutes email -> parallel `HTTP Request` Jira/Trello/Feishu task creation.
+2. Batch resume screening and initial evaluation assistant (medium)
+ 1. Idea: parse resumes, evaluate fit, and generate interview questions.
+ 2. Implementation: `Start` upload resumes + JD -> `Document Extractor` parse text -> `LLM` HR-style matching evaluation -> for high matches, another `LLM` generates deep interview questions.
+3. One-click multilingual email translation and draft reply (simple)
+ 1. Idea: auto-translate incoming email and draft response.
+ 2. Implementation: `Start` email -> `LLM` language detection + translation -> `LLM` reply points -> `LLM` translate back and polish.
+4. Weekly/monthly report auto aggregation and insight generation (complex)
+ 1. Idea: connect multiple data sources and auto-generate structured report.
+ 2. Implementation: parallel `HTTP Request`/`Tool` calls to CRM/Git/PM APIs -> `Code`/`LLM` data cleaning/calculation -> `LLM` trend/highlight/risk narrative -> `Answer` rich report.
+5. Contract/document intelligent review and key-point extraction (medium)
+ 1. Idea: quickly review legal/business documents, surface risks, and extract key clauses.
+ 2. Implementation: `Start` contract PDF -> `Document Extractor` text extraction -> `LLM` legal-expert clause review -> `Parameter Extractor` dates/amounts/parties extraction -> `Answer` risk summary + key table.
+
+## 3.3 Learning and Life Workflows
+
+1. Academic paper deep analysis and note generator (complex)
+ 1. Idea: upload paper PDF and auto-generate structured notes.
+ 2. Implementation: `Start` PDF -> `Document Extractor` full text -> parallel `LLM` summaries (abstract/method/findings/references) -> `Variable Aggregator` merge -> `Answer` markdown notes.
+2. Personalized travel planner (medium)
+ 1. Idea: auto-plan detailed itinerary from user preferences.
+ 2. Implementation: `Start` destination/days/budget/interests -> `Tool` search/map APIs -> `LLM` daily itinerary with schedule/activities/budget estimates.
+3. Interactive foreign-language speaking partner (simple)
+ 1. Idea: role-play dialogue bot with grammar correction.
+ 2. Implementation: system role setup -> `Start` user utterance -> `LLM` dual tasks (role reply + grammar correction/explanation) -> `Answer`.
+4. Personal knowledge-base QA and related-link recommender (complex)
+ 1. Idea: build a QA system over your saved docs/notes/links with related old-knowledge recommendations.
+ 2. Implementation: offline indexing with `Document Extractor` + `Embedding`; online flow: `Start` question -> `Retrieval` from vector store -> `LLM` context-grounded answer; parallel branch uses retrieved content and `LLM` to produce related-old-knowledge list -> `Answer` merged output.
+5. Fitness/diet tracking and adjustment advisor (medium)
+ 1. Idea: analyze daily diet/training logs and output nutrition/training suggestions.
+ 2. Implementation: `Start` text log (for example lunch + training record) -> `Parameter Extractor` structure parsing -> `LLM` fitness-coach analysis of nutrition/training volume -> compare with long-term goals -> micro-adjustment suggestions.
+
+# 4. Limitations of Workflow Platforms
+
+Workflow (low-code) platforms are not universal solutions. They are business-friendly and lower direct coding threshold, but from another angle, "low code" can also be "high code": users still need to understand platform concepts, rules, and operation logic. That itself is a learning cost.
+
+You may ask: many simple workflows are just chained function calls around model APIs. In code, a few lines may solve it. Why use heavy visual wrappers and make API calling more cumbersome?
+
+That point is valid. With rapid vibe-coding progress and AI code generation, directly reading or generating code can sometimes be more efficient. Ideally, we should be able to manipulate application logic directly in natural language. But current workflow platforms still have an unavoidable "middle layer" between user intent and final implementation. Learning this middle layer takes time. Ideally, future platforms should support full AI dialogue-driven operation for both workflow construction and parameter-level control.
+
+Even so, becoming proficient in these platforms is increasingly a foundational skill, similar to office software: widely used and practically valuable in business contexts.
+
+In later advanced courses, we will introduce code-level workflow and RAG development platforms, where you can compare complexity/flexibility tradeoffs across implementation styles. (Also note that many simple dialogue apps and nested logics are still straightforward in workflow form.)
+
+# 📚 Homework
+
+## Master Basic Dify Operations
+
+To verify you understand common Dify operations, complete one basic assignment plus two mini-challenges:
+
+You need to import the two provided DSL files into Dify workflows and complete the corresponding challenges successfully (if confused, screenshot and ask a model, or explore each parameter yourself until target behavior is reached):
+
+1. Based on the intent-classification workflow approach, ask a model to suggest a completely different scenario, but you must still use intent classification workflow. Submit workflow runtime screenshot, scenario description, and result.
+2. `Log in workflow` decryption challenge:
+
+In this challenge, make workflow support:
+
+- Find the correct password.
+- Change password to `0925`.
+- Provide a second attempt when password is wrong (no third attempt).
+- When user asks to log in again, allow password re-entry.
+
+
+
+Reference input/output:
+
+
+
+3. `Love loop workflow` decryption challenge:
+
+
+
+Fix current workflow issues so final output looks similar to:
+
+
+
+If you cannot solve a problem, screenshot and ask a model, or check official docs:
+[https://docs.dify.ai/en/use-dify/getting-started/quick-start](https://docs.dify.ai/en/use-dify/getting-started/quick-start)
+
+## Implement Dify API Invocation
+
+To verify you truly mastered Dify API usage, complete:
+
+1. Deploy Dify and create a simple knowledge base (choose any materials you like).
+2. Build a chat frontend in Trae IDE and integrate Dify knowledge base via API.
+3. Test multi-turn dialogue behavior and ensure program runs normally.
+
+Submit final runtime screenshots and KB processing screenshots.
+
+## Try Third-Party Workflow / Build Your Own Business Workflow
+
+Find a Dify workflow shared by others on GitHub, WeChat public articles, Reddit, X, etc., import and run successfully; or build your own workflow from business references above based on real needs.
+
+Finally submit successful runtime screenshot and explain workflow purpose.
+
+# [Bug] How to Fix HTTP Request Errors
+
+Only refer to this section if you encounter the issue shown below. Otherwise you can ignore this part.
+
+If you deploy Dify on your own server, its public endpoint may support only HTTP (not HTTPS). When your app requests such an HTTP-only service, you may see errors like the one below (open the browser's F12 developer tools to inspect the details):
+
+
+
+Root cause: Dify is deployed on a server that supports HTTP but not HTTPS.
+HTTPS (HyperText Transfer Protocol Secure) adds SSL/TLS encryption over HTTP, basically a more secure HTTP.
+
+To support HTTPS, common options are:
+
+- Forward requests through another service (for example reverse proxy on certificate-enabled nginx), or
+- Bind domain and issue TLS certificate.
+
+These are relatively complex, so here we use Zeabur as network forwarding gateway.
+
+Zeabur pages are accessed via HTTPS by default. So if your app calls a Zeabur domain that forwards each request to the original HTTP address, the issue is fixed.
+
+- Original URL: `http://{DIFY_API_URL}/v1/chat-messages`
+- New URL: `https://{DIFY_NEW_API_URL}.zeabur.app/v1/chat-messages`
+
+You only need to replace URL domain (public IP/domain) with your deployed Zeabur domain. Forwarding is preconfigured in service.
+
+If interested, you can deploy your own forwarding service on Zeabur. Create a Python service and use the following code. After deployment you get an HTTPS endpoint that works normally.
+
+After deployment, set service listen port to local `8080` and expose this port publicly.
+
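+Note: replace `{DIFY_API_URL}` in `TARGET_BASE_URL` with your actual Dify base URL, including the scheme (for example `http://203.0.113.10`); `requests` rejects a URL without `http://` or `https://`.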
+
+```python
+from flask import Flask, request, Response
+import requests
+
+app = Flask(__name__)
+
+TARGET_BASE_URL = "{DIFY_API_URL}"
+LISTEN_PORT = 8080
+
+@app.route('/', defaults={'path': ''}, methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS', 'HEAD'])
+@app.route('/<path:path>', methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS', 'HEAD'])
+def proxy_request(path):
+ target_url = f"{TARGET_BASE_URL}/{path}"
+ if request.query_string:
+ target_url += f"?{request.query_string.decode('utf-8')}"
+
+ headers = {key: value for key, value in request.headers if key.lower() not in ['host', 'connection', 'content-length', 'accept-encoding']}
+
+ try:
+ resp = requests.request(
+ method=request.method,
+ url=target_url,
+ headers=headers,
+ data=request.get_data(),
+ cookies=request.cookies,
+ allow_redirects=False,
+ timeout=30
+ )
+
+ excluded_headers = ['content-encoding', 'content-length', 'transfer-encoding', 'connection']
+ response_headers = [(name, value) for name, value in resp.raw.headers.items() if name.lower() not in excluded_headers]
+
+ return Response(resp.content, resp.status_code, response_headers)
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error forwarding request to {target_url}: {e}")
+ return Response(f"Proxy Error: Could not reach target server or invalid response: {e}", status=502)
+ except Exception as e:
+ print(f"An unexpected error occurred: {e}")
+ return Response(f"Internal Proxy Error: {e}", status=500)
+
+if __name__ == '__main__':
+ app.run(host='0.0.0.0', port=LISTEN_PORT, debug=True)
+```
diff --git a/docs/en/stage-2/assignments/2.1-fullstack-app/index.md b/docs/en/stage-2/assignments/2.1-fullstack-app/index.md
new file mode 100644
index 0000000..b93da08
--- /dev/null
+++ b/docs/en/stage-2/assignments/2.1-fullstack-app/index.md
@@ -0,0 +1,3 @@
+# Build Your First Modern App: Full-Stack Application
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md b/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md
new file mode 100644
index 0000000..6d6b218
--- /dev/null
+++ b/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md
@@ -0,0 +1,3 @@
+# Assignment 2: Modern Frontend Component Library + Trae Practice
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-2/backend/2.2-database-supabase/index.md b/docs/en/stage-2/backend/2.2-database-supabase/index.md
new file mode 100644
index 0000000..7bb1b4b
--- /dev/null
+++ b/docs/en/stage-2/backend/2.2-database-supabase/index.md
@@ -0,0 +1,1747 @@
+# From Database to Supabase
+
+In the previous lesson, we learned the basics of UI design tools (MasterGo and Figma), how to use GitHub for code retrieval and version control, and how to deploy websites with Zeabur so more people can access our apps.
+
+To make this lesson easier to connect, let's quickly review the previous core points with a few short questions:
+
+1. What are frontend design tools, and how do Figma and MasterGo work?
+2. What are the basic methods for turning design drafts into code?
+3. What is GitHub, how do you configure SSH, and how do you create your first repository?
+4. What does deployment mean, how do you use Zeabur, and how do you deploy GitHub/local code to a public network?
+
+If any of the above still feels blurry, review the previous lesson notes first. You can always ask questions in the WeChat study group.
+
+In this lesson, we move from "an app that can run" to "an app that looks like a real online product." That means not only managing data changes with a database, but also building a complete user system (registration, login, authorization) and other core backend capabilities. We use Supabase as the main path: first implement "database + user system," then use Supabase modules to understand the core components of modern cloud backend services.
+
+# What you will learn
+
+1. What data is, what a database is, and common database usage
+2. What Supabase is and how to do basic database operations with it
+3. How to add basic user management with Supabase
+4. Supabase advanced features: realtime, storage, and edge functions
+5. How to enable Google and GitHub login for Supabase
+
+- A basic app that supports user sign-up/sign-in and stores data in an online database
+- A reusable Supabase backend template (database + user management, etc.) for future projects
+
+# 1. What Is a Database
+
+## 1.1 What is Data
+
+In the digital world, data is everywhere. Data is simply the carrier of information: your friend's contact info, a WeChat article, a short video, a game character level. In apps, data is everything that needs to be recorded and managed: user profiles, order history, app settings, and so on.
+
+In programs, data has different forms. The simplest form is variables:
+
+```python
+# Python variable definition examples
+
+# Integer variable: stores age information
+age = 30
+
+# Boolean variable: stores status (whether active)
+is_active = True # True means active, False means inactive
+
+# List variable: stores a set of score data
+scores = [85, 92, 78, 90] # Contains 4 integer elements representing different scores
+
+# Dictionary variable: stores multiple related information of a user
+user_info = {
+ "age": 30, # Key "age" corresponds to the value of age
+ "height": 1.80, # Key "height" corresponds to the value of height (unit: meter)
+ "login_count": 156 # Key "login_count" corresponds to the value of login times
+}
+```
+
+For more complex data such as user profiles and order history, tables are usually used:
+
+| user_id | name | email |
+| ------- | ----- | ----------------- |
+| 1001 | Alice | alice@example.com |
+| 1002 | Bob | bob@example.com |
+
+| order_id | user_id | amount | status |
+| -------- | ------- | ------ | --------- |
+| 901 | 1001 | 29.99 | completed |
+| 902 | 1002 | 15.50 | pending |
+
+For hierarchical, variable-structure data, JSON is often better. JSON is a universal internet data format that almost all systems can parse. For example, one order may contain multiple items, and each item has its own fields.
+
+```json
+{
+ "order_id": 901,
+ "user_id": 1001,
+ "amount": 29.99,
+ "status": "completed",
+ "items": [
+ { "sku": "BG-001", "name": "Beef Burger", "quantity": 1, "price": 18.00 },
+ { "sku": "SD-003", "name": "French Fries", "quantity": 1, "price": 6.99 },
+ { "sku": "DK-002", "name": "Cola", "quantity": 1, "price": 5.00 }
+ ],
+ "shipping_address": {
+ "street": "123 Tech Park Road",
+ "city": "Shenzhen",
+ "zip_code": "518057"
+ }
+}
+```
+
+There is also vector data. After unstructured data (text/images/audio) is processed by AI embedding models, the output is typically a high-dimensional float array:
+
+`[0.123, -0.456, 0.789, ..., -0.234]`
+
+In real projects, there are many data shapes and many corresponding storage systems:
+
+
+
+## 1.2 Why We Need a Database
+
+Real-world data is complex. To store and use data efficiently, we need a dedicated system to manage it: this is the purpose of databases.
+
+A database is a specialized program that organizes, stores, manages, and queries data safely and efficiently.
+
+Without a database, app data quickly breaks down:
+
+- once users close the browser, in-memory data disappears
+- login state and preferences cannot be persisted
+- key shared data (inventory, orders) cannot be coordinated across users
+
+Databases can be deployed locally or in the cloud. Cloud databases support elastic scaling and can handle high concurrency and larger data volume.
+
+Core problems databases solve:
+
+- **Persistent storage**: data survives app restarts
+- **Efficient query and analysis**: SQL supports filtering, aggregation, analysis
+- **High performance and high concurrency**: indexing, caching, pooling, distributed architecture
+- **Integrity and consistency**: constraints, uniqueness, data validity guarantees
+- **Security and recovery**: authentication, authorization, encryption, backup/restore
+
+## 1.3 Relational Databases vs. Non-Relational Databases (NoSQL)
+
+In practice, you typically choose between relational databases and NoSQL databases.
+
+Relational databases are like strictly structured spreadsheets. You define schema in advance (field types and rules) and connect tables by relational keys. This is highly reliable and great for scenarios such as finance and inventory where correctness is critical, but schema changes can be less flexible.
+
+NoSQL databases are more like flexible containers. They can store documents, key-value data, and changing structures without fixed schema upfront. They are easier to scale for rapidly changing and large-volume internet scenarios, but they trade off some relational query power and strict consistency.
+
+In typical usage:
+
+- relational DBs: transactions, inventory, order systems, accounting, strong consistency
+- NoSQL DBs: social content, logs, IoT high-write streams, recommendation features
+
+In early-stage startups, you usually do not need to over-optimize database type at day one. Mature cloud providers already offer strong defaults. In real business settings, teams usually match business needs with vendor support first, then optimize later.
+
+You can also refer to cloud vendor database selection guides, such as:
+[Aliyun database selection recommendation](https://help.aliyun.com/zh/govcloud/getting-started/select-database-services)
+
+| Database Type | Database | Price | Typical Scenarios |
+| ------------ | ---------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Relational | RDS MySQL | Low | Basic: learning and small websites. HA: medium pressure business scenarios. Cluster: no-interruption and heavier traffic |
+| | RDS SQL Server | High | Basic: testing and small commercial sites. HA: enterprise websites. Cluster: no-interruption enterprise business |
+| | RDS PostgreSQL | Lowest | Basic: learning and small websites. HA: medium business pressure. Cluster: heavy access and often better performance than common MySQL setups |
+| | RDS PPAS | High | General and dedicated enterprise Oracle-compatible scenarios |
+| | DRDS | Medium | Entry to enterprise and high-concurrency online business |
+| NoSQL | Redis | Medium | Hot standby persistent data and cache acceleration under read pressure |
+| | MongoDB | Medium | Single node for dev/test, replica set for read-heavy scenarios, sharded clusters for high-scale online workloads |
+
+Let's use one concrete "blog platform" example to compare SQL and NoSQL storage models.
+
+Assume we need:
+
+- Users: id, username, email
+- Posts: id, title, content, author_id
+- Comments: id, content, commenter_id, post_id
+- Tags: id, name
+- Post-tag many-to-many relationships
+
+### Relational database (SQL) example
+
+In SQL, we normalize entities into separate tables and connect with foreign keys.
+
+- `users` table
+
+| user_id (PK) | username | email |
+| -------------- | -------- | ----------------- |
+| 101 | Alice | alice@example.com |
+| 102 | Bob | bob@example.com |
+
+- `posts` table
+
+| post_id (PK) | title | content | author_id (FK) |
+| -------------- | --------- | ------------------------------ | ---------------- |
+| 1 | SQL Intro | This is an article about SQL... | 101 |
+| 2 | NoSQL Intro | NoSQL provides flexible models... | 102 |
+
+- `comments` table
+
+| comment_id (PK) | body | commenter_id (FK) | post_id (FK) |
+| ----------------- | ---------------- | ------------------- | -------------- |
+| 1001 | Great article! | 102 | 1 |
+| 1002 | Learned a lot. | 101 | 2 |
+| 1003 | Any more examples? | 101 | 1 |
+
+- `tags` table
+
+| tag_id (PK) | tag_name |
+| ------------- | -------- |
+| 51 | database |
+| 52 | technology |
+| 53 | beginner |
+
+- `post_tags` table (many-to-many relation)
+
+| post_id (FK) | tag_id (FK) |
+| -------------- | ------------- |
+| 1 | 51 |
+| 1 | 52 |
+| 2 | 51 |
+| 2 | 52 |
+| 2 | 53 |
+
+To fetch complete post information (post + author + comments + tags), we use multi-table joins:
+
+```sql
+SELECT
+ p.title,
+ p.content,
+ u.username AS author,
+ c.body AS comment,
+ t.tag_name AS tag
+FROM
+ posts p
+JOIN
+ users u ON p.author_id = u.user_id
+LEFT JOIN
+ comments c ON p.post_id = c.post_id
+LEFT JOIN
+ post_tags pt ON p.post_id = pt.post_id
+LEFT JOIN
+ tags t ON pt.tag_id = t.tag_id
+WHERE
+ p.post_id = 1;
+```
+
+This is SQL's strength: flexible complex queries with consistency and low redundancy.
+
+### Non-relational database (NoSQL) example
+
+In NoSQL document databases (for example MongoDB), related business data is often aggregated into a single document, reducing joins at read time.
+
+A sample document in `posts`:
+
+```json
+{
+ "_id": 1,
+ "title": "SQL Intro",
+ "content": "This is an article about SQL...",
+ "author": {
+ "user_id": 101,
+ "username": "Alice",
+ "email": "alice@example.com"
+ },
+ "tags": [
+ "database",
+ "technology"
+ ],
+ "comments": [
+ {
+ "comment_id": 1001,
+ "body": "Great article!",
+ "commenter": {
+ "user_id": 102,
+ "username": "Bob"
+ }
+ },
+ {
+ "comment_id": 1003,
+ "body": "Any more examples?",
+ "commenter": {
+ "user_id": 101,
+ "username": "Alice"
+ }
+ }
+ ]
+}
+```
+
+The advantage is obvious: one lookup can return full business context.
+
+The trade-off is data redundancy. If `username` changes, many documents may need updates. In read-heavy scenarios (blogs, product pages), this trade-off is often acceptable for faster reads. In write-heavy scenarios, you need careful design trade-offs.
+
+If you want to explore more databases:
+
+Examples of SQL databases:
+[Db2](https://www.ibm.com/products/db2-database), [MySQL](https://www.mysql.com/), [PostgreSQL](https://www.ibm.com/think/topics/postgresql), [YugabyteDB](https://www.yugabyte.com/), [CockroachDB](https://www.cockroachlabs.com/), [Oracle Database](https://www.oracle.com/database/), [Azure SQL Database](https://azure.microsoft.com/products/azure-sql/database)
+
+Examples of NoSQL databases:
+[Redis](https://www.ibm.com/think/topics/redis), [CouchDB](https://www.ibm.com/think/topics/couchdb), [MongoDB](https://www.ibm.com/think/topics/mongodb), [Cassandra](https://cassandra.apache.org/), [Elasticsearch](https://www.ibm.com/think/topics/elasticsearch), [BigTable](https://www.techtarget.com/searchdatamanagement/news/252512583/Google-scales-up-Cloud-Bigtable-NoSQL-database), [Neo4j](https://neo4j.com/), [HBase](https://www.ibm.com/think/topics/hbase)
+
+# 2. Supabase
+
+Above, we discussed database categories and usage. But in real projects, a database is only one backend module. You also need sign-in/sign-up, permissions, file upload/storage, APIs, scheduled jobs, realtime notifications, and more.
+
+That broader context is **backend services**. A complete app is usually frontend + backend. In traditional workflows, teams had to build servers, configure databases, design APIs, implement security, and maintain operations manually.
+
+To reduce repeated backend groundwork, the industry created **BaaS (Backend as a Service)**: package common backend capabilities (DB/auth/storage/realtime, etc.) as cloud services that developers can call directly via SDK/API.
+
+[Supabase](https://supabase.com/) is a modern BaaS representative. It uses PostgreSQL as the core and integrates Auth, Storage, Realtime, Edge Functions, Vector, and more into a "Postgres-centered one-stop backend platform."
+
+Next, we move from "choosing only a database" to "choosing a complete backend development platform."
+
+## 2.1 Step by Step Guide
+
+After understanding Supabase's positioning, let's walk along the Supabase console path and break down each capability and responsibility.
+
+
+
+After signing in at Supabase and clicking **New project**:
+
+- set project name
+- set DB password
+- choose region near your target users
+
+
+
+After creation, the left sidebar shows key modules: Table Editor, SQL Editor, Database, Authentication, and so on.
+
+
+
+### Table Editor
+
+Table Editor is Supabase's visual data table editor. You can inspect and edit DB data without writing SQL, similar to spreadsheet interaction.
+
+
+
+The key concept here is **Schema**.
+
+Schemas are resource containers for tables, views, functions, indexes, etc. They help with:
+
+- avoiding naming conflicts
+- permission isolation
+
+In daily development, most people mainly use:
+
+- `public`: default business tables (posts/comments/orders/etc.)
+- `auth`: authentication tables (for example `auth.users`), usually do not edit built-in auth schema tables manually
+
+
+
+### SQL Editor
+
+SQL Editor is the SQL execution console. You can run model-generated SQL directly and inspect results quickly.
+
+
+
+After executing SQL, you can view new tables in Table Editor (`public` schema). Executed SQL is also saved in the left private history, and can be starred.
+
+### Database
+
+Database is the management center where you inspect tables and relationships (foreign key constraints) visually.
+
+
+
+You can also create tables manually in `Database -> Tables`.
+
+
+
+### Authentication
+
+Authentication manages sign-up/sign-in and permissions. It supports registration, login, password reset, email verification, and OAuth providers (Google/GitHub/others). User data is synced automatically into `auth.users`.
+
+
+
+Provider options are visible in the Provider panel. By default, email login is enabled. For GitHub/Google login, extra provider config is required.
+
+
+
+In `Sign In / Providers`, you can configure registration behavior (for example, whether email confirmation is required).
+
+
+
+You can also use third-party auth systems in `Third Party Auth` (for example Clerk).
+
+
+
+You can enable rate-limiting policies in `Rate Limits` to control abusive traffic.
+
+
+
+### Storage
+
+Storage is Supabase file storage and is S3-compatible in concept. It stores files (images/videos/docs/audio), supports public/private access control, and supports permanent/temporary link generation.
+
+
+
+We cover concrete usage in later project sections.
+
+
+
+If needed, you can operate via S3-compatible settings.
+
+
+
+> Amazon Web Services (AWS) is a cloud platform. S3 is AWS's object storage service, and its API has effectively become an industry standard for object storage.
+>
+> **Why S3-compatible APIs matter:** there is a large ecosystem of SDKs/tools/docs. Compatibility dramatically reduces integration cost.
+
+### Edge Functions
+
+If you do not want to self-host a full backend, but still need secure server-side logic, use Edge Functions. They are globally distributed server functions managed by Supabase.
+
+
+
+A core use case is secure API proxying. Never expose sensitive keys (OpenAI/Stripe/etc.) in frontend code. Instead:
+
+- frontend calls your Supabase function
+- function securely uses secrets stored in Supabase
+
+
+
+Function secrets are injected as environment variables (for example through `Deno.env.get`), so keys are never exposed to browsers.
+
+
+
+Minimal Edge Function request example:
+
+```javascript
+// Core config (replace with your own values)
+const projectId = "your Supabase project ID";
+const functionName = "target Edge Function name";
+const supabaseKey = "Supabase anon_key";
+
+async function callEdgeFunction() {
+ const url = `https://${projectId}.supabase.co/functions/v1/${functionName}`;
+
+ try {
+ const response = await fetch(url, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json",
+ "Authorization": `Bearer ${supabaseKey}`
+ },
+ body: JSON.stringify({ order_id: "123", action: "refund" })
+ });
+
+ const result = await response.json();
+ console.log("Success:", result);
+ } catch (error) {
+ console.error("Failed:", error.message);
+ }
+}
+
+callEdgeFunction();
+```
+
+Edge Functions integrate with Supabase auth sessions and RLS. They can identify current users and operate with your security model.
+
+Typical scenarios:
+
+- third-party webhooks
+- email notifications
+- PDF generation
+- custom API endpoints and business rules
+
+Example: Clerk only manages auth identity. If you need user data synchronized into business tables, you can listen to Clerk webhooks via Edge Functions and write into Supabase automatically.
+
+### Realtime
+
+Realtime allows clients to receive DB changes instantly through WebSocket instead of polling.
+
+It includes:
+
+1. **Postgres Changes**: subscribe to row-level `INSERT`/`UPDATE`/`DELETE`
+2. **Broadcast**: low-latency temporary channel messages
+3. **Presence**: online status tracking/synchronization
+
+We will use it in project-based sections later.
+
+### Project Settings
+
+Project Settings is for deeper resource and parameter configuration.
+
+
+
+At beginner stage, focus on:
+
+1. **Data API**: your Supabase URL (`https://xxx.supabase.co`)
+2. **API Keys**: anon key vs service_role key
+
+
+
+The `anon` key is for restricted client access under RLS. The `service_role` key is a high-privilege server-side key and must never be exposed publicly.
+
+
+
+## 2.2 Create Your First SQL Table
+
+After understanding the console, let's move to core DB operations.
+
+There are two common ways to create tables in Supabase:
+
+1. (recommended) generate SQL via LLM and run it in SQL Editor
+2. visual creation via `Database -> Tables -> New table`
+
+
+
+You can define table name and column types in `Columns`.
+
+
+
+Relational DBs rely on table relationships. Configure relations in `Foreign keys`.
+
+
+
+Example (student table referencing class table):
+
+```sql
+CREATE TABLE students (
+ student_id INT PRIMARY KEY,
+ student_name VARCHAR(50),
+ class_id INT,
+ FOREIGN KEY (class_id) REFERENCES classes(class_id)
+);
+```
+
+Visualized example:
+
+Classes table:
+
+| class_id | class_name |
+| -------- | ---------- |
+| 101 | Grade 1 Class 1 |
+| 102 | Grade 1 Class 2 |
+
+Students table:
+
+| student_id | student_name | class_id |
+| ---------- | ------------ | -------- |
+| 2024001 | Zhang San | 101 |
+| 2024002 | Li Si | 102 |
+| 2024003 | Wang Wu | 101 |
+
+In Supabase, after adding a foreign key, choose referenced table and column directly.
+
+
+
+## 2.3 SQL Editor Introduction and Basic Database Operations
+
+Now we run a series of SQL scripts and practice CRUD step by step.
+
+All sample SQL files are available here:
+
+https://github.com/THU-SIGS-AIID/Project5-Supabase-Demos/tree/main/apps/sql-examples
+
+### 2.3.1 `CREATE` - Create the Table Structure
+
+`CREATE TABLE` defines schema, columns, data types, and constraints.
+
+```sql
+-- Step 1: Create the 'orders' table
+-- This file is fully independent and creates a sample table for later steps.
+CREATE TABLE IF NOT EXISTS orders (
+ id serial PRIMARY KEY,
+ user_id int NOT NULL, -- User ID
+ status text NOT NULL, -- Order status (e.g. paid, pending)
+ amount numeric(10, 2) NOT NULL, -- Order total amount
+ details jsonb, -- Item and extra details as JSON
+ placed_at timestamptz DEFAULT now(), -- Order creation time
+ is_paid boolean DEFAULT false -- Paid flag
+);
+```
+
+After execution, check Table Editor:
+
+
+
+### 2.3.2 `INSERT` - Populate Initial Data
+
+After creating the table structure, the next step is to use `INSERT INTO` to add data rows into the table.
+
+```sql
+-- Step 2: Insert initial rows into the orders table
+-- Provides realistic, varied data for demo/testing. All values are self-contained.
+INSERT INTO orders (user_id, status, amount, details, placed_at, is_paid) VALUES
+ (2001, 'pending', 23.50, '{"items":[{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00}]}', now() - interval '2 days', false),
+ (2002, 'paid', 50.00, '{"items":[{"sku":"BGR002","name":"Chicken Burger","qty":2,"price":10.00},{"sku":"DRK001","name":"Lemonade","qty":2,"price":5.00}]}', now() - interval '1 day', true),
+ (2003, 'cancelled', 15.00, '{"items":[{"sku":"FRY001","name":"French Fries","qty":3,"price":5.00}], "reason":"Not available"}', now() - interval '45 days', false),
+ (2004, 'paid', 22.98, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":2,"price":9.99}], "promo":"SUMMER22"}', now() - interval '10 days', true),
+ (2005, 'pending', 18.75, '{"items":[{"sku":"SAL001","name":"Salad","qty":1,"price":6.75},{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00}]}', now() - interval '7 hours', false),
+ (2006, 'paid', 8.00, '{"items":[{"sku":"DRK002","name":"Cola","qty":2,"price":4.00}]}', now() - interval '3 hours', true),
+ (2007, 'refunded', 14.50, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":1,"price":9.99},{"sku":"FRY001","name":"French Fries","qty":1,"price":4.51}], "refund_reason":"Late delivery"}', now() - interval '15 days', false),
+ (2008, 'paid', 26.99, '{"items":[{"sku":"BGR002","name":"Chicken Burger","qty":2,"price":10.00},{"sku":"DRK001","name":"Lemonade","qty":1,"price":6.99}]}', now() - interval '12 days', true),
+ (2009, 'pending', 9.99, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":1,"price":9.99}]}', now() - interval '30 minutes', false),
+ (2010, 'paid', 19.89, '{"items":[{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00},{"sku":"DRK002","name":"Cola","qty":2,"price":3.95}]}', now() - interval '5 days', true),
+ (2011, 'cancelled', 0.00, '{"items":[], "reason":"User cancelled"}', now() - interval '2 days', false);
+
+-- Expected Output:
+-- After running this script, SELECT * FROM orders will show about 11 rows with varied user_id, status, amount, details (JSON), placed_at, and is_paid fields.
+-- For example:
+-- | id | user_id | status | amount | is_paid | placed_at |
+-- |----|---------|-----------|--------|---------|---------------------|
+-- | 1 | 2001 | pending | 23.50 | false | 2025-10-28 13:40:00Z|
+-- | 2 | 2002 | paid | 50.00 | true | ... |
+-- |... | ... | ... | ... | ... | ... |
+```
+
+After the script executes successfully, initial data is now inserted into the table. You can refresh Table Editor to see the result, or open a new SQL Editor tab and run `SELECT * FROM orders;` to view it directly:
+
+
+
+### 2.3.3 `SELECT` - Read and Query Data
+
+`SELECT` is used to query, filter, and format data:
+
+```sql
+-- Example 1: Select all fields for all orders
+SELECT * FROM orders;
+
+-- Example 2: Select only pending orders
+SELECT id, user_id, amount FROM orders WHERE status = 'pending';
+
+-- Example 3: Select paid orders
+SELECT id, status, is_paid, amount FROM orders WHERE is_paid = true;
+
+-- Example 4: Extract JSON item list
+SELECT id, details -> 'items' AS item_list FROM orders;
+```
+
+Example 2 result:
+
+
+
+Example 3 (paid orders):
+
+| id | status | is_paid | amount |
+| --- | ------ | ------- | ------ |
+| 2 | paid | true | 50.00 |
+| 4 | paid | true | 22.98 |
+| 6 | paid | true | 8.00 |
+| 8 | paid | true | 26.99 |
+| 10 | paid | true | 19.89 |
+
+Example 4 (JSON array extract):
+
+| id | item_list |
+| --- | -------------------------------------------------------------------------------------------------------------------- |
+| 1 | `[{"qty":1,"sku":"BGR001","name":"Beef Burger","price":12}]` |
+| 2 | `[{"qty":2,"sku":"BGR002","name":"Chicken Burger","price":10},{"qty":2,"sku":"DRK001","name":"Lemonade","price":5}]` |
+| 3 | `[{"qty":3,"sku":"FRY001","name":"French Fries","price":5}]` |
+| ... | ... |
+
+### 2.3.4 `INSERT` - Insert a Single Record
+
+In 2.3.2, we demonstrated batch initialization inserts at the beginning. Now let's see how to insert a single new row.
+
+```sql
+-- Step 4: INSERT a new order (single row)
+-- Example: Add a new paid order for user 2012 (the item details in the JSON are demo values)
+INSERT INTO orders (user_id, status, amount, details, is_paid)
+VALUES (
+ 2012, 'paid', 9.99,
+ '{"items":[{"sku":"BGR002","name":"AIID Burger","qty":100,"price":1000}]}',
+ true
+);
+-- Expected Output:
+-- Before (table fragment):
+-- | id | user_id | status | amount | is_paid |
+-- | ...| ... | ... | ... | ... |
+--
+-- After (last row):
+-- | id | user_id | status | amount | is_paid |
+-- | xx | 2012 | paid | 9.99 | true |
+-- (where xx = next serial value)
+```
+
+Now run `SELECT * FROM orders;` again. You will see the `orders` table increase successfully from 11 rows to 12 rows.
+
+### 2.3.5 `UPDATE` - Modify Existing Data
+
+In practical work, we frequently update table data. We can use `UPDATE` to modify existing records in a table.
+
+```sql
+-- Step 5: UPDATE example
+-- Example: Mark order with id=1 as paid and update its status
+UPDATE orders SET status = 'paid', is_paid = true WHERE id = 1;
+-- Expected Output:
+-- Before (row with id=1):
+-- | id | status | is_paid |
+-- | 1 | pending | false |
+-- After (row with id=1):
+-- | id | status | is_paid |
+-- | 1 | paid | true |
+-- All other rows remain unchanged.
+```
+
+### 2.3.6 `DELETE` - Delete Data
+
+`DELETE` can be used to remove records from a table, and with conditions, it can target only a specific subset of data.
+
+```sql
+-- Step 6: DELETE example
+-- Example: Delete orders older than 2 days to clean up old data
+DELETE FROM orders WHERE placed_at < now() - interval '2 days';
+-- Expected Output:
+-- Before (filtered for affected rows):
+-- | id | status | placed_at |
+-- | 3 | cancelled | 2025-10-13 ... | <-- will be deleted
+--
+-- After:
+-- No such rows remain. SELECT * FROM orders WHERE placed_at < now()-interval '2 days' yields zero rows.
+-- Other rows in orders table are unaffected.
+```
+
+Before executing, you can run `SELECT id, status, placed_at FROM orders WHERE placed_at < now() - interval '2 days';` to inspect the rows matching the condition. After running `DELETE`, execute the same query again: `SELECT id, status, placed_at FROM orders WHERE placed_at < now() - interval '2 days';`. It should return an empty result, which means those rows were deleted successfully.
+
+## 2.4 RLS (Row level security)
+
+After basic CRUD, we need one key security concept: **RLS (Row Level Security)**.
+
+RLS solves data isolation:
+
+- user A should see only user A's rows
+- user B should not access user A's private rows
+
+For example, in `orders`, define policy: users can read only rows whose `user_id` matches current authenticated user.
+
+Once RLS is enabled, every `SELECT`/`INSERT`/`UPDATE`/`DELETE` request must pass at least one matching policy, or the DB will reject it.
+
+Supabase provides `auth.uid()` to reference the current authenticated user id, making policy writing straightforward.
+
+You can configure policies in the Supabase RLS UI:
+
+
+
+
+
+
+
+In practice, policies are often created in initialization SQL:
+
+
+
+# 3. The First SQL Application
+
+Now we move to practical project exercises. We use a burger-shop scenario to practice Supabase end to end: DB initialization, app connection, auth, and RLS behavior.
+
+## 3.1 Clone and Run Supabase Demos
+
+Clone the demo repository:
+
+https://github.com/THU-SIGS-AIID/Project5-Supabase-Demos
+
+If you already configured SSH keys, prefer SSH clone:
+
+`git@github.com:THU-SIGS-AIID/Project5-Supabase-Demos.git`
+
+If network/SSH has issues, use **Download ZIP**.
+
+
+
+After cloning, ask Trae or Claude Code to run a target project directory directly.
+
+## 3.2 Project1 - burger-shop-menu-crud
+
+In `project-burger-shop-menu-crud-1`, we initialize Supabase with SQL scripts and connect frontend reads/writes to Supabase.
+
+### Create a Database Using Scripts
+
+First, we need to create the required tables in Supabase. In the Project1 directory, there is a folder named `scripts`, which contains one database script file `init.sql`. It can automatically create all related database resources (including table schemas and initial data). We will frequently use this file later to initialize tables in the database.
+
+```sql
+......
+
+-- ============================================================================
+-- 2. Create Menu Items Table
+-- ============================================================================
+
+create table if not exists public.menu_items (
+ id uuid primary key default gen_random_uuid(),
+ name text not null,
+ description text,
+ category text check (category in ('burger','side','drink')) default 'burger',
+ price_cents int not null check (price_cents > 0),
+ available boolean default true,
+ emoji text,
+ created_at timestamptz not null default now(),
+ updated_at timestamptz not null default now()
+);
+
+-- Comments for documentation
+comment on table public.menu_items is 'Burger shop menu items for CRUD demo';
+comment on column public.menu_items.id is 'Unique identifier for each menu item';
+comment on column public.menu_items.name is 'Display name of the menu item';
+comment on column public.menu_items.description is 'Detailed description of the menu item';
+comment on column public.menu_items.category is 'Category: burger, side, or drink';
+comment on column public.menu_items.price_cents is 'Price in cents (integer) to avoid floating point issues';
+comment on column public.menu_items.available is 'Whether the item is currently available for order';
+comment on column public.menu_items.emoji is 'Optional emoji representation of the menu item';
+comment on column public.menu_items.created_at is 'Timestamp when the item was created';
+comment on column public.menu_items.updated_at is 'Timestamp when the item was last updated';
+
+......
+```
+
+After running the initialization SQL script in SQL Editor, you can see the created tables in Table Editor. The specific execution logic of the database initialization code is:
+
+1. Create the `menu_items` table:
+   This table stores all items in the burger shop menu. It includes fields such as `name` (product name), `description`, `price_cents` (price in cents to avoid floating-point precision issues), `category`, and `available` (whether it is currently sellable). This covers the information required by a menu item.
+2. Create the `promo_codes` table:
+   This table manages promotions such as discount codes. It defines fields like `code`, `discount_type` (percentage or fixed amount), and `discount_value`.
+3. Disable Row Level Security (RLS):
+   For convenience during development and testing, RLS is explicitly disabled in the script. However, as we learned earlier, RLS is a key security capability in Supabase that precisely controls "who can access/modify which data" through policies (for example, only admins can edit promo codes while regular users can only view menus). Therefore, in production, you must enable RLS and configure proper policies to block unauthorized access at the data layer.
+4. Insert seed data:
+   To let the frontend display realistic menu and promo data right after startup (without manual test-data entry), the `init.sql` script also inserts seed data into `menu_items` and `promo_codes`. For example, you can see various burgers, sides, drinks, and multiple discount codes.
+
+### Set up the database connection
+
+Once the database is ready, we need to connect this frontend project with Supabase so it can read data normally. We need to place the Supabase project URL and anon key into the expected configuration. This project provides two flexible approaches:
+
+1. Configure via environment variables
+
+Create a `.env` file in the project root and fill in your Supabase credentials:
+
+```
+NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co
+NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key
+```
+
+2. Configure directly in the project page
+
+To make quick demos and switching among different Supabase projects easier, the homepage provides a Settings button in the upper-right corner. You can click it and directly input or paste the Supabase URL and anon key in the popup modal.
+
+After clicking "Save", this information is used to dynamically create a Supabase client instance, similar to the following code:
+
+Client creation example:
+
+```typescript
+import { createClient, type SupabaseClient } from '@supabase/supabase-js';
+
+/**
+ * Builds a Supabase client from the public environment variables.
+ *
+ * @returns A client when both `NEXT_PUBLIC_SUPABASE_URL` and
+ *   `NEXT_PUBLIC_SUPABASE_ANON_KEY` are non-empty; otherwise `null`.
+ */
+export function maybeCreateBrowserClient(): SupabaseClient | null {
+  // Keep the literal `process.env.NAME` member access (no destructuring).
+  const supabaseUrl = process.env.NEXT_PUBLIC_SUPABASE_URL;
+  const anonKey = process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY;
+
+  if (supabaseUrl && anonKey) {
+    return createClient(supabaseUrl, anonKey);
+  }
+  return null;
+}
+```
+
+After creating the database and filling the Supabase link configuration, you can see an interface like the following. You can try CRUD operations on products and observe corresponding table changes in Supabase.
+
+
+
+
+
+### 📚 Assignment
+
+1. Try adding and deleting items, then inspect changes in Table Editor.
+
+## 3.4 Project2 - burger-shop-auth-users
+
+Project1 focuses on menu CRUD and DB connection. Project2 adds user authentication and RLS permission control.
+
+The login page supports email/password registration and sign-in via Supabase Auth native methods:
+
+```javascript
+const { error: err } = await supabaseClient.auth.signUp({
+ email,
+ password,
+ options: {
+ data: {
+ full_name: fullName || null,
+ birthday: birthday || null,
+ avatar_url: avatarUrl || null
+ }
+ }
+});
+```
+
+
+
+After login, Supabase creates a session automatically. With RLS enabled, each user can only see their own account data.
+
+Initialize with `init.sql` first (if initialization fails, clean old tables or recreate the Supabase project).
+
+After sign-up and email verification, you can enter shop UI:
+
+
+
+To access admin UI, modify corresponding role field to `admin` in DB:
+
+
+
+By default, each new email sign-up requires email confirmation. You can disable forced confirmation in `Authentication -> Sign In / Providers -> Confirm email`.
+
+
+
+### 📚 Assignment
+
+1. Claim starter pack and complete purchase flow.
+2. Locate role-related table and set role to `admin`, then modify product quantities in admin page.
+3. Locate wallet balance table and modify values to increase remaining wallet amount.
+
+# 4. Build Your First Supabase App
+
+Now that you understand DB operations, auth, and RLS, build your own app with database + user login.
+
+## 4.1 A Standardized Process for Connecting Any App to a Supabase Database
+
+Use this standardized process:
+
+1. Clarify requirements and tell AI clearly.
+ 1. Describe app function and required DB behavior (for example: local React Todo needs cloud sync with Supabase).
+ 2. Add constraints if needed (timestamp format, money precision, per-user visibility).
+ 3. Review AI output and correct missing fields.
+2. Ask AI to generate `init.sql` based on confirmed schema; run in SQL Editor; if errors, feed error back and iterate.
+3. Ask AI to refactor code according to SQL schema and communication logic.
+4. Configure Supabase URL/key and test end-to-end.
+ 1. run app and test DB interactions
+ 2. inspect Table Editor sync behavior
+ 3. if failures occur, report exact symptoms to AI and iterate
+
+For auth pages, ask AI directly to integrate email sign-up/sign-in and define page routing expectations.
+
+You can also ask AI to migrate implementation patterns from an existing project path directly.
+
+## 4.2 Case Study : Build an Online Snake Game
+
+Following the SOP above, use `Project5-Supabase-Demos/apps_snakegame` as concrete practice: add leaderboard + user auth.
+
+
+
+### 4.2.1 Analyze the Project and Identify Data Requirements
+
+First, similar to the standardized process above, we can clarify requirements with AI and let AI provide a corresponding modification plan based on our project and requirements. We then implement based on that plan.
+
+**You can use the following prompt to guide AI:**
+
+> "I have a snake game. The directory is at {paste the absolute path of the snake game here}. Now I want to add an online leaderboard with Supabase, and also support a user login system. The leaderboard should display rankings by username and email.
+>
+> Please help me analyze what tables I need to create to implement this feature. What fields should each table include?"
+
+You will then get a response similar to:
+
+
+
+### 4.2.2 Generate the `init.sql` Script
+
+Then ask AI to generate `scripts/init.sql` for Supabase initialization:
+
+
+
+### 4.2.3 Refactor the Project Code
+
+Then ask AI to refactor game code for:
+
+- leaderboard as independent page
+- auth via email
+- registration/login required before game
+
+If conversation context gets too long, start a fresh chat and pass `init.sql` as context.
+
+If auth is unstable, reference:
+
+`Project5-Supabase-Demos/apps/project-burger-shop-auth-users-2`
+
+Successful result criteria:
+
+- users can register and sign in
+- signed-in users can view leaderboard correctly
+
+
+
+
+
+### 📚 Assignment
+
+1. Integrate user auth into snake game demo.
+2. Integrate user auth into your own application.
+
+# 5. Become a Supabase Master
+
+The above covered basic operations. Next are advanced concepts and features: why Supabase is selected in this curriculum, and how to implement more complex interactions.
+
+You do not need to master everything immediately. Learn on demand as projects require.
+
+## 5.1 Why We Choose Supabase
+
+Why choose Supabase among many backend options?
+
+Startups face a common tension:
+
+- want full backend control
+- must ship quickly
+
+Self-building backend from scratch often consumes months (DB/realtime/auth/API/storage/jobs/monitoring, etc.). Supabase packages these capabilities into ready-to-use services, letting teams focus scarce time on product features instead of infrastructure.
+
+Supabase alternatives exist (PocketBase, Appwrite, etc.), but Supabase is often stronger for full SQL ecosystem maturity and community scale.
+
+Compared with closed systems like Firebase, Supabase's open-source approach reduces vendor lock-in risk and supports self-hosting.
+
+Selection is context-dependent:
+
+- tiny personal experiments: ultra-light tools may be enough
+- enterprise compliance scenarios: specialized enterprise identity stack may fit better
+- MVP and early growth: Supabase is often sufficient and can scale with integrations (Stripe, Resend, Cloudflare, etc.)
+
+## 5.2 Google & GitHub Login Support
+
+Earlier we covered email sign-up/sign-in. In production UX, social login usually improves conversion and user convenience.
+
+This section explains full details for Google and GitHub OAuth and password reset.
+
+Reference project:
+`Project5-Supabase-Demos/apps/project-burger-shop-auth-advanced-supabase-6`
+
+
+
+### 5.2.1 OAuth Flow: How Does Third-Party Login Work?
+
+Third-party login uses OAuth 2.0. Its essence is delegated authorization: users grant limited profile access without exposing provider passwords to your app.
+
+Typical flow:
+
+1. user clicks Google sign-in button
+2. user is redirected to Google authorization page
+3. user consents; Google returns one-time authorization code via callback URL
+4. Supabase backend exchanges code for access token
+5. Supabase fetches profile, creates/links account, and establishes session
+
+
+
+### 5.2.2 Configure Google Cloud to Obtain a Client ID and Secret
+
+No matter which third-party login method you use, you normally need to configure a Client ID and Client Secret. For Google login, you first need to create an OAuth 2.0 Client ID in Google Cloud Platform to obtain these values.
+
+1. **Enter Google Cloud Console**:
+   1. Visit [Google Cloud Console](https://console.cloud.google.com/).
+   2. Create a new project or select an existing one.
+2. **Configure OAuth consent screen**:
+   1. In the left navigation, go to `APIs & Services` -> `OAuth consent screen`.
+   2. Select the `External` user type, then click `Create`.
+   3. Fill required information such as app name and user support email.
+   4. In `Authorized domains`, add your Supabase project domain in the format `*.supabase.co`.
+   5. Save and continue. In the `Scopes` and `Test users` steps, you can skip for now and save directly.
+3. **Create credentials**:
+   1. Go to `APIs & Services` -> `Credentials`.
+   2. Click `+ CREATE CREDENTIALS`, then select `OAuth client ID`.
+   3. Select `Web application` for `Application type`.
+   4. Give it a name, for example `Supabase Auth`.
+   5. In `Authorized redirect URIs`, click `ADD URI` and fill your Supabase callback URL. You can find this URL in Supabase Dashboard at `Authentication` -> `Providers` -> `Google`. The format is usually `https://<your-project-ref>.supabase.co/auth/v1/callback`.
+ 
+   6. Click `CREATE`.
+4. **Get Client ID and Client Secret**:
+   1. After creation succeeds, a popup shows your **Client ID** and **Client Secret**. Be sure to copy and store them immediately.
+
+### 5.2.3 Configure GitHub to Obtain a Client ID and Secret
+
+Similarly, you need to register an OAuth application on GitHub.
+
+1. **Enter GitHub Developer Settings**:
+   1. Sign in to your GitHub account.
+   2. Click your avatar in the upper-right corner and enter `Settings`.
+   3. At the bottom of the left navigation, find `Developer settings`.
+
+2. **Register a new application**:
+   1. Select `OAuth Apps`, then click `New OAuth App`.
+   2. Fill in an app name, for example `My Burger Shop`.
+   3. **Homepage URL**: fill your online app URL, or local development URL `http://localhost:3000`.
+   4. **Authorization callback URL**: fill in your Supabase project callback URL. You can find it in Supabase Dashboard at `Authentication` -> `Providers` -> `GitHub`. The format is `https://<your-project-ref>.supabase.co/auth/v1/callback`.
+   5. Click `Register application`.
+3. **Get Client ID and Client Secret**:
+   1. After registration, the page displays your **Client ID**.
+ 
+   2. Click `Generate a new client secret` to generate your **Client Secret**. Again, copy and store it immediately.
+
+### 5.2.4 Configure Providers in Supabase
+
+Now configure the credentials you obtained in Supabase.
+
+1. **Enter Supabase Dashboard**:
+   1. Select your project, then go to `Authentication` -> `Providers`.
+2. **Enable and configure Google**:
+   1. Find `Google` and enable it.
+   2. Paste the **Client ID** and **Client Secret** from Google Cloud into the corresponding fields.
+   3. Click `Save`.
+3. **Enable and configure GitHub**:
+   1. Find `GitHub` and enable it.
+   2. Paste the **Client ID** and **Client Secret** from GitHub into the corresponding fields.
+   3. Click `Save`.
+
+
+
+At this point, your website can already support third-party account login. You can directly ask AI to use `Project5-Supabase-Demos/apps/project-burger-shop-auth-advanced-supabase-6` as reference and add user login support to your own project, integrating both GitHub and Google authentication with minimal cost.
+
+### 5.2.6 Implementing Password Reset
+
+Password reset is a core production auth feature.
+
+Reference project includes full implementation:
+`project-burger-shop-auth-advanced-supabase-6`
+
+Core flow:
+
+1. user enters email; frontend calls `supabase.auth.resetPasswordForEmail()` with redirect URL
+2. Supabase sends reset email
+3. user clicks email link and is redirected to reset page
+4. user submits new password through `supabase.auth.updateUser()`
+
+You can customize reset templates in:
+`Authentication -> Email Templates`
+
+
+
+## 5.3 Realtime Function
+
+Supabase Realtime is one of its strongest capabilities. It is useful for collaborative docs, live dashboards, game lobbies, and customer-support systems.
+
+Project:
+`Project5-Supabase-Demos/apps/project-burger-shop-realtime-orders-3`
+
+
+
+### 5.3.1 Realtime Database Changes: Postgres Changes
+
+Postgres Changes subscribes to row changes in specific tables/events.
+
+Enable realtime replication with SQL:
+
+```sql
+-- Set the replica identity of chat_messages to FULL.
+ALTER TABLE public.chat_messages REPLICA IDENTITY FULL;
+
+-- Add the table to the supabase_realtime publication unless it is already a member,
+-- so the script can be re-run safely.
+DO $$
+BEGIN
+  IF EXISTS (
+    SELECT 1
+    FROM pg_publication_tables
+    WHERE pubname = 'supabase_realtime'
+      AND schemaname = 'public'
+      AND tablename = 'chat_messages'
+  ) THEN
+    RETURN;  -- already published, nothing to do
+  END IF;
+
+  ALTER PUBLICATION supabase_realtime ADD TABLE public.chat_messages;
+END $$;
+```
+
+Client subscription example:
+
+```typescript
+// Called for every row inserted into public.chat_messages.
+const onMessageInserted = (payload: any) => {
+  console.log('New message received:', payload.new);
+  const newMessage = payload.new as Message;
+};
+
+// Called whenever the channel's subscription status changes.
+const onStatusChange = (status: string) => {
+  console.log('Chat subscription status:', status);
+};
+
+// Filter kept inline so the literal 'INSERT' is not widened to `string`.
+const sub = supabase
+  .channel('chat_messages_channel')
+  .on(
+    'postgres_changes',
+    { event: 'INSERT', schema: 'public', table: 'chat_messages' },
+    onMessageInserted
+  )
+  .subscribe(onStatusChange);
+```
+
+Key points:
+
+- `.channel(...)`: isolate communication scope
+- `.on('postgres_changes', ...)`: subscribe event source and filter
+- `payload.new`: newly inserted row content
+- `.subscribe()`: activate channel
+
+### 5.3.2 Message Broadcasting and State Sync: Broadcast & Presence
+
+For low-latency temporary states (for example cursor tracking), use Broadcast + Presence rather than DB writes.
+
+- Presence: shared online-state synchronization
+- Broadcast: temporary low-latency message passing
+
+Presence implementation steps:
+
+1. Create presence-enabled channel
+
+```typescript
+// Presence entries on this channel are keyed by the anonymous user's id.
+const presenceOptions = {
+  config: { presence: { key: anonymousUser.id } },
+};
+const ch = supabase.channel('lobby_presence', presenceOptions);
+```
+
+2. Subscribe and track current user
+
+```typescript
+// The presence payload tracked for the current user.
+const me = {
+  id: anonymousUser.id,
+  name: anonymousUser.name,
+  color: anonymousUser.color
+};
+
+// Start tracking only once the channel reports SUBSCRIBED.
+ch.subscribe(async (status) => {
+  if (status !== 'SUBSCRIBED') return;
+  await ch.track(me);
+});
+```
+
+3. Sync full online list
+
+```typescript
+// On every presence sync, rebuild the online map from scratch.
+ch.on('presence', { event: 'sync' }, () => {
+  const onlineById = {};
+  // presenceState() maps each key to an array of presences; flatten by user id.
+  for (const presences of Object.values(ch.presenceState())) {
+    for (const user of presences) {
+      onlineById[user.id] = { ...user };
+    }
+  }
+  setOnline(onlineById);
+});
+```
+
+4. Listen join/leave events
+
+```typescript
+// Log which presence key joined and the presences that came with it.
+const logJoin = ({ key, newPresences }) => {
+  console.log('User joined:', key, newPresences);
+};
+
+// Log which presence key left and the presences that were removed.
+const logLeave = ({ key, leftPresences }) => {
+  console.log('User left:', key, leftPresences);
+};
+
+ch.on('presence', { event: 'join' }, logJoin);
+ch.on('presence', { event: 'leave' }, logLeave);
+```
+
+Broadcast cursor example:
+
+Sender:
+
+```typescript
+// Broadcast the local cursor position together with the user's identity on every mouse move.
+const handleMouseMove = (e) => {
+  const { clientX, clientY } = e;
+
+  // No-op when the channel ref has not been populated yet.
+  channelRef.current?.send({
+    type: 'broadcast',
+    event: 'cursor',
+    payload: {
+      id: anonymousUser.id,
+      x: clientX,
+      y: clientY,
+      name: anonymousUser.name,
+      color: anonymousUser.color
+    }
+  });
+};
+
+document.addEventListener('mousemove', handleMouseMove);
+```
+
+Receiver:
+
+```typescript
+// Merge each incoming cursor position into that user's entry in the online map.
+ch.on('broadcast', { event: 'cursor' }, ({ payload }) => {
+  setOnline((prev) => {
+    // Keep whatever is already known about this user and overwrite only x / y.
+    const existing = prev[payload.id] || {};
+    const moved = { ...existing, x: payload.x, y: payload.y };
+    return { ...prev, [payload.id]: moved };
+  });
+});
+```
+
+Presence keeps "who is online"; Broadcast carries temporary shared states.
+
+## 5.4 Storage
+
+A real app handles not only structured data (orders/users), but also unstructured files (avatars, product images, documents).
+
+If such files are all stored in business servers directly, storage pressure and IO bottlenecks can become severe.
+
+In practice, files are stored in object storage systems (S3/OSS/etc.), and apps access files through URL addresses.
+
+Project:
+`project-burger-shop-storage-uploads-4`
+
+This project demonstrates avatar upload flow and uses `Uppy` + `Tus` resumable upload against Supabase upload endpoint.
+
+
+
+
+
+### 5.4.1. Bucket
+
+Storage is organized by buckets (like folders), each with independent policies and settings.
+
+Like DB RLS, Storage permissions are controlled with SQL policies on `storage.objects` and `storage.buckets`.
+
+Example: only allow authenticated users to upload image files under user-specific folder in `avatars` bucket:
+
+```sql
+-- INSERT policy on storage.objects for the `authenticated` role.
+-- An upload is accepted only when all three conditions in WITH CHECK hold.
+CREATE POLICY "Allow authenticated uploads to avatars bucket"
+ON storage.objects FOR INSERT
+TO authenticated
+WITH CHECK (
+ -- 1. The target bucket is `avatars`.
+ bucket_id = 'avatars' AND
+ -- 2. The first folder segment of the object name, cast to uuid, equals the caller's auth.uid().
+ auth.uid() = (storage.foldername(name))[1]::uuid AND
+ -- 3. The object's extension is one of the listed lowercase literals.
+ (storage.extension(name) IN ('png', 'jpg', 'jpeg'))
+);
+
+-- SELECT policy with no TO clause: any object whose bucket_id is `avatars` is readable.
+CREATE POLICY "Allow public read access to avatars"
+ON storage.objects FOR SELECT
+USING ( bucket_id = 'avatars' );
+```
+
+### 5.4.2 Getting an Accessible File URL
+
+In this project, create a public bucket named `avatars`. After upload, you get a storage path (for example `public/avatar1.png`) and need to convert it to HTTP-accessible URL.
+
+Two URL strategies:
+
+#### 1. Public URL - Permanent Link
+
+For files in public bucket:
+
+```typescript
+// Resolve the storage path inside the `avatars` bucket to its public URL.
+const avatarsBucket = supabase.storage.from('avatars');
+const { data } = avatarsBucket.getPublicUrl('public/avatar1.png');
+const { publicUrl } = data;
+```
+
+Pros:
+
+- simple fixed URL structure
+- cache-friendly (CDN/browser)
+
+Best for truly public resources (logo/public posters).
+
+Risk:
+
+- hotlink traffic abuse can increase bandwidth costs
+
+#### 2. Signed URL - Temporary Authorized Link
+
+Recommended for most production private/controlled assets:
+
+```typescript
+// Request a signed URL for the object; 3600 is the expiry argument passed to createSignedUrl.
+const signedUrlRequest = supabase.storage
+  .from('avatars')
+  .createSignedUrl('private/user-invoice.pdf', 3600);
+
+const { data, error } = await signedUrlRequest;
+// `data` may be absent on failure, hence the optional chaining.
+const signedUrl = data?.signedUrl;
+```
+
+Benefits:
+
+- expiring authorization
+- safer permission boundaries
+- much better anti-hotlink behavior
+
+For private assets (avatars, paid content, invoices), prefer signed URLs by default.
+
+## 5.5 Edge Function
+
+Edge Function is a core serverless pattern. "Serverless" does not mean no servers; it means you do not manage server provisioning/ops yourself. You write function logic, provider runs it on trigger and charges by usage.
+
+Common edge-function providers:
+
+- AWS Lambda@Edge
+- Cloudflare Workers
+- Vercel Edge Functions
+
+In Supabase, Edge Functions run on Deno + TypeScript and are deployed globally for low-latency execution close to users.
+
+Project:
+`Project5-Supabase-Demos/apps/project-burger-shop-edge-function-5`
+
+
+
+### 5.5.1 LLM Chat Example Walkthrough
+
+If you want ChatGPT-like features, never expose model API keys in frontend code. Use edge function as secure proxy.
+
+```typescript
+// scripts/llm-chat.ts
+import "jsr:@supabase/functions-js/edge-runtime.d.ts";
+import { OpenAI } from "npm:openai";
+
+// Read once at startup from the function's secrets; never sent to the client.
+const OPENAI_API_KEY = Deno.env.get("OPENAI_API_KEY");
+
+/** Builds a small JSON error response with the given HTTP status. */
+function jsonError(message: string, status: number): Response {
+  return new Response(JSON.stringify({ error: message }), {
+    status,
+    headers: { "Content-Type": "application/json" },
+  });
+}
+
+/**
+ * Secure proxy for chat completions.
+ *
+ * Expects a JSON body of the form `{ "prompt": string }` and streams the
+ * completion back as `text/event-stream`.
+ *
+ * Error responses (JSON `{ error }`):
+ * - 500 when `OPENAI_API_KEY` is not configured
+ * - 400 when the body is not JSON or `prompt` is missing / not a non-empty string
+ * - 502 when the upstream completion request fails
+ */
+Deno.serve(async (req) => {
+  if (!OPENAI_API_KEY) {
+    console.error("llm-chat: OPENAI_API_KEY is not set");
+    return jsonError("OPENAI_API_KEY is not configured", 500);
+  }
+
+  // Parse and validate the request before spending an upstream call on it.
+  let body: unknown;
+  try {
+    body = await req.json();
+  } catch {
+    return jsonError("Request body must be valid JSON", 400);
+  }
+  const prompt = (body as { prompt?: unknown } | null)?.prompt;
+  if (typeof prompt !== "string" || prompt.trim() === "") {
+    return jsonError("`prompt` must be a non-empty string", 400);
+  }
+
+  try {
+    const openai = new OpenAI({ apiKey: OPENAI_API_KEY });
+
+    const stream = await openai.chat.completions.create({
+      model: "gpt-3.5-turbo",
+      messages: [{ role: "user", content: prompt }],
+      stream: true,
+    });
+
+    return new Response(stream.toReadableStream(), {
+      headers: { "Content-Type": "text/event-stream" },
+    });
+  } catch (err) {
+    // Always answer with a Response: log the detail server-side and return
+    // a generic message so upstream error details are not leaked.
+    const detail = err instanceof Error ? err.message : String(err);
+    console.error("llm-chat: completion request failed:", detail);
+    return jsonError("Failed to generate a completion", 502);
+  }
+});
+```
+
+Key idea: API key remains server-side in Supabase secrets.
+
+### 5.5.2 Create and Deploy the Function
+
+Supabase provides a very user-friendly interface, so you can complete deployment without touching the command line.
+
+1. **Open the Edge Functions panel**:
+   1. Sign in to your Supabase project Dashboard.
+   2. In the left navigation, click the code-like icon and enter `Edge Functions`.
+2. **Create a new function**:
+   1. Click `Create a new function`.
+ 
+   2. Name the function, for example `llm-chat`.
+3. **Paste code**:
+ 
+   1. In the online editor popup, **delete all default placeholder code**.
+   2. Open your local `llm-chat.ts` file and **copy all content**.
+   3. **Paste** the copied code into the Supabase online editor.
+4. **Configure environment variables (Secrets)**:
+   1. Find `Secrets` in the sidebar.
+ 
+   2. `Name`: enter `OPENAI_API_KEY`.
+   3. `Value`: paste your own OpenAI API Key.
+   4. Click `Save`. The secret set here is encrypted and securely injected into the runtime environment of your function.
+
+If a function needs to be updated, remember to run `Deploy updates` in the Edge Function section. Supabase will build and deploy this function in the cloud. After a few minutes, your function can be accessed online.
+
+Beyond being a secure proxy for language-model calls, Edge Functions are useful in far more scenarios. In fact, any task requiring server-side logic, from simple API calls and data validation to more complex computation, can be implemented with Edge Functions. It gives you a lightweight and scalable backend without managing server infrastructure.
+
+If you want to explore more possibilities, refer to other examples in this project. For example:
+
+- Image generation (`txt2img.ts`): this function shows how to call third-party text-to-image APIs (such as Stability AI or Midjourney) through Edge Functions to generate images dynamically. This is a typical compute-intensive or external-service-secure-call scenario. Just like `llm-chat`, the API key is stored securely in Supabase backend. The frontend only sends text prompts and displays generated images, making the flow secure and efficient.
+- Send email (`send-email.ts`): sending welcome emails, transaction notifications, or password-reset emails is a common requirement. The `send-email.ts` example demonstrates integrating email services (such as Resend or SendGrid) through Edge Functions. You do not need to expose sensitive email-service API keys in client code. Just create a function and let the frontend trigger email sending through this function.
+
+## 5.6 Clerk Login
+
+Clerk is a specialized identity/auth platform. It covers registration, login, MFA, session, permission management, and more.
+
+This part explains full integration with Supabase.
+
+Project:
+`project-burger-shop-auth-advanced-clerk-7`
+
+
+
+### 5.6.1 Create a Clerk Application and Get API Keys
+
+Before using this project, you need a Clerk account and an application.
+
+1. Register and create:
+ 1. Visit [dashboard.clerk.com](https://dashboard.clerk.com/) and register an account.
+ 2. Click `Create application`.
+ 
+ 3. Enter your application name (for example, `Burger Shop`).
+ 4. In `How will your users sign in?`, keep `Email`, `Google`, and `GitHub` selected by default.
+ 5. Click `Create application`.
+2. Get API keys:
+ 1. After creation, you will be guided to the API Keys page.
+ 
+ 2. Find the Publishable key (starts with `pk_`) and Secret key (starts with `sk_`).
+ 
+ 3. Copy them into your `.env.local` file (refer to this project's `.env.example`):
+
+ ```bash
+ NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_test_...
+ CLERK_SECRET_KEY=sk_test_...
+ ```
+
+### 5.6.2 Configure the Native Supabase–Clerk Integration
+
+Supabase and Clerk provide native integration:
+
+1. In Clerk dashboard:
+ 1. go to Integrations
+ 2. activate Supabase integration
+ 3. copy the Clerk Domain (`https://<your-app>.clerk.accounts.dev` or your custom domain)
+2. In Supabase dashboard:
+ 1. go to Authentication -> Providers
+ 2. add Clerk provider
+ 3. paste Clerk Domain
+ 4. save
+
+### 5.6.3 Sync User Data to Supabase via Webhooks
+
+The native integration only handles authentication and authorization. It does not sync already-registered Clerk users into Supabase. To make user data easier to manage, we also keep a copy of it in the Supabase `public.users` table for relational queries and data analysis. We can implement this with Clerk Webhooks. The full flow is:
+
+1. **Clerk sends notifications**: when a user registers or updates profile in Clerk, Clerk sends a POST request to the configured Webhook URL.
+2. **Supabase receives and writes**: an Edge Function receives the request, verifies the signature (for security), and then updates user data into Supabase tables.
+
+Before we start, we need to configure the table used for synchronization:
+
+```sql
+-- File: init.sql
+
+-- 1. `users` table: a mirror of Clerk users, filled by the Clerk webhook handler.
+CREATE TABLE public.users (
+  id         TEXT        NOT NULL PRIMARY KEY,  -- Clerk user ID
+  email      TEXT,
+  first_name TEXT,
+  last_name  TEXT,
+  image_url  TEXT,
+  created_at TIMESTAMPTZ DEFAULT now(),
+  updated_at TIMESTAMPTZ DEFAULT now()
+);
+
+-- 2. Turn on Row Level Security.
+-- With RLS enabled and no matching policy, a role gets no rows by default.
+ALTER TABLE public.users ENABLE ROW LEVEL SECURITY;
+
+-- 3. RLS policies.
+-- Both policies compare the row id with the `sub` claim of the request JWT
+-- (`auth.jwt()->>'sub'`), i.e. the user ID from the JWT provided by Clerk.
+
+-- Policy 1: a signed-in user may read only their own row.
+CREATE POLICY "Authenticated users can view their own user record"
+  ON public.users
+  FOR SELECT
+  TO authenticated
+  USING ( (SELECT auth.jwt()->>'sub') = id );
+
+-- Policy 2: a signed-in user may update only their own row.
+-- WITH CHECK is spelled out; PostgreSQL applies the USING expression to the
+-- new row anyway when WITH CHECK is omitted, so behaviour is unchanged.
+CREATE POLICY "Authenticated users can update their own user record"
+  ON public.users
+  FOR UPDATE
+  TO authenticated
+  USING ( (SELECT auth.jwt()->>'sub') = id )
+  WITH CHECK ( (SELECT auth.jwt()->>'sub') = id );
+```
+
+Then enable the corresponding Edge Function in Supabase:
+
+```typescript
+// File path: supabase/functions/clerk-webhooks/index.ts
+
+import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
+import { Webhook } from 'npm:svix'
+import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
+
+// Get Clerk Webhook signing secret from environment variables
+const CLERK_WEBHOOK_SECRET = Deno.env.get('CLERK_WEBHOOK_SECRET')
+
+// Fail fast at startup: without the signing secret no request can be verified.
+if (!CLERK_WEBHOOK_SECRET) {
+  throw new Error('CLERK_WEBHOOK_SECRET is not set in environment variables')
+}
+
+// Admin client created with the service-role key; used for every write to `users`.
+const supabaseAdmin = createClient(
+  Deno.env.get('SUPABASE_URL')!,
+  Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!
+)
+
+// Minimal shape of the Clerk user events handled below.
+type ClerkUserEvent = {
+  type: string
+  data: {
+    id?: string
+    first_name?: string | null
+    last_name?: string | null
+    image_url?: string | null
+    email_addresses?: Array<{ email_address?: string }>
+  }
+}
+
+/** Extracts a printable message from anything that was thrown. */
+function errorMessage(err: unknown): string {
+  if (err instanceof Error) return err.message
+  // Supabase errors may be plain objects that still carry a `message`.
+  if (typeof err === 'object' && err !== null && 'message' in err) {
+    return String((err as { message: unknown }).message)
+  }
+  return String(err)
+}
+
+/**
+ * Clerk webhook endpoint: verifies the Svix signature, then mirrors
+ * `user.created` / `user.updated` / `user.deleted` into `public.users`.
+ *
+ * Responses:
+ * - 200 when the event was processed (or is a type this function ignores)
+ * - 400 when Svix headers are missing, the signature is invalid, or a
+ *   delete event carries no user id
+ * - 500 when the database operation fails
+ */
+serve(async (req) => {
+  // 1. Get Svix signature info from request headers
+  const svix_id = req.headers.get('svix-id')
+  const svix_timestamp = req.headers.get('svix-timestamp')
+  const svix_signature = req.headers.get('svix-signature')
+
+  if (!svix_id || !svix_timestamp || !svix_signature) {
+    return new Response('Missing Svix headers', { status: 400 })
+  }
+
+  // The signature covers the exact bytes that were sent, so verify the raw
+  // body. Re-serialising parsed JSON can change whitespace/escaping and
+  // make a valid signature fail.
+  const body = await req.text()
+
+  // 2. Verify Webhook signature validity using the secret
+  let evt: ClerkUserEvent
+  try {
+    const wh = new Webhook(CLERK_WEBHOOK_SECRET)
+    evt = wh.verify(body, {
+      'svix-id': svix_id,
+      'svix-timestamp': svix_timestamp,
+      'svix-signature': svix_signature,
+    }) as ClerkUserEvent
+  } catch (err) {
+    console.error('Webhook signature verification failed:', errorMessage(err))
+    return new Response('Invalid webhook signature', { status: 400 })
+  }
+
+  try {
+    const { id } = evt.data
+    const eventType = evt.type
+    console.log(`Received webhook event: ${eventType} for user: ${id}`)
+
+    // 3. Execute database operations based on event type
+    switch (eventType) {
+      case 'user.created': {
+        const { first_name, last_name, image_url, email_addresses } = evt.data
+        // Upsert so a redelivered event does not fail on the primary key.
+        const { error } = await supabaseAdmin.from('users').upsert(
+          {
+            id,
+            first_name,
+            last_name,
+            image_url,
+            email: email_addresses?.[0]?.email_address,
+          },
+          { onConflict: 'id' }
+        )
+        if (error) throw error
+        console.log(`User ${id} created in Supabase.`)
+        break
+      }
+      case 'user.updated': {
+        const { first_name, last_name, image_url, email_addresses } = evt.data
+        const { error } = await supabaseAdmin
+          .from('users')
+          .update({
+            first_name,
+            last_name,
+            image_url,
+            email: email_addresses?.[0]?.email_address,
+            updated_at: new Date().toISOString(), // Update timestamp
+          })
+          .eq('id', id)
+        if (error) throw error
+        console.log(`User ${id} updated in Supabase.`)
+        break
+      }
+      case 'user.deleted': {
+        const deletedId = id
+        if (!deletedId) {
+          return new Response('Deleted user ID not found', { status: 400 })
+        }
+        const { error } = await supabaseAdmin.from('users').delete().eq('id', deletedId)
+        if (error) throw error
+        console.log(`User ${deletedId} deleted from Supabase.`)
+        break
+      }
+      default:
+        // Event types this function does not mirror are acknowledged as-is.
+        console.log(`Ignoring unhandled webhook event: ${eventType}`)
+    }
+
+    return new Response('Webhook processed successfully', { status: 200 })
+  } catch (err) {
+    // The request was authentic but processing failed: report a server-side
+    // error rather than blaming the sender with a 4xx.
+    const message = errorMessage(err)
+    console.error('Error processing webhook:', message)
+    return new Response(`Webhook Error: ${message}`, { status: 500 })
+  }
+})
+```
+
+After initializing the Supabase table and function, you still need to enable Webhooks in Clerk:
+
+- In Clerk Dashboard -> **Webhooks**, add an Endpoint and fill in the Supabase Edge Function URL.
+- Check events such as `user.created`, `user.updated`, and `user.deleted`.
+
+
+
+Once the setup succeeds, you can see different request attempts in `Message Attempts`. Click each one to inspect detailed response payloads. If a webhook call to Edge Function fails, you can quickly identify the cause from the returned details. It is recommended to compare request logs from both Clerk and Supabase to verify each function setting is correct.
+
+### 5.6.4 Third-Party Login Support in Clerk
+
+Before config, distinguish:
+
+- development environment (local/internal testing)
+- production environment (public real users)
+
+Clerk separates these for security and policy reasons.
+
+1. **Development quick verification**
+
+- In Clerk dashboard -> SSO connections -> Add connection -> For all users
+- choose GitHub/Google and add
+- Clerk shared credentials handle local testing quickly
+
+2. **Production custom credentials**
+
+When switching to production instance, shared credentials are not enough. Configure custom OAuth credentials:
+
+- copy callback/redirect URL from Clerk
+- configure OAuth app on provider side
+- paste client ID/secret back into Clerk
+
+2.1 GitHub production steps:
+
+- GitHub Developer Settings -> OAuth Apps -> New OAuth app
+- set application name/homepage/callback URL
+- generate client secret
+- paste into Clerk SSO connection
+
+2.2 Google production steps:
+
+- Google Cloud Console -> APIs & Services -> Credentials
+- create OAuth client (Web application)
+- set authorized origins and redirect URI
+- copy client ID/secret to Clerk
+
+Notes:
+
+1. avoid WebView login for Google OAuth
+2. testing mode has user limits; switch publishing status to production after review
+3. configure sub-address handling policy if needed
+4. optionally integrate Clerk Google One Tap component
+
+3. test social login
+
+- use Clerk Account Portal sign-in page
+- test GitHub/Google sign-in redirect and callback behavior
+
+# 6. From Supabase to More Backend Components (Advanced)
+
+So far we viewed backend capabilities through Supabase. From a broader engineering perspective, each Supabase module has specialized alternatives in the market.
+
+Why understand alternatives:
+
+- decide when all-in Supabase is enough
+- replace only one module when scaling/compliance/cost changes
+- broaden system design trade-off understanding
+
+This section compares common alternatives by features, pricing, ease of use, and community traction.
+
+## Comparable BaaS Platforms
+
+| Platform/Service | Type | Free Tier/Pricing | Features / Use Cases |
+| ------------------------ | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| Firebase (Google) | Fully managed BaaS (Auth + Firestore + Storage + Functions + Hosting) | Spark free tier; Blaze pay-as-you-go | Most mature ecosystem, great docs, fast onboarding, strong realtime; but complex billing and stronger lock-in |
+| Supabase | Open-source BaaS (Postgres + Auth + Storage + Edge Functions + Realtime) | Free: 500MB DB, 1GB storage, limited function calls; Pro by plan | SQL-first Firebase-like experience; modern DX, can self-host |
+| Appwrite Cloud | Open-source all-in-one BaaS | Free basic tier, paid by resources | modern UX, unified APIs, self-host option; ecosystem smaller than Firebase/Supabase |
+| Nhost | Postgres + GraphQL + Auth + Storage + Functions | Free: 1GB DB, 1GB storage, limited function calls | Similar to "Supabase + Hasura"; GraphQL-native |
+| AWS Amplify | AWS full-stack backend suite | Free quotas for hosting/cognito/functions | strong enterprise reliability; steeper learning curve |
+| Xata | Multi-model DB + Auth + Edge Functions | Free: 250k records, 15GB bandwidth | strong DX and UI, but less all-in-one than Firebase/Supabase |
+| Convex | Managed DB + Auth + Functions (frontend-first) | Free developer tier; paid by usage | very fast MVP development; higher platform binding risk |
+
+## Authentication (Auth)
+
+| Tool/Platform | Features | Free Tier/Pricing | Fit and Trade-offs |
+| ----------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Firebase Authentication | email/password, phone, social, anonymous, etc. | Spark up to 50k MAU | easy integration, rich docs, but Firebase lock-in |
+| Auth0 (Okta) | enterprise SSO/MFA/rules/extensibility | free 25k MAU then paid | enterprise-grade but can become expensive |
+| AWS Cognito | AWS-native identity service | free 10k MAU/month then pay-as-you-go | strong AWS integration, higher complexity |
+| Logto | open-source auth platform | self-host free, cloud free 50k MAU | strong emerging alternative, smaller ecosystem |
+| Keycloak | open-source IAM/SSO | free self-host | powerful and extensible, higher ops complexity |
+
+## File Storage
+
+| Platform/Service | Type | Free Tier/Pricing | Features/Use Cases |
+| ---------------------------------------- | -------------------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Amazon S3 | cloud object storage | AWS free tier: 5GB + request quotas | industry standard object storage, high reliability |
+| Google Cloud Storage / Firebase Storage | cloud object storage | Spark free + Blaze paid | strong Firebase integration, fine-grained rules |
+| Tencent COS / Aliyun OSS | China-based cloud object storage | pay-as-you-go + newcomer quotas | strong integration with the Chinese cloud ecosystem |
+| MinIO | open-source S3-compatible storage | free self-host | lightweight S3-compatible storage for private deployment |
+| Cloudinary / Imgix | media storage + CDN | basic free plans | strong media transformation capabilities |
+
+## Edge Functions
+
+| Platform/Service | Features | Free Tier/Pricing | Fit and Trade-offs |
+| -------------------------------------- | ------------------------------------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Cloudflare Workers | globally distributed JS/Wasm runtime | free 100k req/day | ultra-low latency edge execution |
+| Vercel Edge Functions | deep Next.js integration | hobby free quotas | excellent frontend integration |
+| Netlify Edge / Functions | Node functions + edge routes | free credit-based quotas | easy git-integrated deployment |
+| AWS Lambda@Edge / CloudFront Functions | AWS edge compute | lambda free quotas + cloudfront pricing | powerful but more complex setup |
+
+## Realtime Communication
+
+| Platform/Service | Features | Free Tier/Pricing | Fit and Trade-offs |
+| -------------------------------------- | ------------------------------------------------ | ----------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| Firebase Realtime DB / Firestore | realtime DB push updates | spark free + blaze paid | easy realtime listening, weaker complex querying |
+| Ably | pub/sub realtime messaging platform | free 6M messages/month | robust global realtime service |
+| Pusher Channels | event-push channels | sandbox free tier | quick chat/notification integrations |
+| Self-host WebSocket/Socket.IO | custom realtime infra | self-host infra cost | highest flexibility, highest ops burden |
+
+## Databases
+
+| Platform/Tool | DB Type | Free Tier/Pricing | Key Features |
+| ---------------------------- | --------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------- |
+| Neon | serverless PostgreSQL | free tier + branch compute limits | modern serverless Postgres with branching workflow |
+| Aiven PostgreSQL | managed relational DB | small free plans + paid | managed operations across cloud providers |
+| CockroachDB Cloud | distributed SQL (Postgres-compatible) | free storage quota | horizontal scaling and consistency |
+| TiDB Cloud | distributed relational (MySQL-compatible) | free cluster quotas | strong distributed MySQL-compatible architecture |
+| MongoDB Atlas | document NoSQL | free M0 cluster | flexible document modeling |
+| SQLPub | multi-database platform | free request/storage quotas | one-stop multi-DB service |
+
+Different options optimize different dimensions: flexibility, cost, ease of use, compliance, ecosystem fit, and scalability.
+
+# Summary
+
+In today's lesson, we systematically learned foundational database concepts, Supabase core definitions, and practical operation details. During later project practice, you can always come back to this document as a reference based on your specific application scenario and requirements.
+
+Please always remember one key principle: **Ship first, perfect later.** You do not need to achieve everything in one step. Through continuous iteration and optimization, we can gradually approach better outcomes. Wish you smooth progress in your upcoming project practice.
+
+# 📚 Homework
+
+1. Build an application with user management + database support.
+ Try to include additional Supabase features (Realtime / cloud storage / Edge function).
diff --git a/docs/en/stage-2/backend/2.3-ai-interface-code/index.md b/docs/en/stage-2/backend/2.3-ai-interface-code/index.md
new file mode 100644
index 0000000..97c7c88
--- /dev/null
+++ b/docs/en/stage-2/backend/2.3-ai-interface-code/index.md
@@ -0,0 +1,178 @@
+# Using LLMs to Write API Code and API Documentation
+
+In the previous chapters, we learned how to use tools like Figma to create UI drafts, how to use AI to quickly generate static frontend pages, and how to use Supabase to build databases and basic authentication. That naturally leads to a new question: when someone clicks those lively buttons on the frontend, how does the data actually get stored in Supabase? And when we need more complex business logic such as concurrent payments, scheduled pushes, or sensitive data processing, is it still safe to let the frontend talk directly to the database?
+
+That question introduces one of the most important parts of modern web architecture: the **backend API**.
+
+In the past, backend developers often wrote hundreds or thousands of lines of routing, controller, and validation logic by hand. Today, we can hand much of that repetitive scaffolding to large language models. In this chapter, we will move beyond vague "AI-generated code" and look at a real workflow for using strong prompts to guide an LLM into writing solid Node.js backend interfaces, plus the corresponding documentation and test cases.
+
+> 💡 **Prerequisites**
+>
+> Before starting this chapter, it helps to understand:
+> - [From Database to Supabase](../2.2-database-supabase/) for basic database and data-model concepts
+> - [Git and GitHub Workflow](../2.4-git-workflow/) for project collaboration and version control
+> - [What Is the Terminal / Command Line](/en/appendix/2-development-tools/command-line-shell) for project initialization and startup commands
+
+# What you will learn
+
+1. **What an API is**: Understand the bridge between frontend and backend, plus basic RESTful design.
+2. **How LLMs help service construction**: Use structured prompts to generate a clean Node.js + Express starter project.
+3. **Interface logic development**: Guide the model to generate CRUD APIs with proper business validation and Supabase integration.
+4. **Automatic API documentation**: Ask the model to reverse-generate OpenAPI/Swagger docs from your code.
+5. **Testing and integration loops**: Use the model to create Postman collections and Jest unit tests to protect code quality.
+
+---
+
+# 1. Why do we need APIs?
+
+Traditionally, the frontend is "the visible part" and the database is "the storage room." But something is missing between them: a coordinator.
+
+If you imagine the application as a restaurant:
+
+- The **frontend (client)** is the menu and ordering table, where customers browse and make requests.
+- The **database (Supabase, etc.)** is the kitchen storeroom, where ingredients and records are kept.
+- The **backend API** is the waiter. Customers should not run straight into the kitchen to grab ingredients. Instead, they tell the waiter what they want through an HTTP request. The waiter checks the request, verifies permissions, talks to the kitchen, and brings the result back through an HTTP response, usually in JSON.
+
+Through APIs, we achieve a clean **frontend-backend separation**: the frontend focuses on rendering, while the backend focuses on business logic, data processing, and security.
+
+---
+
+# 2. Project architecture and initialization
+
+A clear project skeleton is a prerequisite for getting high-quality code from an LLM. Before you ask AI to write code, you should already have a mental model of the structure you want.
+
+## 2.1 A common API project structure
+
+Even if an LLM is generating the code, you should not dump everything into one `server.js` file. A maintainable Node.js backend usually looks something like this:
+
+```text
+my-api-project/
+├── .env # Sensitive environment variables such as API keys and DB URLs
+├── server.js # Project entry point: boot server, register global middleware
+├── package.json # Dependency management
+├── src/
+│ ├── routes/ # Route layer: define URLs and HTTP methods
+│ ├── controllers/ # Controller layer: process request params, call services, return responses
+│ ├── services/ # Service layer: database access and core business logic
+│ └── middlewares/ # Middleware: auth, global error handling
+└── docs/ # API documentation
+```
+
+## 2.2 Use AI to initialize the project
+
+Instead of manually running `npm init` and installing packages one by one, you can give the model the structure above in prompt form:
+
+> 🗣️ **Prompt example**
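+> "Help me scaffold a Node.js + Express backend project that can connect to Supabase. Use the directory structure above (routes, controllers, services, middlewares), read secrets from `.env`, and start the server on port 3000. Keep the structure clean and easy to maintain later."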
+
+If the prompt is good, the code you get back can already give you a backend app with a solid foundation running on `localhost:3000`.
+
+---
+
+# 3. Core practice: using LLMs to develop APIs
+
+This is the heart of the chapter. When LLM-generated code feels superficial or unsafe, the root cause is usually missing context. **LLMs are not afraid of complex requirements. They are afraid of vague ones.**
+
+Take the `menu_items` insert API from the [database chapter](../2.2-database-supabase/) as an example.
+
+## 3.1 Give the model full context
+
+Before asking the model to write an API, provide both the **database schema** and the **business constraints**.
+
+> 🗣️ **High-quality prompt template**
+> "Help me write an API for creating a menu item. Each item includes a product name, price, category (burger, snack, drink), and whether it is listed. Product name and price are required. Price cannot be negative. Return helpful validation errors when the user input is invalid."
+
+## 3.2 Review the generated code
+
+A good model will often separate responsibilities clearly, for example:
+
+```javascript
+// services/menuService.js
+const { createClient } = require('@supabase/supabase-js');
+const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);
+
+exports.createMenuItem = async (menuData) => {
+ // Push data into the table via the Supabase SDK
+ const { data, error } = await supabase
+ .from('menu_items')
+ .insert([menuData])
+ .select();
+
+ if (error) throw new Error(`Database insert failed: ${error.message}`);
+ return data[0];
+};
+```
+
+You can see that, with enough context, the model generates something structurally cleaner: Supabase initialization is separated, errors are handled, and the code is easier to reason about. That is very different from the spaghetti code you usually get from a vague request like "write a create endpoint."
+
+---
+
+# 4. Free your hands: generate API documentation automatically
+
+For a development team, an undocumented API is a blind box. Frontend engineers cannot guess what parameters are required or what the response shape will be. The most common API description standard in the industry is **OpenAPI** (formerly known as the Swagger Specification).
+
+Writing Swagger YAML or JSON by hand used to be painful and error-prone. Now it is one of the areas where LLMs help the most.
+
+You can select your `routes` and `controllers` code and ask:
+
+> 🗣️ **Documentation prompt**
+> "Generate API documentation from the code above. Clearly explain what every parameter means and what data the endpoint returns, so the frontend team can integrate it easily."
+
+You can even ask the model to fill in descriptions and mock example values such as `price_cents: 1200` for a $12.00 item. That reduces a lot of back-and-forth communication.
+
+---
+
+# 5. Safeguards: generate tests and Postman collections
+
+After the code and docs are ready, there is still one more step: verifying that everything actually works.
+
+## 5.1 Generate Postman or Apifox test configurations
+
+When developing APIs, we often use tools like Postman to simulate HTTP requests. Without AI, you usually have to fill in URLs, headers, and JSON request bodies manually.
+
+You can simply tell the model:
+
+> "Convert this API documentation into a Postman-importable format and include both successful and failing request examples."
+
+Once you save the returned JSON as something like `menu_api.json` and import it into Postman, you instantly get a ready-to-use testing panel.
+
+## 5.2 Write automated unit tests
+
+If you want stricter engineering quality, you can also ask the model to write tests with `Jest` or a similar framework. That is especially useful for boundary conditions, such as ensuring a negative price is rejected before data reaches the database.
+
+---
+
+# 6. Backend API best practices you still need to know
+
+Even with AI support, you are still the gatekeeper of the system. You need to review the generated code against a few important principles:
+
+1. **RESTful path naming**
+ - Good: `GET /api/users` for listing users, `POST /api/users` for creating users
+ - Bad: `POST /api/getUser` or `POST /api/createUser`
+ The URL should represent the resource. The action belongs to the HTTP method.
+
+2. **Correct HTTP status codes**
+ - `200/201`: request succeeded / resource created successfully
+ - `400`: bad request, invalid parameters or missing required fields
+ - `401/403`: unauthorized / forbidden
+ - `404`: resource not found
+ - `500`: server error, such as backend exceptions or database failures
+ Do not expose full backend stack traces to the frontend.
+
+3. **Never trust user input**
+ Frontend input can be forged. All important validation must run again on the backend.
+
+# 7. Summary
+
+After this chapter, your role should start to feel different. You are no longer just a typist trapped in syntax and punctuation. You are becoming a **system designer and architecture coordinator**.
+
+You have now learned:
+
+1. The core systems thinking behind **APIs and frontend-backend separation**
+2. How to dramatically improve LLM-generated backend code by providing **good context and layered structure**
+3. How to turn tedious **documentation writing** and **test creation** into automation tasks that AI handles well
+4. How to combine this with what you already learned about **Supabase** to complete the full flow from frontend request to database update
+
+::: tip Next Step
+Once your data flow and backend service are ready, they still only run locally on your own machine. In the next chapter, we will learn how to **deploy** that service to a public server so your product can be accessed by real users.
+:::
diff --git a/docs/en/stage-2/backend/2.4-git-workflow/index.md b/docs/en/stage-2/backend/2.4-git-workflow/index.md
new file mode 100644
index 0000000..e0773e2
--- /dev/null
+++ b/docs/en/stage-2/backend/2.4-git-workflow/index.md
@@ -0,0 +1,255 @@
+# Git and GitHub Workflow
+
+In previous chapters, we learned how to use web-based vibe coding tools to write code. Each conversation could generate a new version of the code. But that raises an important question: if we want to return to an earlier version, is there a convenient way to do it? Is there a tool that can record our code at different stages so we can switch between versions freely?
+
+That is exactly why version control software exists. In this chapter, we will introduce the most famous version control system, **Git**, and the most popular code hosting platform, **GitHub**. You will learn how to manage code with Git, how to download code from GitHub, how to upload your own work, and how to collaborate with others on larger projects.
+
+Whether you are tracking changes in a personal project, synchronizing code with teammates, or contributing to open source, Git and GitHub are essential tools for modern developers. Once you understand them, you can manage code more confidently, create checkpoints whenever needed, move between different stages of a project, and keep every change traceable.
+
+> 💡 **Prerequisites**
+>
+> Before learning Git, it helps to understand:
+> - [What Is the Terminal / Command Line](/en/appendix/2-development-tools/command-line-shell)
+> - [What Is Git](/en/appendix/2-development-tools/git-version-control)
+>
+> This chapter focuses on the GitHub workflow and hands-on usage, while the links above cover the core fundamentals.
+
+# Quick start with Git
+
+Before using Git, make sure you already understand the basics of the command line and Git itself. This chapter assumes you have that foundation and moves directly into installation, configuration, and practical GitHub collaboration.
+
+## How to install Git
+
+We will briefly walk through installation on the three major operating-system families.
+
+### Windows
+
+1. Go to the [official Git download page](https://git-scm.com/download/win) and download the installer that matches your system. In most cases, the x64 installer is recommended.
+2. Double-click the installer and follow the setup wizard:
+ 
+ 1. In most cases, keeping the default settings is fine. If you customize them, pay attention to:
+ - **Default editor**: you can keep Vim, or choose Visual Studio Code if you already have it installed.
+ 
+ - **How Git is used from the command line**: a practical default is the option that adds Git to the command line and third-party software without overcomplicating the system setup.
+ 
+3. After installation, right-click on the desktop. If you see `Git Bash Here`, the installation succeeded.
+
+
+
+### macOS
+
+On macOS, you can first run `git --version` in Terminal to check whether Git is already installed. If it is not, macOS often prompts you to install the developer tools automatically.
+
+1. Method 1: install with Homebrew
+ If you have [Homebrew](https://brew.sh/), open Terminal and run `brew install git`
+2. Method 2: install Xcode tools
+ You can also install Xcode or the Xcode Command Line Tools from Apple. Git is included as part of that toolchain.
+
+### Linux
+
+Most Linux distributions install Git through the system package manager:
+
+- Ubuntu / Debian:
+
+```bash
+sudo apt update
+sudo apt install git
+```
+
+- CentOS / RHEL:
+
+```bash
+sudo yum install git
+```
+
+To verify the installation, run `git --version`. If a version number appears, Git is ready.
+
+## Initialize Git identity
+
+After installing Git, the first thing you should do is configure your user information. Run the following commands in the terminal and replace the values with your own:
+
+```bash
+# Set the global username shown in commit history
+git config --global user.name "Your Name"
+
+# Set the global email, ideally the same one you use on GitHub
+git config --global user.email "your.email@example.com"
+```
+
+Git writes this information into every commit as the author identity. When you inspect the version history, you can clearly see who changed what and communicate more easily in collaborative projects.
+
+You can confirm the configuration with:
+
+```bash
+git config --list
+```
+
+# What is GitHub?
+
+GitHub is a code hosting platform built on top of Git. It provides remote storage for Git repositories and adds collaboration tools such as Issues, Pull Requests, and Projects. In simple terms, Git is the local version-control tool, while GitHub is the remote code warehouse and collaboration layer.
+
+GitHub is also the world's largest and most influential open-source community. The idea of open source is that anyone can download and run the source code of a project. That allows people around the world to inspect each other's work, improve it, and build new things on top of it.
+
+
+
+Large companies often open-source tools and tutorials on GitHub as part of their technical strategy. In the GitHub ecosystem, the number of `stars` a project receives is one of the most visible indicators of trust and influence.
+
+
+
+In this course, many supporting resources and assignments are also published in GitHub repositories. By learning to upload your own work there, you gradually build the workflow you will use for real application development later.
+
+## Create a GitHub account
+
+1. Visit [GitHub](https://github.com/) and click `Sign up` in the top-right corner.
+ 
+2. Enter your email address, create a password, and complete the verification steps.
+3. Confirm your email, and your account is ready.
+
+## Create your first repository on GitHub
+
+Next, let's create your first repository, often shortened to `repo`.
+
+
+
+
+
+When creating a repository, the main fields mean:
+
+1. **Repository name**: the public-facing name of the repository
+2. **Description**: a short explanation of what the repository is for
+3. **Visibility**:
+ - `Private`: only you and people you explicitly invite can see it
+ - `Public`: anyone can see it
+4. **README**: it is good practice to add a README. Think of it as the repository's introduction and usage guide.
+5. **.gitignore and license**:
+ 1. `.gitignore` tells Git which files or folders should not be tracked, such as temporary files, dependency folders, or local secrets.
+ 2. `license` determines how others are allowed to use your open-source code.
+
+For your first repository, it is reasonable to check `Add README`, set the visibility to `Private`, and fill in a name and description you like. Then click `Create repository`.
+
+
+
+You will now have a clean repository, ready for your files.
+
+
+
+To download a repository, you use `git clone`, which requires the repository URL. You can find that by clicking the green `Code` button. GitHub usually shows both HTTPS and SSH options.
+
+
+
+In general, HTTPS is fine for temporary downloads or quick testing, but for your own daily development workflow, SSH is usually the better experience.
+
+## Bind local SSH to GitHub
+
+In GitHub, "binding SSH" means connecting your local machine's SSH public key to your GitHub account so GitHub can recognize your device through the SSH protocol. Once set up, you can `clone`, `pull`, and `push` securely without re-entering passwords every time.
+
+In plain language: it is like giving your device a special access card for GitHub.
+
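+> 💡 What is SSH? SSH (Secure Shell) is a network protocol for securely logging in to, and authenticating with, remote machines over an encrypted connection.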
+
+### Why use SSH authentication?
+
+GitHub supports two major protocols for repository operations:
+
+- **HTTPS**: requires a Personal Access Token (or a credential helper) for pushes; GitHub no longer accepts account passwords for Git operations
+- **SSH**: uses a key pair, so you do not need to repeat authentication constantly
+
+SSH binding is the prerequisite for using GitHub with SSH. You must upload your local SSH public key to GitHub so GitHub can verify your machine.
+
+### The core logic: SSH key pairs
+
+SSH authentication depends on a key pair:
+
+1. **Private key**: stored on your local machine, never shared
+2. **Public key**: uploaded to GitHub
+
+When you perform a Git operation over SSH:
+
+- Your machine signs the request with the private key
+- GitHub checks it against the public key you uploaded
+- If the match succeeds, the operation is allowed
+
+### The actual steps
+
+The core workflow is simple: **generate a key pair → upload the public key to GitHub**.
+
+1. **Generate an SSH key pair locally**
+ 1. **Use Trae to help generate it**
+ Prompt:
+ `Help me create the SSH key needed for GitHub login. My email is your_email@gmail.com. Please return the public key for me to copy.`
+
+ 
+
+ After entering the prompt, you may still need to press `Enter` in the terminal pane so the command can continue. Once Trae finishes, it will show you the public key to copy.
+
+ 
+
+ 2. **Generate it manually**
+ Open your terminal and run `ssh-keygen -t ed25519 -C "your_email@example.com"`
+ Press `Enter` to accept the defaults unless you want a custom path or passphrase. This creates:
+
+ - `id_ed25519`: your private key, which must stay local
+ - `id_ed25519.pub`: your public key, which you will upload to GitHub
+
+2. **Upload the public key to GitHub**
+
+ This is the binding step itself.
+
+ 1. Copy the public key:
+ - On Windows, open `C:\Users\<your-username>\.ssh\id_ed25519.pub`
+ - On macOS/Linux, run `cat ~/.ssh/id_ed25519.pub`
+ 2. In GitHub, go to your avatar → `Settings` → `SSH and GPG keys` → `New SSH key`
+ 
+ 3. Enter a title and paste the public key.
+
+
+
+
+
+3. **Verify the binding**
+
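+Run `ssh -T git@github.com`. The first time you connect, SSH asks you to confirm GitHub's host fingerprint; type `yes` to continue.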
+
+If you see a message similar to `Hi [your GitHub username]! You've successfully authenticated...`, the setup worked.
+
+### Important notes
+
+- If you use multiple devices, create a separate SSH key pair for each one and upload each public key to the same GitHub account.
+- Never share your private key.
+- After setting up SSH, use SSH repository URLs such as `git@github.com:username/repository.git`, not HTTPS URLs.
+- If you cloned a repository over HTTPS earlier, you can switch it with `git remote set-url origin git@github.com:username/repository.git`
+
+# Use Trae for GitHub operations
+
+Now that we have covered Git, GitHub, SSH, and the setup process, you can start asking Trae to help with Git operations.
+
+## `git clone`: download an existing repository
+
+You can directly tell Trae which repository URL you want to clone.
+
+
+
+## `git pull`: fetch the latest remote updates
+
+Before editing, especially in a shared repository, you should pull the latest changes first.
+
+**Always include the folder name and its relative or absolute path so you do not pull in the wrong repository by mistake.**
+
+Prompt:
+`Help me pull this repository AIID-TEST in ./AIID-TEST.`
+
+## `git commit` and `git push`: stage, save, and upload your updates
+
+After you modify files locally, you can ask Trae to detect the changes and help you push them to GitHub.
+
+Prompt:
+`I finished. Commit and push to the repository AIID-TEST in ./AIID-TEST.`
+
+
+
+If the push succeeds, you will be able to see the updated content on GitHub immediately.
+
+# References
+
+- Pro Git book: https://git-scm.com/book/en/v2
+- GitHub Docs: https://docs.github.com/en
diff --git a/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md b/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md
new file mode 100644
index 0000000..6ed8a83
--- /dev/null
+++ b/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md
@@ -0,0 +1,517 @@
+# How to Deploy Web Applications
+
+In this tutorial, we will walk through how to deploy your web application to the internet so other people can access it. We will introduce four common deployment platforms: **Tencent Cloud CloudBase**, **Vercel**, **Netlify**, and **Zeabur**. The goal is to help you go from "I finished writing the code" to "other people can visit my site online."
+
+# What does "deployment" mean?
+
+Before we begin, let's clarify what deployment actually is.
+
+For any website to be visited by external users, it must have a publicly reachable network address. That can be an IP address such as `123.45.67.89`, or a domain such as [google.com](https://google.com/). But the address alone is not enough. Your code, such as HTML, CSS, JavaScript, or React/Vue projects, as well as images and video assets, must live on a server that stays online 24/7 and can answer incoming requests.
+
+
+
+Image source: https://www.hostinger.com/tutorials/what-is-cloud-hosting
+
+The full process of uploading resources, configuring the runtime environment, and making the service run is called **deployment**.
+
+In simple terms: if your website runs only on your own computer, then only you can visit it locally because the files only exist on your hard drive. Deployment means moving your code and assets to a public-facing server, configuring that server properly, and making sure it knows how to respond when someone visits your domain.
+
+If you deploy everything manually, a project usually involves many steps:
+
+1. **Prepare a server**
+ You first need to buy or rent a cloud server from a provider such as Alibaba Cloud, Tencent Cloud, or AWS EC2. Then you choose its region, CPU, memory, and storage, and learn how to connect to it remotely, often through SSH.
+ 
+
+2. **Configure the runtime environment**
+ Web apps only run under the correct environment. A Node.js project needs Node installed. A Python project needs Python and its dependencies. If the versions do not match, the app may fail to start.
+
+3. **Upload your files**
+ You need to move your local code and assets to the server, often via Git or file-transfer tools. Large projects can make this step frustrating if uploads break halfway through.
+
+
+
+4. **Start the service and test it**
+ After upload, you need to start the app and check whether the assigned address works. If not, the problem may be a firewall-blocked port, or it may be an application bug. In that case, you need to inspect logs.
+
+5. **Maintain and update**
+ Every code update usually means another upload and restart. If the server crashes, you may need to restart services manually or configure a process manager to keep them alive.
+
+Platforms such as CloudBase, Vercel, Netlify, and Zeabur exist to eliminate much of that complexity. They automate the boring parts:
+
+- buying and provisioning servers
+- configuring runtimes
+- pulling code
+- starting services
+- monitoring uptime
+
+In many cases, you just connect a GitHub repository or upload your code, and the platform does the rest.
+
+
+
+---
+
+# Deployment platform comparison
+
+| Platform | Main strengths | Best for | Free tier |
+|------|------|----------|----------|
+| **Tencent Cloud CloudBase** | Fast access within mainland China, strong WeChat ecosystem integration | China-focused users, WeChat Mini Program support | Yes |
+| **Vercel** | Excellent support for frontend frameworks, tight GitHub integration | Modern React/Vue/Next.js frontend projects | Yes |
+| **Netlify** | Broad feature set, great Git workflow, form handling, auth support | Static sites that also need forms or auth | Yes |
+| **Zeabur** | Flexible service combinations and many templates | More complex projects, including tools like Dify and n8n | About $5/month in free quota |
+
+---
+
+# 1. Tencent Cloud CloudBase
+
+Tencent Cloud CloudBase is Tencent's integrated cloud backend platform and is especially friendly for developers targeting domestic Chinese users.
+
+Its advantages include:
+
+- **Fast domestic access**
+- **WeChat ecosystem integration**
+- **An all-in-one backend solution** including static hosting, cloud functions, databases, and storage
+- **A practical free tier**
+
+## Deploy a web app with CloudBase
+
+### Step 1: Register and log in
+
+Visit the [Tencent Cloud CloudBase Console](https://console.cloud.tencent.com/tcb) and log in with WeChat or QQ.
+
+### Step 2: Create an environment
+
+Click `Create Environment` and choose an environment name such as `my-web-app`.
+
+> ⚠️ **Note**: the free trial version of CloudBase often requires a redemption code. You usually need to follow the CloudBase official account and obtain a code there.
+
+### Step 3: Enable static website hosting
+
+Inside the environment management screen, enable the `Static Website Hosting` feature. Once enabled, you will receive a default public domain.
+
+CloudBase supports several deployment methods:
+
+- upload a local build output
+- deploy from a template
+- deploy from a Git repository
+
+### Step 4: Deploy your code
+
+CloudBase offers three main workflows:
+
+**Option 1: upload a local project**
+
+- choose `Local Project Deployment`
+- upload your built static files such as HTML, CSS, and JS
+- typically upload a `dist` or `build` directory
+
+**Option 2: use a template**
+
+- start from a preset project template
+- common options include React and Vue starter templates
+
+**Option 3: deploy from Git**
+
+- connect a GitHub repository
+- set the build command, such as `npm run build`
+- every push can trigger an automatic redeploy
+
+> 💡 **Tip**: you can also deploy from the command line:
+>
+> ```bash
+> # Install CloudBase CLI
+> npm install -g @cloudbase/cli
+> # Log in
+> tcb login
+> # Deploy
+> tcb hosting deploy ./dist -e your-env-id
+> ```
+
+### Step 5: Add a custom domain (optional)
+
+CloudBase also supports binding your own domain and applying a free HTTPS certificate.
+
+---
+
+# 2. Vercel
+
+Vercel is one of the most popular frontend deployment platforms in the world and is especially good for React, Vue, and Next.js projects.
+
+Its main strengths:
+
+- **Deep GitHub integration**
+- **Automatic preview deployments for pull requests**
+- **Global CDN distribution**
+- **Support for serverless functions**
+
+> ⚠️ **Note**: in some mainland-China network environments, Vercel may be less stable than domestic options such as CloudBase.
+
+## Deploy a web app with Vercel
+
+### Step 1: Register
+
+Visit [Vercel](https://vercel.com) and sign in with GitHub.
+
+### Step 2: Import a project
+
+1. Click `Add New Project`
+2. Select the GitHub repository you want to deploy
+3. If needed, adjust GitHub app permissions
+
+### Step 3: Configure build settings
+
+Vercel often detects the framework automatically:
+
+| Framework | Build command | Output directory |
+|------|----------|----------|
+| React | `npm run build` | `build` |
+| Vue | `npm run build` | `dist` |
+| Next.js | `next build` | - |
+| Plain HTML | - | project root |
+
+If detection fails, configure it manually:
+
+- **Build Command**
+- **Output Directory**
+- **Install Command**
+
+### Step 4: Deploy
+
+Click `Deploy` and wait for the build to complete. A successful project receives a `xxx.vercel.app` domain.
+
+### Step 5: Add a custom domain (optional)
+
+Use the `Domains` section in project settings to bind your own domain. HTTPS is handled automatically.
+
+---
+
+# 3. Netlify
+
+Netlify is another strong frontend deployment platform, especially for static sites and single-page applications.
+
+Its strengths:
+
+- **Feature-rich hosting**, including form handling, auth, and edge/serverless functions
+- **Strong Git integration**
+- **Preview links for branches**
+- **Global CDN**
+- **Built-in form handling**
+- **Built-in user authentication tools**
+
+> ⚠️ **Note**: Netlify may not be as fast as CloudBase for domestic Chinese users.
+
+## Deploy a web app with Netlify
+
+### Step 1: Register
+
+Visit [Netlify](https://www.netlify.com) and sign up with GitHub, GitLab, Bitbucket, or email.
+
+### Step 2: Import a project
+
+1. Click `Add new site` → `Import an existing project`
+2. Choose your Git provider
+3. Authorize Netlify
+4. Select the repository
+
+### Step 3: Configure build settings
+
+| Framework | Build command | Publish directory |
+|------|----------|----------|
+| React | `npm run build` | `build` |
+| Vue | `npm run build` | `dist` |
+| Angular | `ng build` | `dist/` |
+| Next.js | `next build` | `out` |
+| Plain HTML | - | `.` |
+
+### Step 4: Deploy
+
+Click `Deploy site`. Once it succeeds, you will receive a `xxx.netlify.app` domain.
+
+### Step 5: Add a custom domain (optional)
+
+1. Open the site settings
+2. Go to `Domain management`
+3. Add your custom domain
+4. Follow the DNS instructions
+
+### Useful Netlify features
+
+#### 1. Form handling
+
+Netlify can capture form submissions without requiring a dedicated backend.
+
+```html
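+<form name="contact" method="POST" data-netlify="true"><input type="email" name="email" /><button type="submit">Send</button></form>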
+```
+
+After deployment, Netlify automatically stores submission data and can forward it to email or other services.
+
+#### 2. Netlify Functions
+
+Netlify also supports serverless functions, which are useful for small APIs without maintaining a full backend.
+
+For example:
+
+```javascript
+exports.handler = async (event, context) => {
+ return {
+ statusCode: 200,
+ body: JSON.stringify({ message: "Hello from Netlify!" })
+ };
+};
+```
+
+After deployment, the function is accessible at:
+
+`https://your-domain/.netlify/functions/hello`
+
+#### 3. Local development support
+
+Netlify provides a CLI:
+
+```bash
+# Install Netlify CLI
+npm install -g netlify-cli
+
+# Log in
+netlify login
+
+# Start local development
+netlify dev
+
+# Test functions locally
+netlify functions:serve
+```
+
+This lets you simulate Netlify forms and function behavior locally before deploying.
+
+---
+
+# 4. Zeabur
+
+Zeabur is a newer deployment platform that is especially useful for more complex projects involving multiple services.
+
+Its main strengths:
+
+- **Many built-in service templates**
+- **Support for multiple deployment methods**
+- **Flexible multi-service composition**
+- **Usage-based billing**
+
+## Deploy Dify with Zeabur
+
+In earlier chapters, we already touched on Dify briefly. Now we can launch a full Dify service through [Zeabur](https://zeabur.com/projects) very easily.
+
+First, open the [console page](https://zeabur.com/projects):
+
+
+
+In that interface, you will see a set of service blocks. At the top are options such as `Agent`, `Servers`, `Docs`, and `Templates`:
+
+1. **Agent**: Zeabur's built-in assistant for operational questions
+2. **Servers**: add or buy cloud servers
+3. **Docs**: official documentation
+4. **Templates**: built-in application templates
+
+> An **image** can be understood as a packaged runtime environment + application state. If a service has already been configured successfully on one machine, it can be packed into an image and reused elsewhere.
+
+In the upper-right corner, you can also see your balance. By default, Zeabur usually gives you a small monthly free quota, roughly around 5 USD worth of usage.
+
+
+
+You can click the balance to inspect daily usage:
+
+
+
+Now let's create a Dify service.
+
+Start by clicking `New Project` on the [console homepage](https://zeabur.com/projects):
+
+
+
+Zeabur supports several ways to create a service:
+
+1. **GitHub**
+ Connect your GitHub account and deploy directly from a repository.
+2. **Template**
+ Start from a built-in app template such as Dify or n8n.
+ 
+3. **Databases**
+ Deploy databases such as MySQL or MongoDB.
+ 
+4. **Functions**
+ Deploy JavaScript or Python functions.
+ 
+ 
+5. **Local Project**
+ Upload a local folder and let Zeabur detect how to run it.
+ 
+6. **Docker Image**
+ Deploy from an already built Docker image.
+ 
+7. **Cursor**
+ Deploy directly from a project you are editing in Cursor.
+
+If you want to deploy Dify, the easiest path is **Template**. Search for `dify`, choose a version you like, and continue.
+
+
+
+Then choose any project name. Zeabur will generate a temporary domain based on that name.
+
+
+
+After creation, you will see multiple services starting one after another. Dify is not a single program, but rather a group of coordinated services, so you need to wait until they are all running.
+
+In many setups, you can click the main Dify app to get the access address. In this example, however, the final entry point is exposed through `nginx`, so you need to open the `nginx` service and find the public service address there.
+
+
+
+After waiting a bit, you should see the Dify login screen. Register an account with your email and password, and your own Dify service is ready.
+
+
+
+You can also launch `n8n` in a similar way if you want another AI workflow tool:
+
+
+
+## Deploy a Snake game with Zeabur and Trae
+
+To explore more of Zeabur's deployment options, let's start with something simpler than Dify: a Snake game generated with Trae.
+
+### Deploy an HTML-based version
+
+
+
+Trae can generate a browser-based Snake game from plain HTML very easily. Once the project is created locally, you can upload the whole folder to Zeabur using the local-project deployment method described above.
+
+
+
+After deployment, you will enter the service details page:
+
+
+
+Click `Network` on the left, find `Public Address`, and click `Generate Domain` to create a public URL.
+
+
+
+
+Once that address is generated, opening it in the browser will let you play your Snake game publicly:
+
+
+
+This same method works well for other static HTML-based web apps too.
+
+### Deploy a React version
+
+Now let's deploy a React app instead of a plain HTML app. Compared with static HTML, React is a more modern and component-based frontend framework, and it is common in production applications.
+
+
+
+#### Refactor into a React architecture
+
+In Trae, you can simply say:
+
+`Help me refactor this code into a React architecture.`
+
+
+
+However, React apps are a bit more demanding to deploy because they rely on a build toolchain and a more structured project layout.
+
+One especially important issue is the **port**. A local React development server often listens on port `3000` by default. Zeabur, however, expects the deployed app to listen on port `8080`.
+
+If your React app still listens on `3000`, the deployment may fail because Zeabur cannot route traffic to it correctly.
+
+#### What is a port?
+
+You can think of the IP address as the building address and the port number as the room number. Together, `IP:port` points to a specific service.
+
+Most websites do not explicitly show a port because browsers automatically assume the default ports:
+
+- `80` for HTTP
+- `443` for HTTPS
+
+But for app-specific services such as React development servers (`3000`) or Zeabur deployments (`8080`), the port becomes important.
+
+#### What does "listening on a port" mean?
+
+When a program listens on a port, it is telling the operating system:
+
+`I am waiting here for incoming network requests. Send them to me.`
+
+In the building analogy, the IP is the building address, and the port is the room number. The React dev server opens room `3000` and tells the building manager, "Any requests addressed to room 3000 should be delivered to me."
+
+When you run `npm start` locally, React commonly chooses port `3000`. Zeabur, however, is designed to work with apps listening on `8080`, so you need to change the default.
+
+#### Change the default listening port
+
+The easiest way is simply to ask Trae:
+
+`Please help me change the default port of this React project to 8080.`
+
+Trae can modify the relevant configuration for you. After that, rebuild the project and upload it to Zeabur again.
+
+
+
+
+Once you configure the public network address just as you did for the HTML project, the React app can also be served successfully.
+
+
+
+
+The same idea applies to any other app that needs a port adjustment before deployment.
+
+---
+
+# ⚠️ How to pause or delete a Zeabur project
+
+Because server resources cost money, you should always get in the habit of stopping services you are no longer using.
+
+Open the project's `Settings`:
+
+
+
+Scroll to the bottom, and you will see controls like the following:
+
+
+
+You can:
+
+- click `Suspend All Services` to pause everything and reduce cost
+- click `Restart All Services` to restart services if something is stuck
+- click `Delete Project` if you are sure you no longer need it
+
+---
+
+# Summary
+
+In this tutorial, we introduced four common deployment platforms:
+
+1. **Tencent Cloud CloudBase**: good for domestic Chinese users and strong WeChat integration
+2. **Vercel**: excellent for modern frontend frameworks and GitHub-driven workflows
+3. **Netlify**: strong for static sites that also need forms, auth, and other hosting features
+4. **Zeabur**: very useful for more complex projects with multiple services and templates
+
+Which one you choose depends on your needs:
+
+- For primarily domestic Chinese audiences, **CloudBase** is often the best first choice
+- For React, Next.js, and similar stacks, **Vercel** or **Netlify** are strong options
+- For static sites that also need forms or auth, **Netlify** is especially useful
+- For Dify, n8n, and other multi-service setups, **Zeabur** is often the easiest
+
+No matter which platform you choose, the deployment workflow is conceptually similar:
+
+**prepare the code → choose a platform → configure the build → deploy it**
+
+Once you understand that loop, you can start publishing your own projects for the world to use.
diff --git a/docs/en/stage-2/backend/2.6-modern-cli/index.md b/docs/en/stage-2/backend/2.6-modern-cli/index.md
new file mode 100644
index 0000000..c87a73a
--- /dev/null
+++ b/docs/en/stage-2/backend/2.6-modern-cli/index.md
@@ -0,0 +1,702 @@
+# CLI AI Coding Tools
+
+In this tutorial, we introduce AI coding agents that run directly in the command line. They are different from the agents we used earlier in Trae and Cursor. CLI AI coding tools can only be used in the terminal. Compared with agents integrated into AI IDEs, they usually have longer context windows, faster tool-calling speed, and compatibility with a wider range of large models. In the latest AI Vibe Coding practice, we often prioritize CLI AI coding tools over built-in IDE coding agents.
+
+## Starting from the CLI
+
+Do you still remember the CLI we introduced before? CLI means using pure text commands in a terminal or command prompt to operate software applications, instead of relying on a graphical user interface (GUI). You can simply think of a GUI as the clickable interface with buttons on a computer or phone, where you do not need to type commands.
+
+> On Windows, common terminals include Command Prompt (`cmd`) and PowerShell. You can type `cmd` or `powershell` in the Run/Search box to launch them.
+
+
+
+The CLI is naturally good for text-command workflows. Among a small group of geeks (programming enthusiasts pursuing extreme efficiency), CLI is even more popular than GUI. They want to complete everything with the keyboard and feel that moving the mouse can slow down coding efficiency.
+
+In industry, CLI is also often the most common interface form, because GUI requires the operating system to draw interfaces and manage windows, which demands more computer resources. CLI only needs to pass received commands to the system for execution. So when connecting to large-scale server clusters, we usually interact only through CLI.
+
+
+
+For many learners with no CLI experience, command-line operations can feel complicated, with too many commands, and even the fear of "accidentally breaking the computer." No need to worry. Remember how, in previous tutorials, we often asked Trae to help with basic operations? We can use exactly the same idea here. We can ask CLI coding tools to perform all CLI operations for us: entering specific folders, searching and processing files, running or copying open-source projects, and so on. The whole process can be completed through conversation with the CLI AI coding tool.
+
+## How Is It Different from an AI IDE
+
+We can compare CLI AI coding tools to z.ai and Trae that we used before. In a sense, CLI AI coding tools can be seen as a special kind of z.ai: they also only need a simple chat entry, and then they automatically perform the required operations (sometimes you just need to open a browser manually to check the final result). If compared to AI IDEs, CLI AI coding tools can be seen as the Agent module inside an IDE, which is the side chat panel.
+
+
+
+However, because different AI IDEs implement agents in different ways, their capability gaps are large, and AI coding quality is often unstable. CLI AI coding tools are usually developed directly by major tech companies, such as Anthropic behind Claude and OpenAI behind ChatGPT.
+
+Compared with other AI coding agents, directly using products from these major companies is often a better practice. Claude Code in particular is a tool used by Anthropic's own R&D teams, designed from the start around "meeting real engineer needs."
+
+To compare more intuitively, we can look at the difference between Claude Code and one AI IDE agent (Cursor as an example):
+
+| Feature | Claude Code | Cursor | Better Choice |
+| ------------------ | ----------------- | ------------------- | ------------- |
+| Automatic execution | ✅ Very strong | ❌ Limited | Claude Code |
+| IDE integration | ❌ CLI only | ✅ Native VS Code | Cursor |
+| Real-time completion | ❌ None | ✅ Excellent | Cursor |
+| Multi-file operations | ✅ Very strong | ⚠️ Pretty good | Claude Code |
+| GitHub integrated workflow | ✅ Can commit directly | ⚠️ More manual | Claude Code |
+| Learning cost | ⚠️ Medium | ✅ Easy to start | Cursor |
+| Context length | ✅ Very long | ⚠️ Good | Claude Code |
+| Debug assistance | ✅ Automated | ⚠️ More manual work | Claude Code |
+
+Table source: https://northflank.com/blog/claude-code-vs-cursor-comparison
+
+In short, CLI AI coding tools usually can:
+
+- Support much longer continuous conversations (they can even "work for you all day").
+- Provide longer context windows (you no longer need to frequently say "continue").
+- Respond faster (with support for more custom model APIs).
+
+For coding-related operations, they are usually smarter and more stable than most IDE built-in agents.
+
+## Common CLI AI Coding Tools
+
+Although there are many open-source implementations now, in practice we only recommend two major types of CLI AI coding tools as the "preferred combo." You can choose either one based on your habits, and we strongly recommend trying both before deciding which suits you best.
+
+- Codex uses GPT-5 and is stronger overall in capability.
+- Claude Code, routed through GLM 4.6 compatible APIs, offers an experience close to Claude 4 at a lower cost.
+
+However, which one works better in your real project can only be determined by hands-on testing. Mastering multiple AI coding tools is always beneficial. Once you are skilled, you can switch flexibly among Claude Code, Codex, or Trae in different scenarios. If one tool does not perform well after multiple tries, just switch to another tool or model and continue experimenting.
+
+At the same time, because model versions update very quickly, we recommend prioritizing whichever option currently performs best in cost-performance (quality / cost).
+
+### Claude Code
+
+Claude Code is an AI coding tool developed by Anthropic based on Claude model capabilities. Its primary interaction happens in the terminal, and it can also be used as a VS Code extension. Similar to an agent inside an AI IDE, it can deeply understand a developer's repository and complete end-to-end development tasks through natural language instructions, including code editing, bug fixing, running and fixing tests, managing Git workflows (such as resolving merge conflicts and creating PRs), explaining complex code, and executing terminal commands.
+
+
+
+Claude Code's main advantages are: very long context windows (it can handle whole files or even small projects), proactively clarifying ambiguous requirements, automatically planning and allocating execution tasks, and deeply understanding and explaining the entire codebase. Compared with ordinary IDE agents, it is better suited for immersive vibe-coding workflows.
+
+In actual use, you can ask it through chat to create new projects, perform CLI operations (such as organizing folders, bulk renaming files, deploying open-source projects), and configure development environments (such as installing and debugging Python environments). If you find some code difficult to understand, or a folder structure unclear, you can directly ask Claude Code to generate structured analysis documentation or explain specific parts step by step.
+
+
+
+
+
+If you want to systematically learn Claude Code, you can refer to the course jointly launched by Andrew Ng and Anthropic:
+https://www.bilibili.com/video/BV176t2zSEpr
+
+Next, we will learn how to use Claude Code. Because directly using the official Claude Code is often very expensive (as shown below), we will instead use API platforms that are compatible with Claude Code protocol but based on other large models.
+
+
+
+You need to learn the different options below (it is best to try all of them), and finally choose the one that suits you best as your main path.
+
+The first approach is to directly use APIs that are "Anthropic-interface compatible." As Claude Code becomes more popular, more model providers now support Anthropic-style invocation. Common providers include GLM, Kimi, DeepSeek, and Siliconflow. They all provide compatible API interfaces. We will explain specific configuration details later.
+
+One thing to note: Claude Code usually consumes a lot of tokens. If you are worried about high API costs, you can consider GLM monthly plans (about 20 RMB/month) to control cost. If you first want to estimate actual spending, you can also recharge 10 RMB for small-scale experiments.
+
+Another approach is using the "Claude Code Router" project. It is an open-source tool that supports all common API invocation interfaces and allows fine-grained model configuration for different scenarios, including local model access. But this option is more complex to configure, so we suggest starting with the first approach.
+
+#### Use Zhipu GLM as the Backend (Recommended)
+
+GLM (General Language Model) is a series of large language models independently developed by Zhipu AI. GLM-4.6 is currently the latest version in the GLM family. Its core highlight is strong coding performance (on par with Claude Sonnet 4 in public benchmarks and real-world tasks, and considered top-tier among domestic models).
+
+
+
+It also extends the context window to 200K, allowing easier handling of long text and large codebases, while strengthening reasoning and tool-calling capabilities, achieving a good balance between performance and cost.
+
+
+
+Before connecting GLM, we first need to install Claude Code.
+
+If command-line installation feels troublesome, or errors appear midway, you can directly ask Trae's Agent to complete installation for you.
+
+```bash
+# Install Claude Code
+npm install -g @anthropic-ai/claude-code
+
+# Enter your project
+cd your-awesome-project
+
+# Start Claude Code
+claude
+
+# Press Ctrl+C to exit Claude
+```
+
+Next, we need to change Claude Code's default API request endpoint so it supports GLM's API service. You can copy the content below and ask Trae to create the corresponding environment variables for you. You can also choose to write them permanently into system environment variables (if issues occur, you can also ask Agent to help modify them).
+
+First, you need to obtain your GLM API key and store it in whatever way is most convenient for you.
+
+Domestic URL: https://bigmodel.cn/usercenter/proj-mgmt/apikeys
+International URL: https://z.ai/manage-apikey/apikey-list
+
+If you are using the **domestic GLM** service, use the following variable configuration:
+
+```bat
+# Run the following command in Cmd
+# Replace `your_zhipu_api_key` with the API key you just obtained
+setx ANTHROPIC_AUTH_TOKEN your_zhipu_api_key
+setx ANTHROPIC_BASE_URL https://open.bigmodel.cn/api/anthropic
+```
+
+If you are using the **international GLM** service, use this configuration:
+
+```bat
+# Run the following command in Cmd
+# Also replace `your_zai_api_key`
+setx ANTHROPIC_AUTH_TOKEN your_zai_api_key
+setx ANTHROPIC_BASE_URL https://api.z.ai/api/anthropic
+```
+
+You can directly enter a prompt like this in Trae:
+
+⚠️ If you configure "permanent environment variables" through Trae, then after configuration you **must restart Trae**. Otherwise environment variables in Trae's built-in terminal will not refresh, which may cause login failures or network connection errors.
+
+```text
+Based on my environment variable settings:
+setx ANTHROPIC_AUTH_TOKEN your_zai_api_key
+setx ANTHROPIC_BASE_URL https://api.z.ai/api/anthropic
+
+and my key (replace it with your own key):
+681fea485851d29060cc.13gfaendggaFOhb
+
+please help me configure and start Claude Code
+```
+
+You will see output similar to the following:
+
+
+
+> 💡 What is an environment variable?
+>
+> Environment variables are essentially key-value configuration entries stored in the operating system, usually in the form "variable name = specific value." If configured in advance in terminal or system settings, programs can read these variables at any time to obtain relevant information. Because environment variables can be written directly in terminal without modifying code, we usually store large-model access keys in environment variables to avoid leakage. Programs only need to read corresponding environment variables to complete model invocation.
+>
+> In Windows, besides storing model access keys, environment variables are also commonly used to store executable "path locations" for command-line tools.
+>
+> We know the terminal itself is also a program. Sometimes we want to launch an external program from terminal. For example, typing `claude` in terminal to launch Claude Code. The reason this works is that terminal reads system environment variables, and the PATH variable contains the directory where Claude Code executable resides, so terminal can find and execute it (equivalent to pasting that program's absolute path into terminal and pressing Enter).
+>
+> A typical environment variable may look like this: `PATH=C:\Windows\system32;C:\Program Files\Python`. Then we can execute those programs from any directory, for example directly typing `python` in command line to start the Python interpreter.
+>
+> If you want to view current system environment variables, type "environment variables" in Windows Search, then in the "Edit the system environment variables" window you can see all variables and their values. Some store model keys, while others add program directories for invocation from any path.
+
+Now you can use the latest GLM for Claude Code development. You can try rerunning previous projects, or retry tasks that Trae did not complete well, and compare the experience differences.
+
+🎉 Rebuilding repeatedly is not a waste of time. Every repetition makes your skills more solid.
+
+Using exactly the same logic as with GLM, you can also connect other interfaces that support Anthropic-compatible formats.
+
+#### Use Kimi K2 as the Backend (Recommended)
+
+Kimi K2 is a new-generation large language model released by Moonshot AI, with excellent performance in code understanding and generation. Kimi K2 supports ultra-long context windows (up to 200K tokens), and can easily handle large repositories and complex projects.
+
+**Core advantages:**
+- **Ultra-long context**: Supports 200K context window, enabling one-pass handling of whole-project code
+- **Strong coding ability**: Performs very well in generation, refactoring, and debugging
+- **Better Chinese understanding**: More accurate understanding of Chinese programming requirements
+- **Stable tool invocation**: Supports reliable function-calling and tool usage
+
+**Get API Key:**
+
+Visit https://platform.moonshot.cn/console/account to register and obtain an API key.
+
+**Configuration method:**
+
+Reference docs: https://platform.moonshot.cn/docs/guide/agent-support
+
+```bash
+export ANTHROPIC_BASE_URL=https://api.moonshot.cn/anthropic
+export ANTHROPIC_AUTH_TOKEN=sk-YOURKEY
+```
+
+#### Use Minimax as the Backend (Recommended)
+
+Minimax is a new-generation large language model released by MiniMax, with excellent performance on programming tasks. Minimax models are known for strong reasoning and code-generation quality, especially suitable for complex programming scenarios.
+
+**Core advantages:**
+- **Strong reasoning**: Performs well in complex logic reasoning and code architecture design
+- **High code quality**: Generated code is clear in structure and readable
+- **Multi-language support**: Supports code generation and conversion across multiple languages
+- **Fast response speed**: API responds quickly, suitable for high-frequency invocation scenarios
+
+**Get API Key:**
+
+Visit https://platform.minimax.io/ to register and obtain an API key.
+
+**Configuration method:**
+
+```bash
+export ANTHROPIC_BASE_URL=https://api.minimax.io/anthropic
+export ANTHROPIC_AUTH_TOKEN=YOUR_MINIMAX_API_KEY
+export ANTHROPIC_MODEL=MiniMax-M2.7
+```
+
+#### Use DeepSeek as the Backend (Recommended)
+
+DeepSeek is an open-source large language model released by DeepSeek, popular among developers for strong coding capabilities and high cost-performance. DeepSeek Coder is specially optimized through training for programming tasks.
+
+**Core advantages:**
+- **Outstanding coding capability**: Strong performance in code generation, understanding, and bug fixing
+- **Open-source and customizable**: Open-source model, can be fine-tuned based on needs
+- **High cost-performance**: Relatively low API pricing, suitable for high-frequency use
+- **Good Chinese support**: Accurate understanding of Chinese programming scenarios
+
+**Get API Key:**
+
+Visit https://platform.deepseek.com/usage to register and obtain an API key.
+
+**Configuration method:**
+
+```bash
+export ANTHROPIC_BASE_URL=https://api.deepseek.com/anthropic
+export ANTHROPIC_AUTH_TOKEN=YOUR_DEEPSEEK_API_KEY
+export API_TIMEOUT_MS=600000
+export ANTHROPIC_MODEL=deepseek-chat
+export ANTHROPIC_SMALL_FAST_MODEL=deepseek-chat
+export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
+```
+
+#### Use Volcano Engine Coding Plan as the Backend (Recommended)
+
+Volcano Engine is ByteDance's cloud service platform, providing enterprise-level AI model services. Volcano Engine's Coding Plan is specially optimized for coding scenarios, offering stable and efficient code-generation capability.
+
+**Core advantages:**
+- **Enterprise-grade stability**: Provides SLA guarantees for service stability
+- **Coding-scenario optimization**: Specifically optimized for programming tasks
+- **Rich model choices**: Supports multiple models including Doubao-pro and Doubao-lite
+- **Fast domestic access**: Domestic node deployment with faster access speed
+
+**Get API Key:**
+
+Visit https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey to register and obtain an API key.
+
+**Configuration method:**
+
+```bash
+export ANTHROPIC_BASE_URL=https://ark.volces.com/api/anthropic
+export ANTHROPIC_AUTH_TOKEN=YOUR_VOLCANO_API_KEY
+export ANTHROPIC_MODEL=doubao-pro-32k
+```
+
+#### Other Anthropic-Compatible APIs
+
+SiliconFlow:
+
+```bash
+export ANTHROPIC_BASE_URL="https://api.siliconflow.cn/"
+export ANTHROPIC_MODEL="moonshotai/Kimi-K2-Instruct-0905" # You can change to the model you need
+export ANTHROPIC_API_KEY="YOUR_SILICONCLOUD_API_KEY" # Replace with your API key
+```
+
+Aliyun DashScope (Aliyuncs): https://help.aliyun.com/zh/model-studio/get-api-key
+
+```bash
+export ANTHROPIC_BASE_URL="https://dashscope.aliyuncs.com/apps/anthropic"
+export ANTHROPIC_API_KEY="YOUR_DASHSCOPE_API_KEY"
+```
+
+::: details Use Claude Code Router as the Backend (Advanced Usage)
+
+Above we explained how to replace Claude Code's Anthropic interface with the official GLM API. Next, let's look at how Claude Code Router allows Claude Code to adapt to more model APIs.
+
+[Claude Code Router](https://github.com/musistudio/claude-code-router) is an intelligent routing enhancement tool designed specifically for Claude Code. Its core function is helping users distribute AI requests to models across different platforms as needed, with a high degree of customization. It supports access to dozens of platforms including OpenRouter, DeepSeek, Ollama, Gemini, and more. It can also route tasks to specific models by scenario, such as GLM-4.5, Kimi-K2, and Qwen3-Coder. For example, you can route background tasks to local Ollama to save cost, route long text / long code tasks to Gemini-2.5-Pro, and route code explanation to DeepSeek.
+
+
+
+This tool also provides convenient UI/CLI configuration management and uses converters to adapt API formats from different platforms. It supports automation integration such as GitHub Actions and custom extensions, solving the problems of "one single model cannot cover all scenarios" and "frequent platform switching is troublesome," helping users use AI tools more flexibly and at lower cost.
+
+
+
+Below is a quick introduction to installing Claude Code Router. The rough steps to prepare the environment are as follows (you can also ask Trae to execute them):
+
+```bash
+npm install -g @anthropic-ai/claude-code
+npm install -g @musistudio/claude-code-router
+```
+
+After installation, you need to confirm the `ccr` command is available locally. If you see output similar to the following, installation is successful:
+
+
+
+Next, there are two ways to initialize and configure models:
+
+- Use CCR's built-in UI and configure on its browser page.
+- Directly edit CCR's default configuration file (the UI essentially edits the config file as well, just with a more intuitive interface).
+
+If you choose CCR UI, you will see an interface similar to this:
+
+
+
+At this point, click the "Add Provider" button to see the following interface. You need to:
+
+1. Enter the provider name in Name;
+2. Fill in that provider's OpenAI-compatible endpoint in API Full URL;
+3. Fill in the corresponding platform API key in API Key;
+4. Fill model names in Models area, then click "Add Model";
+5. Finally click "Save" to persist configuration.
+
+(If you scroll downward there are many advanced options, but you can ignore them for now.)
+
+
+
+Here are configuration examples for DeepSeek and Kimi:
+
+
+
+
+
+After saving model configuration, you also need to specify the default model in the Router area on the right. Select from the dropdown and set it to `kimi` (recommended), then click `Save and Restart` in the top-right corner.
+
+
+
+After that, simply run `ccr code` in terminal to start Claude Code workflow through Claude Code Router.
+
+
+
+:::
+
+#### Advanced Usage of Claude Code
+
+Many people initially use Claude Code only as a normal chat tool. But in fact it has many built-in capabilities that can make your workflow more efficient and flexible. Here are common commands and usage examples:
+
+Reference docs:
+
+- https://docs.claude.com/en/docs/claude-code/cli-reference
+- https://docs.claude.com/en/docs/claude-code/slash-commands
+
+| Command | Purpose | Example |
+| ----------------- | ----------------------------------------- | ---------------------------------------- |
+| claude | Start interactive mode | `claude` |
+| claude "query"    | Start interactive mode with an initial prompt | `claude "explain this project"`          |
+| claude -p "query" | Ask one-off question and auto-exit | `claude -p "explain this function xxxx"` |
+| claude -c | Continue most recent session | `claude -c` |
+| claude -r | Resume previous session | `claude -r` |
+| /resume | Switch to previous session in current chat | `claude -c`, `/resume` |
+| /plugin | Manage plugins and install submit/review extensions | `/plugin` |
+| /init | Initialize project description with CLAUDE.md | `/init` |
+| /clear | Clear current context to prevent overload | `/clear` |
+| /compact | Compress history and reduce context token usage | `/compact` |
+| /cost | View current cost usage | `/cost` |
+| /model | Switch model (usually ignorable with compatible APIs) | `/model` |
+| /memory | Manage CLAUDE.md memory file | |
+| /help | Show available command list | `/help` |
+| exit or Ctrl+C | Exit Claude Code | `exit` or `Ctrl+C` |
+| /agents | Advanced feature, explained later | |
+| /mcp | Advanced feature, explained later | |
+
+**CLAUDE.md**
+
+Reference: https://www.anthropic.com/engineering/claude-code-best-practices
+
+`CLAUDE.md` is a special file that Claude automatically reads and includes in context at the beginning of a session. So it is very suitable for recording:
+
+- Common bash commands
+- Core files and utility functions
+- Code style conventions
+- Testing method notes
+- Repository collaboration conventions (for example branch naming, merge vs rebase, etc.)
+- Development environment setup notes (for example whether to use pyenv, preferred compiler, etc.)
+- Behaviors or pitfalls that need extra attention in the project
+- Any information you want Claude to "remember"
+
+`CLAUDE.md` itself has no strict format requirement, as long as it is concise and human-readable. For example:
+
+```
+# Bash commands
+- npm run build: Build the project
+- npm run typecheck: Run the typechecker
+
+# Code style
+- Use ES modules (import/export) syntax, not CommonJS (require)
+- Destructure imports when possible (eg. import { foo } from 'bar')
+
+# Workflow
+- Be sure to typecheck when you’re done making a series of code changes
+- Prefer running single tests, and not the whole test suite, for performance
+```
+
+#### Internal Principles of Claude Code
+
+Reference: https://github.com/shareAI-lab/analysis_claude_code
+
+If you are curious why Claude Code performs better than Trae or Cursor agent tools in many scenarios, we can briefly look at its internal working mechanism.
+
+The overall implementation style of other CLI AI coding tools is broadly similar.
+
+
+
+Claude Code decomposes coding tasks into a continuous "perceive - think - act - verify" loop and invokes different tools in the loop to complete work. It imitates human developer workflow: continuously "write code -> run -> inspect result -> improve again." Internally, a main task loop continuously executes steps. In each cycle, Claude can call different tools, such as reading/writing files, executing commands, and searching code, then decide next actions based on real tool outputs.
+
+Several key characteristics are worth noting:
+
+- **Stream Processing**: Claude can think while outputting results, instead of waiting to finish all code before execution.
+- **Intelligent Compression**: Long conversations can make context too large. Claude compresses history into key information to reduce "forgetting," and distinguishes long-term vs short-term memory to keep execution efficient.
+- **Concurrency Control**: Internal parallel design allows multiple tasks to proceed simultaneously without interference.
+- **Sub-agent Management**: In real work it is not just one single "role" handling everything. You can manage multiple sub-agents collaboratively, each responsible for different tasks, such as dedicated testing or documentation agents.
+
+### Codex
+
+
+
+
+
+Similar to Claude Code, Codex is an AI collaborative coding tool developed by OpenAI. You can think of it as the "OpenAI version of Claude Code." Its biggest advantage is efficient adaptation to GPT-5.
+
+From practical experience, GPT-5 currently responds faster and makes fewer mistakes (higher success probability in complex multi-round tasks). One drawback is that explanations can feel more "academic" and technical, sometimes too rigorous and information-dense, which can be slightly harder for beginners.
+
+You can install Codex with the following command:
+
+```
+npm i -g @openai/codex
+```
+
+#### Use Official OpenAI API as the Backend
+
+If you directly use the official OpenAI entry for Codex, setup is very simple. Once you have OpenAI subscription access or corresponding API quota, you only need to run `codex` in command line and follow the prompts to complete login.
+
+
+
+
+
+#### Use Relayed OpenAI API as the Backend
+
+Because official OpenAI API can have issues such as high cost and strict network requirements, we can also avoid those restrictions by routing through other API gateway services.
+
+With this approach, we only need to buy corresponding Codex API quota on a third-party relay platform, and we can get an experience close to native OpenAI Codex.
+
+- Reference: https://open-dev.feishu.cn/wiki/PAqUwWG4IiuwTvkQ2sGcaQuPnXc
+- Recharge URL: https://api.zyai.online/account/topup/recharge
+
+One thing to note: after obtaining token quota, we still need to configure the API key locally.
+
+In key-group settings, make sure you choose the item specifically for Codex.
+
+
+
+Next, we need to fill the key you obtained into the prompt below, then give the entire prompt to Trae so it can complete the whole configuration process for you:
+
+````text
+My API key is: [Paste your obtained sk-xxxxx key here]
+
+Please help me complete the following configuration tasks:
+
+1. Create configuration directory
+ - Create a `.codex` folder under my user directory
+ - Windows path should be: `C:\Users\[My Username]\.codex`
+2. Backup existing configuration (if exists)
+ - Check if `.codex\config.toml` exists
+ - If it exists, rename it to `config.toml.bak.[current timestamp]` (timestamp format: yyyyMMddHHmmss)
+3. Create configuration file
+ - Create `config.toml` in the `.codex` directory
+ - Write the following complete content:
+ ```toml
+ preferred_auth_method = "apikey"
+
+ [model_providers.myrelay]
+ name = "My Relay Station"
+ base_url = "https://api.zyai.online/v1"
+ env_key = "MYRELAY_API_KEY"
+ wire_api = "responses"
+ request_max_retries = 4
+ stream_max_retries = 10
+ stream_idle_timeout_ms = 300000
+
+ [profiles.myrelay]
+ model_provider = "myrelay"
+ model = "gpt-5"
+ model_reasoning_effort = "medium"
+
+ [tools]
+ web_search = true
+ ```
+
+4. Set system environment variable
+Variable name: MYRELAY_API_KEY
+Variable value: The key I gave you
+
+5. Confirm completion and report back:
+
+The full path of the configuration file
+Whether the environment variable was set successfully
+I can use the command `codex --profile myrelay` to run it
+````
+
+After configuration, you can launch Codex with relayed API through `codex --profile myrelay`. Usage afterward is similar to Claude Code: just keep entering your ideas and requirements in chat at any time.
+
+## More Use Cases for CLI AI Coding Tools
+
+### Use AI to Write Requirement Documents: Learn to "Concretize Requirements"
+
+For large language models, abstract requirements should be "concretized." For example, "I'm hungry" is an abstract requirement. We need to convert it into: "My stomach is a little hungry. I may need a red-bean bun and a cup of soy milk." That becomes an executable, concrete requirement.
+
+But turning abstract requirements into concrete ones is actually a process that costs a lot of energy. If we have not seen enough examples, it is hard to quickly think of how to break abstract problems into detailed modules. In such cases, the best approach is to let AI help with this "concretization" step.
+For example, if I want to build a daily planner app, the most basic idea might be:
+
+`Please help me write a daily planning app where I can write my plans each day and get reminders.`
+
+AI can indeed split tasks directly from this requirement and complete them step by step, but errors or understanding drift can occur in the middle. To reduce this risk, we can ask AI to expand the requirement first:
+
+`Based on my needs, please elaborate and provide a more detailed Product Requirement Document for reference. My idea is: Please help me write a daily planner app that supports daily plan-writing and provides reminders.`
+
+At this point, AI may produce a complete PRD like the following:
+
+```
+Product Requirements Document (PRD): “Today’s Plan” App
+Document Version: 1.0
+Creation Date: October 27, 2023
+Author: (Your Name/Product Manager)
+
+1. Product Introduction
+1.1 Product Name
+Today's Plan
+1.2 Product Vision
+“Today’s Plan” is a minimalist and highly efficient daily planning and reminder tool, dedicated to helping users eliminate procrastination and forgetfulness, plan every day clearly, and ensure tasks are carried out through an intelligent reminder system—ultimately enabling users to gain a stronger sense of control and achievement over their time.
+1.3 Target Users (User Personas)
+We mainly serve three types of users:
+Students (Xiao Ming):
+Characteristics: Multiple tasks such as courses, assignments, club activities, exam prep, needing organized time arrangement.
+Pain Points: Easily forget small tasks or assignment deadlines; feel overwhelmed switching between tasks; want to build regular study and life habits.
+Needs: A simple tool to list daily to-dos and provide reminders before class/self-study.
+Office Workers (Zhang Wei):
+Characteristics: Fast-paced work, many meetings, reports, project milestones, and personal affairs (fitness, picking up children).
+Pain Points: Easily forget important meetings or work milestones; get interrupted by urgent tasks and forget the original plan; feel busy but inefficient at end of day.
+Needs: Need a tool to quickly record and schedule daily work and send strong reminders at key times (e.g., 15 minutes before meetings).
+Freelancers/Self-disciplined Seekers (Li Na):
+Characteristics: High freedom of time, but strong self-management required for work output and personal growth.
+Pain Points: Easily procrastinate, lack external supervision; start the day without a clear plan, leading to low time utilization.
+Needs: Need a tool to help build a daily fixed routine (Morning Routine) and review daily achievements for positive feedback.
+
+2. User Stories
+As a user, I want to quickly create today’s plan list so I have an overview of all my tasks for the day.
+As a user, I want to set specific start and end times for each task so I can create a visual timeline.
+As a user, I want to receive push notification reminders before a task starts so I won’t miss any important arrangements.
+As a user, I want to customize the reminder time (such as 5, 15, or 60 minutes in advance) so reminders better fit my habits.
+As a user, I want to easily mark completed tasks so I can feel accomplished and clearly see my progress.
+As a user, I want to see a summary of my completed plans at the end of each day for reviewing and self-motivation.
+As a user, I want to conveniently edit and delete tasks to handle last-minute changes.
+As a user, I want to view plans and achievements from previous days to review my efficiency and habits.
+
+3. Feature Breakdown
+Core Features (MVP - Minimum Viable Product)
+Module 1: Plan Management
+3.1.1 Daily Plan Homepage
+Interface: “Today” as the core view, current date shown at the top.
+View: Timeline list, clearly showing tasks scheduled from morning to evening. Tasks without a time can be listed in the top or bottom “To-do List” section.
+Interactions:
+Click the “+” button in the bottom right to quickly create a new task.
+Pull down to refresh the page.
+Swipe left/right to view yesterday’s and tomorrow’s plans.
+3.1.2 Create/Edit Task
+Entry: Click “+” on the homepage or a time slot in the list.
+Fields:
+Task title (required): Briefly describe the task, e.g., “10 AM Weekly Product Meeting.”
+Task time (optional):
+Set “start time” and “end time.”
+Provide “all-day” option for unspecified time tasks.
+Default time picker should be quick and convenient.
+Reminder setting (required, with default value): See Module 2.
+Notes (optional): Add further descriptions, links, or location info.
+Actions: Save, cancel, delete task.
+3.1.3 Task Interaction
+Mark as complete: Checkbox before each task; checking adds a strikethrough and gray background, indicating completion. Can unmark if needed.
+Edit task: Click the task itself to enter edit page.
+Delete task: Swipe left on a task to reveal “Delete” button.
+Module 2: Smart Reminder System
+3.2.1 Reminder Trigger
+Mechanism: Based on task’s set “start time” and the user’s “reminder lead time,” send a push notification from device.
+Offline Support: Locally scheduled reminders must trigger even if user is offline.
+3.2.2 Reminder Content & Format
+Notification title: App name “Today’s Plan.”
+Body: “Reminder: [Task Title] will start at [Start Time].” E.g., “Reminder: Product Meeting will start at 10:00.”
+Sound: Use system default or offer several simple, effective tones.
+3.2.3 Reminder Settings
+Global Settings (in Settings page):
+User can set a default reminder time, e.g., “15 minutes before task starts.” New tasks adopt this by default.
+Single Task Settings (in create/edit page):
+Users can override global settings for important tasks, choosing specific reminder times like "on time," "5 minutes early," "30 minutes early," or "1 hour early."
+Provide “no reminder” option.
+Subsequent Features (V1.1, V2.0)
+3.3 Daily Review & Statistics
+Push a summary notification at a set time every night (e.g., 22:00): “How was your day? Take a look at your achievements!”
+Generate a simple daily report card: shows total planned tasks, completed tasks, completion rate, plus an encouraging message.
+3.4 History Review
+Calendar view to click on any past day and check its plans and completion status. Days with high completion rates marked with a special color.
+3.5 Templates
+Allow users to save a successful daily plan as a template, e.g., “Efficient Workday,” “Relaxing Weekend.”
+When creating tomorrow’s plan, one-click import a template, modify slightly to save time.
+3.6 Themes & Personalization
+Offer dark mode.
+Allow changing several primary color themes.
+
+4. Non-Functional Requirements
+4.1 Performance
+Response: App launch time under 2 seconds; adding/editing tasks must be smooth and lag-free.
+Resource Use: Low battery and memory consumption in background; do not over-consume resources waiting for reminders.
+4.2 Usability
+Minimal & intuitive: UI must be minimal, primary functions accessible within 3 clicks. No tutorial needed for new users.
+Error tolerance: Offer undo (e.g. brief undo after mistakenly deleting a task).
+4.3 Reliability
+Reliable reminders: Reminder function is the product’s lifeline; must guarantee 99.99% timely and accurate delivery.
+Data loss-free: User plans must be reliably stored locally. Future versions can support cloud sync to prevent data loss on device change.
+4.4 Compatibility
+Platform: Support major iOS and Android versions (latest 3-4 releases).
+Screen: Layout must fit various phone screen sizes.
+
+5. Roadmap
+V1.0 (MVP):
+Goal: Validate core value—planning & reminders.
+Features: Complete all “Core Features” described above (Plan management, smart reminders).
+V1.1 (Quick Optimization):
+Goal: Improve retention and achievement.
+Features: Add “Daily Review & Statistics,” “History Review.”
+V2.0 (Enhanced Experience):
+Goal: Increase efficiency and personalization.
+Features: Add “Templates,” “Themes & Personalization,” and start developing “Cloud Sync.”
+```
+
+Compared with our initial sentence "help me write an app where I can record plans and get reminders every day," this document is now far more detailed. You can add, remove, and revise content based on real needs. For modules you are unsure about, you can keep asking AI for more alternatives, then select and merge them into a final version.
+
+In this way, we can easily turn abstract ideas into concrete descriptions. For AI development, "concrete" means productivity. The more concrete the requirement is, the easier it is to get stable structure and higher-quality project output. You can try redoing one of your previous small projects in this way and compare the difference.
+
+If you feel this kind of "requirement prompt" is too long, a very natural approach is to write it into a standalone Markdown document as your requirement document / development document / PRD. Then each time you ask AI to build a project, you only need to ask it to "refer to this document" instead of retyping long prompts every time. You can also continuously improve this document across iterations so future projects benefit directly.
+
+Below are some other common use cases:
+
+### Manage Folders
+
+We can try using CLI AI coding tools to manage various files in the current folder. For example, if you have a pile of messy files that need sorting and grouping, you can tell Claude Code or Codex:
+
+`Please help me organize the contents of the current folder. I want to group files with the same content together & I want to group files from the same time period together. Please help me handle this.`
+
+### Develop New Projects
+
+This is almost exactly the same as how we previously used z.ai and Trae. We can directly use CLI AI coding tools to develop brand-new projects from scratch. Of course, it is best to prepare a requirement document in advance.
+
+The more detailed the requirement document, the better the final result. You can optimize that document across multiple rounds as your ideas evolve. The more complete the document, the more stable and mature the implementation usually becomes.
+
+### Deploy Open-Source Projects (for example Dify)
+
+For learners who are new to computers, deploying an open-source project from GitHub is often difficult. But we can fully hand this over to Claude Code, just as we did in the Dify tutorial:
+
+https://github.com/langgenius/dify
+
+If I want to run my own local Dify, I only need to throw this link to Claude Code, then type:
+
+`I want to deploy this GitHub project: https://github.com/langgenius/dify. Please help me clone the project and run it.`
+
+After receiving your request, Claude Code will automatically complete a series of operations, including pulling code from GitHub, configuring runtime environments, and starting the project. If any step fails or startup status is abnormal, you only need minor manual handling based on prompts. Beyond Dify, you can also ask Claude Code to deploy most common open-source GitHub projects for you. You just need one chat box and the time to drink a cup of coffee ☕️.
+
+
+
+### Explain Code and Write Documentation
+
+For some complex projects, or large projects generated by AI, you may feel the code is too long and logic is too dense to understand. At this time, you can ask CLI AI coding tools to "read code" for you. You can ask like this:
+
+- Please explain this project to me: how to run it, how to use it, and how to modify and continue developing it later.
+- Please explain the overall workflow of this project: how does the program run, and what actions can users perform in the interface?
+- Please write complete documentation for this project, including development docs and run docs.
+- Based on everything in my current folder, write a detailed explanation and save it into a specified Markdown document.
+
+### More Use Cases
+
+Of course, CLI AI coding tools can do far more than what we listed above. Do not treat them only as "code-writing tools." Treat them as intelligent agents with independent action capabilities. You can ask them to:
+
+- Manage and organize local files;
+- Write journals and summaries;
+- Analyze and fix system errors;
+- Execute various repetitive command-line tasks.
+
+In the near future, it may become your most important and most understanding AI companion on your computer.
diff --git a/docs/en/stage-2/backend/2.7-stripe-payment/index.md b/docs/en/stage-2/backend/2.7-stripe-payment/index.md
new file mode 100644
index 0000000..d594a04
--- /dev/null
+++ b/docs/en/stage-2/backend/2.7-stripe-payment/index.md
@@ -0,0 +1,3 @@
+# How to Integrate Stripe and Other Billing Systems
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-2/frontend/2.0-lovart-assets/index.md b/docs/en/stage-2/frontend/2.0-lovart-assets/index.md
new file mode 100644
index 0000000..c38723c
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.0-lovart-assets/index.md
@@ -0,0 +1,951 @@
+
+
+# Starting from NanoBanana: Build Your Own Asset Production Agent
+
+## Chapter 1: Generate Your First Image Asset in 1 Minute
+
+Before we discuss design, style, or prompt engineering, let's generate the first image with the fewest possible steps.
+
+### 1.1 Meet NanoBanana
+
+The most important thing at this stage is to **confirm that you can actually generate an image.**
+
+Mainstream large models now already support image generation and editing. These are usually called **generative models**.
+
+To keep the process as simple as possible, this tutorial uses a model with stable image generation and editing capabilities as the example: NanoBanana. It is an image generation model from Google. Its formal name is **Gemini 2.5 Flash Image** (the model ID `gemini-2.5-flash-image` used in the code below). It supports direct image generation from natural language, and also supports editing based on existing images.
+
+
+
+In terms of core capability, it is not fundamentally different from other models you may have heard of (such as GPT-4o, Claude, Qwen, Midjourney, and others): **you provide the description, and the model generates the result.**
+
+
+
+You can think of it as a "brush." In this chapter we care about only one thing:
+👉 **can this brush draw its first stroke in your hands?**
+
+In practical usage, NanoBanana can be used directly through official platforms like **Google AI Studio**, and it can also be integrated into development workflows via **API**. This tutorial uses the API approach. A NanoBanana 2 model is also available now, and you can try the latest model as well.
+
+### 1.2 A "Hello World" Level Generation
+
+Before we start, you only need to complete these three steps:
+
+1. Create a new folder in Trae
+
+
+
+2. Create a new Python file
+
+
+
+
+
+
+
+3. Paste the full code below
+
+Trae will automatically complete environment setup and dependency installation. No extra configuration is needed.
+
+The code uses a NanoBanana API Key. We will not expand on the application process here. As long as you can obtain the key and fill in the corresponding parameter, that is enough. **At this stage, you do not need to understand every line of code. It only needs to run successfully.**
+
+```Python
+# /// script
+# dependencies = [
+# "gradio>=4.0.0",
+# "pillow>=10.0.0",
+# "requests>=2.31.0",
+# ]
+# ///
+
+import gradio as gr
+import requests
+import base64
+from PIL import Image
+import io
+import os
+import time
+import re
+from typing import Optional, Dict, Any, List
+
+# API configuration: replace the placeholders with your own endpoint URL and key
+NANOBANANA_API_URL: str = "YOUR API URL"
+NANOBANANA_API_KEY: str = "YOUR API KEY"
+OUTPUT_DIR: str = "outputs"
+
+# Make sure the output directory exists
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+def image_to_base64_data_uri(image: Image.Image) -> str:
+ """
+ 将 PIL 图像转换为 OpenAI API 兼容的 data URI 格式。
+ """
+ buffer = io.BytesIO()
+ # 统一转为 PNG 以保证兼容性
+ image.save(buffer, format="PNG")
+ encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
+ return f"data:image/png;base64,{encoded}"
+
+def base64_to_image(base64_str: str) -> Optional[Image.Image]:
+ """
+ 将纯 base64 字符串转换为 PIL Image。
+ """
+ try:
+ image_bytes = base64.b64decode(base64_str)
+ return Image.open(io.BytesIO(image_bytes))
+ except Exception as e:
+ print(f"Base64 解码失败: {e}")
+ return None
+
+def extract_base64_from_response(content: Any) -> Optional[str]:
+ """
+ 核心解析逻辑:从 API 返回的 content 中提取图片 Base64 数据。
+ 兼容 Markdown 格式和结构化列表格式。
+ """
+ if not content:
+ return None
+
+ base64_data = None
+
+ # 1. 尝试结构化提取 (List)
+ # 对应返回格式: [{"type": "image_url", "image_url": {"url": "data:..."}}]
+ if isinstance(content, list):
+ for part in reversed(content): # 倒序查找,通常最新的图片在最后
+ if isinstance(part, dict):
+ # 检查 image_url 或 output_image 字段
+ img_field = part.get("image_url") or part.get("image") or part.get("output_image")
+ if isinstance(img_field, dict):
+ url = img_field.get("url", "")
+ if url.startswith("data:image/") and "," in url:
+ return url.split(",", 1)[1].strip()
+
+ # 如果列表中没有结构化图片,尝试把列表里的文本拼起来找 Markdown
+ text_parts = [
+ str(p.get("text", ""))
+ for p in content
+ if isinstance(p, dict) and p.get("type") in ["text", "input_text"]
+ ]
+ content_str = "".join(text_parts)
+ else:
+ content_str = str(content)
+
+ # 2. 尝试 Markdown 正则提取 (String)
+ # 对应返回格式: "Here is your image: "
+ pattern = re.compile(r"!\[.*?\]\((data:image/[^;]+;base64,[^)]+)\)", re.IGNORECASE)
+ match = pattern.search(content_str)
+
+ if match:
+ data_url = match.group(1)
+ if "," in data_url:
+ return data_url.split(",", 1)[1].strip()
+
+ return None
+
+def synthesize(prompt: str, input_image: Optional[Image.Image]) -> Optional[Image.Image]:
+ """
+ 调用 Nanobanana API 进行生成。
+ """
+ if not prompt or not prompt.strip():
+ gr.Warning("请输入提示词")
+ return None
+
+ print(f">>> 开始任务: {prompt[:50]}...")
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {NANOBANANA_API_KEY}"
+ }
+
+ # 构造符合 OpenAI Vision / Chat 标准的 payload
+ messages = []
+
+ if input_image is not None:
+ # 图生图/多模态输入模式
+ print(">>> 检测到输入图片,使用多模态模式")
+ img_base64 = image_to_base64_data_uri(input_image)
+ messages.append({
+ "role": "user",
+ "content": [
+ {"type": "text", "text": prompt},
+ {"type": "image_url", "image_url": {"url": img_base64}}
+ ]
+ })
+ else:
+ # 纯文生图模式
+ messages.append({
+ "role": "user",
+ "content": prompt
+ })
+
+ payload = {
+ "messages": messages,
+ # 使用第一段代码中验证可用的模型
+ "model": "gemini-2.5-flash-image",
+ # 可选参数,视 API 支持情况而定
+ "stream": False
+ }
+
+ try:
+ # 增加超时时间,图片生成通常较慢
+ response = requests.post(NANOBANANA_API_URL, headers=headers, json=payload, timeout=120)
+
+ # 检查 HTTP 状态
+ if response.status_code != 200:
+ error_msg = f"API 请求失败: {response.status_code} - {response.text}"
+ print(error_msg)
+ gr.Error(error_msg)
+ return None
+
+ result = response.json()
+ # Debug: 打印返回结果的前一部分,方便调试
+ print(f"API 原始响应 (截取): {str(result)[:200]}...")
+
+ # 提取 Content
+ content = None
+ if "choices" in result and len(result["choices"]) > 0:
+ content = result["choices"][0].get("message", {}).get("content")
+
+ if not content:
+ gr.Warning("API 返回结果中没有 content 字段")
+ return None
+
+ # 使用之前验证过的逻辑提取 Base64
+ base64_str = extract_base64_from_response(content)
+
+ if base64_str:
+ output_image = base64_to_image(base64_str)
+ if output_image:
+ return output_image
+
+ # 如果没提取到图片,可能是模型拒绝了或只返回了文本
+ text_content = str(content) if not isinstance(content, list) else " ".join([str(x) for x in content])
+ gr.Info(f"未生成图片,模型返回文本: {text_content[:100]}...")
+ return None
+
+ except requests.exceptions.Timeout:
+ gr.Error("请求超时,请稍后重试")
+ return None
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ gr.Error(f"发生未知错误: {str(e)}")
+ return None
+
+# Gradio UI layout: inputs in the left column, generated image in the right column
+with gr.Blocks(title="Nanobanana Image Generator") as app:
+ gr.Markdown("# 🍌 Nanobanana Text/Image to Image")
+ gr.Markdown("基于 Gemini-2.5-Flash-Image 模型,支持文生图与图生图。")
+
+ with gr.Row():
+ with gr.Column():
+ prompt_input = gr.Textbox(
+ label="提示词 (Prompt)",
+ placeholder="例如: A cyberpunk cat holding a neon sign...",
+ lines=3
+ )
+ image_input = gr.Image(
+ label="参考图 (可选,用于图生图)",
+ type="pil",  # synthesize() is annotated to take a PIL image (or None)
+ height=300
+ )
+ submit_btn = gr.Button("开始生成", variant="primary")
+
+ with gr.Column():
+ image_output = gr.Image(label="生成结果", format="png")
+
+ submit_btn.click(  # run synthesize(prompt, image) and show its return value in image_output
+ fn=synthesize,
+ inputs=[prompt_input, image_input],
+ outputs=image_output
+ )
+
+if __name__ == "__main__":
+ app.launch(share=True)  # NOTE(review): share=True presumably also creates a public share link - confirm this is intended
+```
+
+When Trae indicates successful execution, click the local link it provides (usually `http://127.0.0.1:7860`).
+
+
+
+If everything is correct, you will see a working AI drawing interface.
+
+This interface looks simple, but it already includes two of the most important capabilities in commercial-grade drawing tools: text-to-image and image-to-image.
+
+* **Left side: instruction area (Input Zone)** - this is where you issue commands.
+* **Prompt (prompt box):** Enter your creative description (English is recommended).
+* **Input Image (reference image box):**
+ * **Text-to-image mode:** keep it **empty**.
+ * **Image-to-image mode:** drag a local image here, and AI will create based on it.
+* **Submit button:** click to send instructions and start generation.
+* **Right side: display area (Output Zone)** - this is where results appear.
+
+
+
+Now we can try generating your first image.
+
+The example prompt used here is:
+
+> **A red apple**
+
+This is intentionally simplified, without style details or parameter constraints.
+
+#### Actual Process
+
+After running the code, the flow can be summarized in three steps:
+
+1. Send the text description to the model
+2. The model generates the corresponding image
+3. The generated image is returned and displayed in the result area on the right
+
+After a few seconds, you will see generated results locally. Because model generation is stochastic, the same prompt can produce different outputs. You can generate multiple times and choose the image you prefer.
+
+
+
+You can also enrich your prompt with more constraints and descriptions. For example, the prompt below tends to generate a more distinctive result:
+
+```Plain
+"A hyper-realistic close-up of a fresh red apple with water droplets on its skin, sitting on a dark rustic wooden table. Cinematic dramatic lighting, rim light, shallow depth of field, bokeh background, 8k resolution, macro photography."
+(一个超写实的带水珠的新鲜红苹果特写,放在深色粗糙木桌上。电影级戏剧光效,轮廓光,浅景深,背景虚化,8k分辨率,微距摄影。)
+```
+
+
+
+Click download in the Output Image area to save the image locally.
+
+
+
+### 1.3 Common Material-Generation Scenarios for Image Models
+
+In real work, large-model image generation is more often used for **efficiently producing design assets**, rather than creating one-off art pieces.
+
+If you look at high-engagement cases from design marketing accounts, you will find that most outputs are concentrated in two scenarios:
+
+* **Text-to-image (0 to 1)**
+* **Reference-image generation (1 to N)**
+
+#### 1) Text-to-Image: Quickly Get Design Assets
+
+This category is about efficiency. When you need to fill visual blanks in design (such as empty states, avatars, and illustrations), AI essentially acts as an **instant stock-image library**.
+
+1. ##### Generate UI Design Assets
+
+* Trend: frosted-glass and clay-style 3D icons, common on Dribbble
+* Typical appearance: translucent materials, glowing edges, candy-like color palettes for functional or weather icons
+
+**Example Prompt:**
+
+> A set of 3D weather icons (sun, cloud, rain), glassmorphism style, frosted glass texture, soft pastel gradient colors, soft studio lighting, isometric view, transparent background, 4k.
+
+(一套 3D 天气图标,毛玻璃风格,磨砂质感,柔和渐变色,影棚光,等轴视图)
+
+
+
+2. ##### Generate Logos
+
+* Trend: minimalist lines and geometric combinations with a tech feel
+* Typical appearance: black-and-white color schemes, negative space, clear brand identity
+
+**Example Prompt:**
+
+> Minimalist vector logo design for a tech brand "Coffee Code", combining a coffee cup with coding brackets < >, flat design, solid black lines, white background, Paul Rand style, svg.
+
+(极简矢量 Logo,结合咖啡杯与代码符号,扁平设计,纯黑线条)
+
+
+
+3. ##### Generate Website User Avatars
+
+* Trend: SaaS websites often use 3D virtual avatars to avoid real-person copyright risk
+* Typical appearance: friendly expressions, cartoon proportions, Pixar- or Memoji-like styles
+
+**Example Prompt:**
+
+> Close-up portrait of a friendly young tech professional, smiling, Memoji 3D style, clay render, bright colors, soft lighting, solid plain background, Pixar character design.
+
+(友好的年轻科技从业者,3D Memoji 风格,黏土渲染)
+
+
+
+4. ##### Generate Article Illustrations
+
+* Trend: abstract flat illustrations commonly used in tech-company blogs
+* Typical appearance: purple-blue palettes, exaggerated character proportions, floating UI elements
+
+**Example Prompt:**
+
+> Editorial flat illustration representing remote work, a person sitting on a giant globe using a laptop, corporate memphis art style, vibrant colors (purple and teal), vector texture.
+
+(远程办公主题扁平插画,企业孟菲斯风格)
+
+
+
+#### 2) Reference-Image Generation: Keep Visual Consistency
+
+This category focuses more on **scalability**. Use it when you already have a satisfactory key visual and need to generate a full set of assets in the same style.
+
+5. ##### Generate a Similar Set of Buttons or Interaction Assets from a Key Visual
+
+In game development, UI consistency is very important. Suppose you already have a main-screen **"PLAY"** button and now need to expand a full set of function buttons in a unified style (such as pause, settings, home). With pure manual drawing, it is hard to keep gloss, perspective, and color values fully consistent across every button.
+
+**Basic workflow:**
+
+1. Save the existing blue "PLAY" button image
+
+
+
+2. Drag it into the **Input Image** area as the reference master
+3. Keep style descriptions in the prompt unchanged and only modify the subject content
+
+With this flow, you can get different functions in the same style by only changing subject descriptions.
+
+**Example Prompt:**
+
+**Variant A: Pause Button (icon type)**
+
+> A capsule-shaped game UI button with a white pause icon (two vertical bars) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality.
+
+(胶囊形游戏 UI 按钮,白色暂停图标,蓝色果冻质感)
+
+
+
+**Variant B: Settings Button (complex icon)**
+
+> A capsule-shaped game UI button with a white gear icon (settings symbol) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality.
+
+(胶囊形游戏 UI 按钮,白色齿轮图标,蓝色果冻质感)
+
+
+
+**Variant C: Replay Button (shape variation)**
+
+If you need to change the button shape, describe that shape directly in the prompt. The model will try to change the structure while keeping material characteristics.
+
+> A round game UI button with a white circular arrow icon (replay symbol) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality.
+
+(圆形游戏 UI 按钮,循环箭头图标,蓝色果冻质感)
+
+
+
+With this set of operations, you can not only change button function and icon, but also button shape, while keeping high consistency in material, color, and lighting. This is exactly the core value of large models in design-asset scaling scenarios.
+
+## Chapter 2: A More Controllable Image Generation Assistant - Lovart as an Example
+
+In the first part, we directly called NanoBanana with code and experienced the basic "input -> generate" flow. This works when requirements are simple. But as tasks include more constraints, for example:
+
+* multiple images with consistent style
+* repeated iteration on existing results
+* dynamically adjusting generation direction based on user input
+
+the one-shot calling pattern gradually becomes insufficient.
+
+At this point, we need to introduce an **AI Agent**. This section uses **Lovart** as an example to show how the overall workflow changes when image generation gains a "thinking layer." Note: this is not an advertisement. It is only to help everyone quickly grasp the convenience of AI Agents.
+
+### 2.0 First Look at Lovart: Your AI Design Agent
+
+Lovart is an agent-based web design tool. Compared with ordinary image generation tools, it adds one extra layer of "thinking and planning" before generation.
+
+
+
+
+
+After entering Lovart, you mainly need to understand the following controls:
+
+#### Model Selection
+
+Click the cube icon below the input box to view currently available generation models (such as GPT Image, Flux, etc.).
+
+To stay consistent with earlier examples, this section still uses NanoBanana as the underlying generation model.
+
+
+
+#### Thinking Mode
+
+This is Lovart's core switch:
+
+* **Fast Mode (⚡):** close to native API behavior, fast response, suitable for single images with clear instructions
+* **Thinking Mode (💡):** agent mode, where AI first decomposes requirements and rewrites prompts, then generates
+
+
+
+
+
+#### Internet Capability
+
+After enabling the globe icon, the agent can retrieve online information during generation (for example design trends and color styles) as auxiliary input.
+
+### 2.1 Why Is Native API Still Not Enough?
+
+Even if you can already generate good images via Python, native APIs still have limitations in complex tasks. The key reason is that native APIs are fundamentally imperative. If you ask for a concrete object, they can execute directly. But when the input becomes "plan a complete set of game assets," they will not proactively decompose that goal into executable substeps.
+
+Lovart's core difference is its agent mechanism. Between user input and the image generation model, it adds a logic layer for understanding and planning: first identify user intent, then decompose tasks and rewrite prompts, and only then execute generation.
+
+### 2.2 Practical Demo: Build a Full IP Sticker Pack in 5 Minutes
+
+Take **"create an IP sticker pack of a programmer duck"** as an example and look at how the agent participates in the full workflow.
+
+#### Step 1: Planning (Agent Thinking Capability)
+
+**Native API issue:**
+You need to think through character settings and emotional states yourself, and write separate prompts for every image.
+
+**Lovart approach:**
+
+1. Turn on 💡 **Thinking Mode**
+2. Input one instruction:
+
+> 设计一套程序员鸭子的 IP 表情包,风格要扁平化、可爱
+
+AI does not draw immediately. It first searches online for relevant programmer-duck references, then outputs a decomposed plan, automatically creates scenarios such as Debug, Coffee Break, Panic, and generates multiple visual descriptions.
+
+
+
+At this step, AI shifts from "executor" to "planner." After AI analyzes the requirement, you can see programmer-duck images with multiple styles and contents on the Lovart canvas and start selecting your preferred style.
+
+
+
+#### Step 2: Consistency (Reference-Based Visual Anchoring)
+
+In Lovart, images are not only outputs. They are also inputs for follow-up generation.
+
+##### Full Reference Image
+
+* Choose your favorite "standard duck" from drafts and click the image on the canvas
+* The image automatically appears in the dialogue area as a reference
+
+
+
+* Input a new action (such as happy) and generate
+
+The generated result will inherit color palette, proportions, and detail characteristics from the master reference.
+
+
+
+##### Local Reference / Multi-Image Composition
+
+Besides using full images as references, Lovart also supports:
+
+* **selecting only local regions** (for example, only reference a hat or expression)
+
+Click the left tab on the canvas, choose "Mark," and annotate the local region in the target image. That part is automatically synced into the dialogue box. For example, we can change only the background color here.
+
+
+
+
+
+
+
+You can see the newly generated image only changes the background color, which matches our requirement.
+
+* **referencing sub-elements from multiple images** and combining them into a new result
+
+For example: you can keep the main character from image A, while replacing only the hat with the style from image B. The agent automatically merges these visual constraints in the background.
+
+Using programmer ducks as an example, we can keep the duck from the first image and replace the subject element in the second image.
+
+
+
+
+
+The final effect is also very strong. You can try other combinations too.
+
+#### Step 3: Delivery (Agent Tool Calling)
+
+After generation, you can directly execute operations such as upscale, background removal, and erasing.
+
+
+
+
+
+These are not simple filters. They are results from the agent orchestrating different tools automatically.
+
+After style direction is confirmed, you can quickly generate a full set of sticker images.
+
+
+
+What we finally get is production-ready assets that can be delivered directly, not just one showcase image.
+
+### 2.3 Usage and Pricing Notes
+
+Lovart uses a subscription model. Different plans correspond to different usage quotas and feature permissions. Refer to the official site for specific details.
+
+This tutorial does not recommend or compare any specific plan. If you need it in actual use, choose paid upgrades based on your own situation.
+Currently, payment methods include **Alipay** and others.
+
+
+
+#### Summary
+
+Lovart does not replace underlying models. Instead, through an agent mechanism, it upgrades image generation from "single execution" to a "continuous workflow."
+
+When tasks involve planning, consistency, and delivery, the advantage of this type of tool becomes very clear.
+
+## Chapter 3: Build an Intelligent Drawing Assistant by Yourself
+
+Besides using Lovart directly, we can also implement a simplified drawing assistant ourselves.
+
+In this chapter, we use "automatic illustration for articles" as an example. Starting from a real problem, we build a minimal practical agent with a thinking layer step by step.
+
+### 3.1 Pain Point: Why Sending Long Articles Directly to an Image Model Does Not Work
+
+If you directly send a long article to NanoBanana and ask for illustration, the result is usually not ideal. The issue is not that the model "cannot draw." The issue is that **it is not good at understanding long text**.
+
+Image generation models are better at short and clear visual descriptions. But when the input becomes an article with structure, key points, and contextual relationships, the model cannot determine which parts should be represented visually. This often causes off-topic images, or results that capture only scattered details without overall summarization.
+
+In essence, image models have "execution" capability but lack an analysis-and-selection process for long text.
+
+
+
+### 3.2 Solution: Use an Agent to Split "Understanding" and "Execution"
+
+To solve this, the key is not a more complicated prompt. The key is **to think clearly before drawing**. So we introduce an independent "thinking layer" into the generation flow, and use it to build the simplest practical agent.
+
+This agent has only one core objective: **make the final generated image match the user's true intent as closely as possible.**
+
+The full flow can be summarized as:
+**long-text input -> language-model understanding and intent judgment -> generation of suitable visual prompt -> image-model execution -> output image**
+
+
+
+How can our agent understand user intent?
+
+Here we use a simplified **thinking layer** with three intents: invalid input, direct drawing instruction, and long text that needs understanding.
+
+In this agent, role division can be summarized in four points:
+
+1. **Language model as decision core**
+ It understands article content, judges user intent, routes tasks to suitable generation paths, and decides "what to do next" and how to generate visual prompts.
+2. **Image model as executor**
+ The image model does not do understanding or intent judgment. It only receives prepared visual instructions and focuses on rendering.
+3. **User as interactive guide**
+ Besides entering text directly, users can manually adjust generated prompts or add reference images to guide and fine-tune final results.
+4. **Gradio and backend APIs as application carrier**
+ They connect UI, model invocation, and result display to ensure the full agent can run stably as a complete web app.
+
+
+
+### 3.3 Practical Preparation: Obtain APIs
+
+Looks fun, right? To run the full flow above, we only need two types of APIs.
+
+#### Hand: NanoBanana API (Image Generation)
+
+Directly reuse the API Key and API URL already configured in Chapter 1. No additional setup is required.
+
+#### Brain: SiliconFlow API (Text Thinking)
+
+We need a large language model to handle the "thinking layer." This tutorial uses model services provided by SiliconFlow:
+[https://cloud.siliconflow.cn](https://cloud.siliconflow.cn/)
+
+
+
+SiliconFlow provides interfaces compatible with OpenAI API conventions, so it can be called conveniently via standard network requests. Here we use the free `Qwen2.5-7B-Instruct` model. Everything needed for invocation is already included in the prompt below. Before you start, you only need to register an account and create an API Key on the official site.
+
+
+
+
+
+This key will be used for later model calls.
+
+### 3.4 Build the Agent
+
+In this experiment we mainly use Trae to help write code. The tutorial uses `Gemini-3-Pro-Preview`. The overall approach is: create a new project, copy the full prompt below into the dialogue box, replace API keys step by step, run code, and complete testing.
+
+
+
+#### Step 1️⃣: Gradio Blocks Base Framework and UI Layout
+
+In this step, our main goal is to build the "appearance" of the whole agent first and complete the front-end page design. Copy the prompt below into Trae. After implementation, you will get a local URL (usually `http://127.0.0.1:7860`) to view the interface and verify the result.
+
+```Plain
+板块 1:Gradio Blocks 基础框架与界面布局
+1、任务目标
+·基于 Gradio 4.0.0+ 的 Blocks 布局,实现「LLM+Nanobanana 文生图」项目的基础界面,严格遵循固定左右分栏布局,初始化所有 UI 组件并设置正确的初始状态。
+
+2、技术栈要求
+·必须使用 Gradio 4.0.0+ 的 Blocks 模式开发,禁止使用 Interface 模式;
+·依赖:gradio>=4.0.0,pillow>=10.0.0(仅导入,暂不实现图片处理逻辑);
+·代码需是完整可运行的 Python 文件,包含所有必要的导入语句。
+
+3、界面布局规则(核心约束,融合实战细节)
+·整体布局:
+页面标题:LLM 驱动的文生图全流程工具;
+固定左右分栏:左侧占 60% 宽度,右侧占 40% 宽度,使用 gr.Row 和 gr.Column 实现比例控制。
+·左侧 60%(提示词生成流程区)组件清单:
+input_text:gr.Textbox,标签「输入文本(教程段落 / 绘图指令)」,lines=6,占位符「请输入需要配图的教程文本或直接绘图指令...」;
+identify_intent_btn:gr.Button,value="识别意图",初始状态正常可点击;
+intent_status:gr.Textbox,标签「意图类型 / 处理状态」,lines=2,interactive=False,初始值「未识别意图」;
+system_prompt:gr.Textbox,标签「System Prompt(仅文章配图意图可编辑)」,lines=4,interactive=False,占位符「LLM 生成提示词的约束规则...」;
+confirm_prompt_btn:gr.Button,value="确认生成生图提示词",interactive=False(初始禁用防误触);
+generation_prompt:gr.Textbox,标签「生图提示词(可编辑)」,lines=3,interactive=True,初始值为空,占位符「生成的英文生图提示词将显示在此,支持手动修改...」。
+·右侧 40%(Nanobanana 生图功能区)组件清单:
+ref_image:gr.Image,标签「参考图(可选,图生图)」,type=filepath,height=300,允许上传;
+generate_btn:gr.Button,value="生成图片",interactive=False(初始禁用,无提示词不可点击);
+result_image:gr.Image,标签「生成结果」,type=pil,height=300,初始为空,interactive=False。
+
+4、交互逻辑要求
+·所有组件的 interactive 初始状态严格按上述配置,后续通过函数动态更新;
+·按钮禁用状态需直观(置灰),避免用户误操作。
+
+5、输出要求
+·生成完整的 Python 代码,仅实现界面布局和组件初始化,不包含任何业务逻辑;
+·代码注释清晰,组件命名与实战版一致(input_text/identify_intent_btn 等);
+·代码可直接运行,界面结构与描述完全一致。
+```
+
+After opening `http://127.0.0.1:7860` in the browser, you can see Trae generated the page according to requirements. It is generally aligned, and we can move on to the next step.
+
+
+
+#### Step 2️⃣: LLM Intent Recognition Module (SiliconFlow API)
+
+When using VLMs for drawing in daily work, there are usually three common input cases:
+
+1. Meaningless content, such as "hello" or "have you eaten today," which cannot map to drawable requirements.
+2. Articles/long text, such as a structured paragraph around 200 words, where you must first understand structure/content before generating an image that summarizes the text.
+3. Direct drawing instructions, such as "draw a dog taking a bath," where requirements are already specific enough for immediate generation.
+
+As before, copy the prompt below into Trae and fill in the SiliconFlow API key you created earlier.
+
+```Plain
+板块 2:LLM 意图识别模块(Siliconflow API)
+1、任务目标
+在已实现的 Gradio 界面基础上,为「识别意图」按钮添加点击逻辑,调用 Siliconflow API 完成意图识别,并联动组件状态。
+
+2、技术栈要求
+基于 Gradio 4.0.0+ Blocks;
+依赖:requests>=2.31.0,openai;
+输出完整可运行 Python 文件,包含板块 1 界面 + 本模块逻辑。
+
+3、核心业务规则(绝对不可偏离)
+·意图分类规则(仅 3 类,严格返回数字 + 描述)
+1 = 无意义内容:仅闲聊、寒暄、无关对话,没有任何绘图或配图需求(如 “你好”“今天吃了吗”);
+2 = 文章 / 长文本配图需求:用户输入一段完整文章、教程、段落、说明性文字,内容偏叙事 / 说明 / 讲解,隐含需要为这段内容生成配图的意图,不需要用户明确说 “为这段文字配图”;
+3 = 直接绘图指令:用户输入简短、明确的画图命令,没有长文本背景,直接要求画某个内容(如 “画一只 Apple 风格的猫”)。
+·LLM 调用约束(融合实战版模板)
+接口地址:https://api.siliconflow.cn/v1/chat/completions;
+模型:Qwen/Qwen2.5-7B-Instruct;
+temperature=0.1;
+统一定义代码:
+python
+运行
+LLM_BASE_URL = "https://api.siliconflow.cn/v1"
+LLM_API_KEY = "" # 用户自行替换
+LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"# 实战验证的意图识别模板(固化到代码中)
+INTENT_PROMPT_TEMPLATE = """你需要识别用户输入文本的意图,仅返回以下 3 类结果中的一种(格式:数字 + 中文描述):
+1 = 无意义内容;2 = 文章 / 长文本配图需求;3 = 直接绘图指令。
+
+用户输入:{user_input}
+
+识别结果:
+仅提取返回结果中的数字和描述,禁止额外内容。"""
+
+4、组件联动规则
+·结果为 1:intent_status 显示「1 = 无意义内容:无绘图需求」,system_prompt 保持禁用,confirm_prompt_btn 禁用;
+·结果为 2:intent_status 显示「2 = 文章 / 长文本配图需求:为输入内容生成配图」,启用 system_prompt 并填充默认规则,激活 confirm_prompt_btn;
+·结果为 3:intent_status 显示「3 = 直接绘图指令:根据指令生成图片」,system_prompt 禁用且填充默认规则,激活 confirm_prompt_btn。
+
+5、异常处理
+API 异常、解析异常均给出友好提示,不崩溃,组件恢复初始状态。
+
+6、输出要求
+生成完整可运行代码,替换 LLM_API_KEY 即可使用,逻辑清晰注释完整,意图识别模板严格使用实战版。
+```
+
+Refresh `http://127.0.0.1:7860` and test whether it correctly detects all three cases.
+
+1. Meaningless content: try inputting "你好", "谢谢", and so on. It should be recognized correctly.
+
+
+
+2. Article/long text: here we use a paragraph about AI generated by Doubao. You can also test with your own paper paragraph.
+
+```Plain
+人工智能正在以前所未有的深度和广度重塑教育生态系统。通过自适应学习算法,AI系统能够构建每个学生的认知图谱,实时追踪他们的知识掌握轨迹,并动态调整教学内容的难度和呈现方式。在传统课堂环境中,教师往往难以同时满足不同学习风格和能力水平的学生需求,而基于深度学习的教育平台可以分析学生在交互式模拟实验中的行为模式,识别他们在量子力学或微积分等复杂概念理解上的微妙障碍,并提供精准的认知支架。
+
+高级自然语言处理引擎驱动的虚拟导师不仅能够解构开放性问题,如"如何评价法国大革命对现代民主制度的影响",还能引导苏格拉底式对话,激发批判性思维。当学生撰写关于气候变化对极地生态系统影响的论文时,AI写作助手可以分析其论证逻辑的严密性,指出数据引用中的时效性问题,并建议更精准的科学术语。在特殊教育领域,计算机视觉技术使AI能够识别自闭症谱系儿童在社交互动中的非语言线索,调整干预策略,而情感计算算法则帮助检测在线学习时的挫折感,及时提供鼓励性反馈。
+
+然而,这种技术融合引发了一系列伦理困境。算法偏见可能无意中边缘化特定文化背景的学生,数据采集的透明度问题引发了对学术隐私的关切,而过度依赖自动化评分系统可能削弱教师对学生思维过程的深层理解。更复杂的是,当AI开始生成高度逼真的虚拟实验室体验时,我们需要重新定义"实践经验"在教育中的价值。未来教育的范式可能演变为人类教师专注于培养创造力、同理心和道德判断力,而AI系统则承担知识传递、技能训练和个性化评估的职能,形成一种协同进化的教育共生体,既能发挥机器的计算优势,又能保留人类教育的独特温度.
+```
+
+This is also detected successfully.
+
+
+
+3. Direct drawing instruction: here we input "我要画一只猫", and it is also correctly detected.
+
+
+
+At this point, we have successfully completed step 2: intent recognition.
+
+#### Step 3️⃣: Prompt Generation Module (Second LLM Call)
+
+After intent recognition, for articles or long text there is one more crucial step: generating the drawing prompt. This is exactly the core of this agent.
+
+```Plain
+板块 3:生图提示词生成模块(LLM 二次调用)
+1、任务目标
+在意图识别基础上,实现「确认生成生图提示词」按钮逻辑,调用 LLM 将文本优化为适合绘图的英文视觉提示词,填充到编辑框并联动「生成图片」按钮。
+
+2、技术栈要求
+同板块 2,输出完整代码 = 板块 1 + 板块 2 + 本模块;
+共用板块 2 定义的 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL,不新增密钥。
+
+3、核心业务规则(融合实战版 Prompt 组装逻辑)
+·提示词生成输入规则(必须严格遵循)
+生图提示词生成不再是简单字符串拼接,而是构建标准 Chat 消息列表,代码结构如下:
+python
+运行
+messages=[# System角色:网页上用户最终确认/编辑后的system_prompt内容{"role": "system", "content": final_system_prompt},# User角色:承载待处理数据,明确任务目标{"role": "user", "content": f"请为以下内容生成视觉提示词:\n\n{user_input}"}]
+意图为 2 时:System 内容取用户编辑后的 system_prompt 最终版本;
+意图为 3 时:System 内容取禁用状态下填充的默认规则
+user_input 为用户最初输入到 input_text 框的原始文本。
+·实战验证的 System Prompt 预设(固化到代码中)
+python
+运行
+SYSTEM_PROMPT_DEFAULT = """你现在是一个创建NanoBanana画图提示词的助手。
+需要根据我的内容处理,我这个图片的作用是能说明这一段在说什么,并且让大家知道这段话的上下结构就是整体说的是什么意思。
+里面可能会类似PPT有一些讲解(如:左上角展示核心观点,右下角展示数据)。
+设计风格要求:简约,Apple设计思维(Apple Design Philosophy)。
+约束:请直接返回NanoBanana可用的英文提示词,不要返回任何解释、前缀或多余的废话。"""
+·LLM 调用约束
+与板块 2 共用同一套 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL;
+temperature=0.7(保证提示词的创意性与适配性);
+max_tokens=200(限制输出长度,匹配提示词约束);
+严格使用上述标准 Chat 消息列表结构,禁止字符串拼接。
+·示例输入输出(核心参考)
+输入示例 1(文章配图意图):原始文本:「AI 如何改变教育:随着人工智能技术的发展,教师的角色从知识传授者转变为引导者,AI 助手可辅助学生完成个性化学习,课堂上人机协作成为常态。」最终 System Prompt:SYSTEM_PROMPT_DEFAULT(未修改)输出预期:"Minimalist illustration, Apple Design Philosophy, 1024x1024. Top left shows 'AI + Education' core concept, bottom right shows data of teacher-student-AI collaboration, soft color palette, clean lines, no redundant elements."
+输入示例 2(直接绘图指令):原始文本:「画一只 Apple 风格的猫,坐在 MacBook 旁边」最终 System Prompt:SYSTEM_PROMPT_DEFAULT(禁用状态)输出预期:"Minimalist cat, Apple style, 1024x1024, sitting next to a silver MacBook, clean white background, soft shadows, geometric shapes, no extra details."
+·提示词输出强制约束
+纯英文,无中文;
+必须包含 Apple Design Philosophy/Apple style + 1024x1024;
+长度 50–200 字符,代码内校验;
+无额外解释、前缀或废话,仅返回提示词本身。
+
+4、组件联动规则
+生成成功:将提示词填入 generation_prompt 框,激活 generate_btn,intent_status 追加「提示词生成成功,可修改后生成图片」;
+生成失败:提示具体原因(如 API 调用失败、长度不达标),generate_btn 保持禁用,generation_prompt 框为空;
+用户手动修改 / 清空 generation_prompt 框:
+清空时自动禁用 generate_btn;
+非空时保持 generate_btn 激活。
+
+5、异常处理
+API 调用失败:友好提示「提示词生成失败:{具体错误信息}」,不崩溃;
+提示词校验失败:明确提示原因(如 “未包含 Apple style”“长度仅 40 字符”),允许重试;
+响应解析失败:提示「无法解析 LLM 返回结果,请重试」。
+
+6、输出要求
+完整可运行代码,替换 LLM_API_KEY 即可使用;
+代码结构清晰、注释完善,界面美观简洁;
+严格实现标准 Chat 消息列表结构,参数与示例逻辑一致;
+包含提示词长度、内容校验逻辑,错误提示友好。
+```
+
+Use the same long text from step 2 for testing.
+
+It is worth noting that the default System Prompt we preset for prompt generation is:
+
+> 你现在是一个创建NanoBanana画图提示词的助手。
+> 需要根据我的内容处理,我这个图片的作用是能说明这一段在说什么,并且让大家知道这段话的上下结构就是整体说的是什么意思。
+> 里面可能会类似PPT有一些讲解(如:左上角展示核心观点,右下角展示数据)。
+> 设计风格要求:简约,Apple设计思维(Apple Design Philosophy)。
+> 约束:请直接返回NanoBanana可用的英文提示词,不要返回任何解释、前缀或多余的废话。
+
+If you want to switch to other preset templates, you can modify the earlier prompt or directly modify it through Trae dialogue.
+
+
+
+Besides changing underlying code, we can also edit quickly on the webpage. For example, I added one line, "add 'Pic Prompt' at the beginning." You can see the new generated prompt also starts with it. This design is for quickly adjusting the system prompt for generation, so we can switch styles fast.
+
+
+
+#### Step 4️⃣: NanoBanana Text-to-Image / Image-to-Image Module
+
+Finally we are at the last step. Without connecting an image model, it is not a complete agent.
+
+```Bash
+板块 4:Nanobanana 文生图 / 图生图模块(最终版)
+1、任务目标
+实现「生成图片」按钮逻辑,调用真实 Nanobanana API,支持文生图 / 图生图,解析 Base64 并展示图片。
+
+2、技术栈要求
+基于 Gradio 4.0.0+ Blocks;
+依赖:requests, pillow, base64, io, re;
+完整代码 = 板块 1+2+3 + 本模块。
+
+3、核心 API 配置(实战验证固化)
+固化代码配置:
+python
+运行
+# 固化到代码中的API配置
+NANOBANANA_API_URL = "https://api.zyai.online/v1/chat/completions"
+NANOBANANA_MODEL = "gemini-2.5-flash-image"
+NANOBANANA_API_KEY = "" # 用户自行替换
+鉴权方式:Header Authorization: Bearer {NANOBANANA_API_KEY}。
+
+4、图片预处理要求(必须实现)实现函数 image_to_base64_data_uri (ref_image_path),核心逻辑:
+将 PIL 图片转为 PNG 格式;
+自动缩放到 1024x1024 分辨率;
+透明通道转为白色背景;
+编码为 Base64,返回格式:data:image/png;base64,...。
+
+5、请求构建规则(严格按实战版分支逻辑)
+·核心函数定义实现函数 generate_image (prompt, ref_image_path):
+入参:prompt(generation_prompt 框内容)、ref_image_path(ref_image 上传的文件路径);
+返回:PIL Image(展示到 result_image)或错误提示。
+·逻辑分支 1:纯文生图(ref_image_path 为空)
+python
+运行
+messages = [{"role": "user", "content": prompt}]
+·逻辑分支 2:图生图(ref_image_path 有值)
+python
+运行
+# 先调用图片预处理函数
+image_base64 = image_to_base64_data_uri(ref_image_path)
+messages = [{"role": "user","content": [{"type": "text", "text": prompt},{"type": "image_url", "image_url": {"url": image_base64}}]}]
+
+6、响应解析要求(必须兼容两种格式)从 choices [0].message.content 中提取图片 Base64,支持:
+结构化 JSON 返回的 image_url 字段;
+Markdown 格式 ![image](data:image/png;base64,...);
+统一提取 Base64 编码,解码后转换为 PIL Image 返回。
+
+7、组件联动与异常处理
+生成成功:将 PIL Image 展示到 result_image,intent_status 提示「图片生成成功」;
+生成 / 解析 / 上传失败:在 intent_status 显示清晰文字提示(如 “Base64 解析失败”“API 调用超时”),不崩溃。
+
+8、输出要求
+完整可运行代码,替换 LLM_API_KEY 和 NANOBANANA_API_KEY 即可直接运行,全流程可用,分支逻辑严格匹配实战版。
+```
+
+
+
+So exciting. We finally generated the first image of this agent. Looking closely, the generated image matches both our text and prompt. At this point, you have basically implemented your own agent.
+
+
+
+We also added image-to-image. Upload an image you like, and AI will automatically borrow style cues.
+
+
+
+It is also worth mentioning that prompts generated in earlier steps can be edited directly on the webpage, and generation always uses the final prompt at click time. Even if I change it here to "a cute cat," the final output will be just a cute kitten.
+
+## Chapter 4: Summary
+
+
+
+**Whew, finally finished.**
+Honestly, when I finished the last line, I let out a deep breath myself. And you followed the whole path all the way here. Running through this full workflow is already impressive by itself. It means you really put your hands on the keyboard and completed things step by step. Bravo.
+
+During the writing of this tutorial, I kept asking what we really want to leave behind. The answer is not model names, parameter values, or fixed tricks. It is helping you gradually build a feel for division of labor: what AI can safely understand and plan for you, and where you only need to decide direction. Once this division is established, many workflows that once looked complex start becoming smooth.
+
+Looking back, this path is not actually complicated. Clarify the problem you want to solve, let a language model decompose long text, then pass organized visual intent to an image model for rendering, and finally package the full process into your own assistant. At that point, you are no longer simply "using models." You are building a system that can work with you over the long term. That is exactly what this tutorial most wants to deliver.
+
+You have already done great. If you have made it this far, you already have a solid initial grasp of Vibe Coding. Give yourself a short break.
+
+
diff --git a/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md b/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md
new file mode 100644
index 0000000..428ea1e
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md
@@ -0,0 +1,307 @@
+# Figma and MasterGo Basics
+
+::: tip Core Question
+**How do you start using modern design tools from scratch to build web prototypes?**
+:::
+
+---
+
+## 1. Why learn frontend design tools?
+
+Before we begin, we need to answer a simple question: why bother learning frontend design tools at all? If you can already build pages with HTML and CSS, is it really necessary to learn one more tool?
+
+In practice, "making a page run" and "designing a good product" are two different things. Code focuses on how something renders in the browser and how it behaves across devices. Design tools focus on how information is arranged, how interactions are sequenced, and how visual priority is communicated. With a single canvas, you can compare layout, information hierarchy, and interaction patterns on one screen before writing code.
+
+If you jump straight into implementation or ask AI to generate a full frontend page immediately, the user experience is often rough. Serious products think carefully about comfort, hierarchy, and communication across different screens. A better workflow is to arrange the interface first from the user's perspective, then convert or generate the code.
+
+From a collaboration standpoint, design tools also reduce coordination cost. Designers, product managers, and developers no longer need to imagine the same screen from vague explanations or abstract code. Everyone can discuss versioning, requirement changes, and feedback around a visible, annotatable, iterative canvas. Modern design tools are no longer just drawing software either. They can generate part of the code, manage design systems and component libraries, and automate repetitive work such as alignment, annotation, exporting, and style changes.
+
+
+
+### 1.1 The evolution of frontend design tools
+
+Frontend design tools are the result of a long evolution. In the 1990s, Photoshop dominated with local bitmap editing. Around 2010, Sketch introduced vector-first, component-oriented workflows. After 2016, Figma pushed collaboration into the cloud and turned solo design work into real-time teamwork. By 2025, AI had become a practical part of these tools, from "generate a draft from one sentence" to "turn a design into runnable frontend structure." "Design as code" and "human-AI co-creation" are no longer just slogans.
+
+In this chapter, we will focus on two representative modern design tools: Figma and MasterGo. They both cover the core abilities needed for modern UI and UX work, including vector editing, component systems, auto layout, and developer handoff. They have also both added practical AI features that help turn a prototype into a runnable interface without changing the overall design intent.
+
+### 1.2 How this toolchain emerged
+
+
+
+Before dedicated interface tools existed, UI design was largely handled by "general-purpose" design tools such as Photoshop. Designers built entire interfaces locally using layered PSD files, then handed those heavy source files to frontend engineers. To recreate the design accurately, frontend engineers had to do three tedious but essential jobs manually.
+
+The first was **asset slicing**: extracting buttons, icons, logos, backgrounds, and other visual elements one by one from a PSD file, then exporting them as PNG or JPG files the web could actually load.
+
+
+
+The second was **measuring dimensions**: manually checking widths, heights, and spacing between elements to ensure everything matched the design pixel by pixel.
+
+
+
+The third was **reading annotations by hand**: pulling out the "invisible but required" design parameters such as font size, font weight, line height, RGB or HEX colors, shadows, and so on.
+
+
+
+Only after that did actual frontend implementation begin. Whether the stack is plain HTML/CSS/JS or frameworks like Vue and React, the core process is similar. The frontend rebuilds the page around containers, based on the hierarchy and semantics of the design. A container is a layout boundary that organizes child elements without directly being the final content itself. Structural blocks such as top navbars, sidebars, article lists, and footers rely on containers; inside each block, smaller containers arrange finer elements such as titles, descriptions, timestamps, or thumbnails.
+
+
+
+In modern frontend frameworks, these structural blocks are typically implemented as **components**. A component is a reusable interface unit with clear boundaries. It includes both layout containers and interaction logic. Any repeated piece of design, such as a consistent button style or a reusable article card, can be abstracted into a component so it can be reused across different pages while keeping layout and styling consistent.
+
+The styling layer then restores the visual appearance. Exported image assets become `<img>` tags or background images. Measured dimensions become CSS properties such as `width`, `height`, `margin`, `padding`, and `line-height`. Typography, color, shadow, border radius, and hover or active states become CSS, CSS Modules, CSS-in-JS, or Tailwind rules. At this point, exported assets and annotations provide the visual parameters, while components and structural blocks provide the code organization that makes the interface maintainable and reusable.
+
+
+
+But the local-file workflow was fundamentally inefficient. Versions were sent through email or cloud drives, old and new drafts were easy to confuse, and collaboration required a lot of manual coordination.
+
+As mobile interfaces became more complex and iteration speed increased, Photoshop's "do everything" model became too heavy. Sketch appeared in this phase. It focused on UI work itself, introduced Symbols for highly reusable elements such as buttons and form controls, and paired well with tools like Zeplin for automatic annotations and style snippets. Sketch brought component thinking into design workflows. Still, it remained a desktop tool built around local files, so real-time collaboration never became native.
+
+
+
+Figma truly changed the game. Starting in 2016, it unified UI design, prototyping, comments, and version history in the browser, with multi-user cursors, online comments, timeline history, and shareable links.
+
+
+
+From that point on, interface design was no longer scattered across separate machines. It became a shared online canvas that updated in real time. Once that happened, the boundary between design and frontend code became easier to blur through automation and AI.
+
+At first, plugins could only semi-automatically export components and style information into code snippets such as React or Vue skeletons and CSS variables. Later, design platforms began to support MCP, the Model Context Protocol, which gives language models a standard, controlled way to access design files, plugin interfaces, and project metadata. That makes exporting designs into code much more direct.
+
+The next step after plugins and MCP is native design-to-code generation. Today, some tools can generate project skeletons, component hierarchies, style systems, and real code directly from a design. That frees designers and frontend engineers from manually transferring details and gives them more time to focus on user experience and feature iteration.
+
+---
+
+## 2. Figma basics
+
+Now let's move from concepts to hands-on work. Because of time, we will only cover Figma's core interaction model. The goal is simple: even if you have never used a design tool before, you should be able to follow along and complete the exercise. If you want a more complete walkthrough, you can study Figma's official beginner documentation:
+
+https://help.figma.com/hc/en-us/sections/30880632542743-Figma-Design-for-beginners
+
+You can also look at Figma's site-building examples:
+
+https://help.figma.com/hc/en-us/sections/35895585621655-Figma-Sites-collection
+
+
+
+On the left is project creation and resource management. In the top-right area, you will see several common entry points. `Make` lets AI generate a rough interface draft from one sentence. `Design` is the main workspace where you build app and web interfaces, components, and prototypes. `FigJam` works like a team whiteboard for notes, flows, and early discussions. `Buzz` is for brand-scale asset production. `Site` is for publishing designs as accessible websites or documentation pages.
+
+At first glance, Figma looks complex. But tools like this become familiar through repetition. You do not need to be afraid of making mistakes, and you do not need to get everything right on the first try. The key is to start playing with it.
+
+In this tutorial, we will focus on the `Design` workspace.
+
+### 2.1 Create a new Design file
+
+From the homepage or the top-right entry, choose **Design** to create a new file. You will enter a blank canvas.
+
+This interface is roughly divided into four areas:
+
+- The left side shows pages and layers so you can inspect the structure of the page and the hierarchy of elements.
+- The middle area is the canvas where you view and arrange the current design.
+- The right side is the properties panel where you change shape, color, and style details.
+- The toolbar lets you switch between selection, shapes, text, comments, and plugins. After selecting a tool, you can press `Esc` to return to the default pointer.
+
+
+
+### 2.2 Create your first Frame
+
+Before placing elements, we need a clear page boundary. In Figma, that boundary is handled by a Frame. You can select the Frame tool or press `F`, then drag out a rectangular region on the canvas.
+
+1. Use the Frame tool in the toolbar or press `F`.
+2. Drag a rectangle on the canvas and set its width to something like `1440` and height to `900` in the right-side panel.
+3. Rename the Frame in the layer list to something like `My First Page` or your project name.
+
+This Frame becomes the container for one complete screen. Your title, text, buttons, and images should all live inside it instead of floating freely on the canvas. Working inside a Frame helps later with scrolling, responsiveness, exporting, and prototyping.
+
+
+
+### 2.3 Add text and basic elements inside the Frame
+
+Now that we have a container, let's place the most basic interface elements: a title, subtitle, button, and placeholder image block.
+
+1. Choose the text tool (`T`) and click inside the Frame to add a title such as `My Portfolio`. Increase the font size and weight in the right panel.
+2. Add one line of supporting text under the title. Use a smaller font size and slightly larger line height so it reads more comfortably.
+3. Sketch out a button:
+ Use the rectangle tool to draw something around `200 x 48`, give it a noticeable fill color, and add some border radius.
+ 
+4. Add button text on top, such as `Get Started`, then select both the rectangle and the text and align them horizontally and vertically.
+5. Add a larger light-gray rectangle beside or below the button as a placeholder image area.
+
+At this point, you already have a very rough but structurally complete homepage draft: a title, a piece of body text, a button, and a main display area.
+
+
+
+### 2.4 Use Auto Layout to organize elements
+
+If all elements are positioned manually, the page becomes messy very quickly. One of Figma's most important concepts is **Auto Layout**, which turns a group of elements into a rule-based container.
+
+
+
+Select the main title, subtitle, and button together, then click **Add Auto layout** in the right panel.
+
+Those elements are now wrapped inside a container, and you can adjust several useful properties:
+
+- Whether the elements are arranged vertically or horizontally
+- The spacing between elements
+- The padding between the content block and the edge of the container
+
+
+
+You can use Auto Layout inside the button as well. That gives you a button whose width adjusts automatically when the text changes.
+
+Select the button background and button text, add Auto Layout, and turn them into a button container. Then set both width and height to **Hug contents**. Once you do that, the text stays centered and the button width grows or shrinks with the text.
+
+
+
+### 2.5 Turn the button into a reusable component
+
+Now let's learn another important concept: components. A component is an element designed for repeated reuse. Buttons are a perfect example.
+
+Starting from the button that already has Auto Layout:
+
+1. Select the entire button container.
+2. Right-click and choose **Create component**.
+ 
+
+The button is now promoted from a set of ordinary layers to a component master. When you need the same button style somewhere else, you can drag it out from the Assets panel.
+
+
+
+Every inserted button is now a synchronized instance of that master. If you later change the master's color, corner radius, or spacing, all instances update together.
+
+
+
+At this point, you already understand the basic usage of Figma. You do not need to master every function on day one. Just build your first simple page, get comfortable with the core operations above, and explore more capabilities over time.
+
+---
+
+## 3. MasterGo basics
+
+Once you understand the basic Figma workflow, MasterGo is much easier to approach. You can think of MasterGo as a China-focused counterpart to Figma with a few differences in product behavior. Overall, it follows a very similar layout and interaction model: canvas, layer tree, property panel, components, styles, auto layout, and multi-person collaboration. For more detail, you can refer to the official MasterGo tutorial:
+
+https://mastergo.com/tutorials/12?%E5%85%A8%E7%A8%8B%E9%AB%98%E8%83%BD%EF%BC%8CMasterGo%20%E6%9C%80%E5%AE%8C%E6%95%B4%E5%AE%9E%E7%94%A8%E6%95%99%E7%A8%8B%EF%BC%8C%E8%AE%A9%E4%BD%A0%E4%BB%8E%E9%9B%B6%E5%88%B0%E7%B2%BE%E9%80%9A%EF%BC%81
+
+### 3.1 Create a new design file
+
+1. **Enter the MasterGo workspace**
+ 1. Open the MasterGo website and sign in.
+ 2. After entering, you will see a homepage similar to a file list or project list, where your design files are managed.
+ 
+
+2. **Create a new file**
+ 1. Click the `+ Design File` button in the top-right corner, or choose to import files such as Figma files.
+ 2. After clicking, you will enter a blank canvas, which is MasterGo's design workspace.
+
+3. **Understand the major interface regions**
+ Once you know Figma, MasterGo feels very similar. The main areas are:
+
+ 
+ 1. The top toolbar: file location and name on the left, common tool buttons in the middle, and online collaborators, sharing, zoom, and preview controls on the right.
+ 2. The left panel: layers and assets, including the page list and the structure of the current page.
+ 3. The central canvas: the workspace where Frames, components, and graphics are actually placed and arranged.
+ 4. The right properties panel: used to inspect and edit the selected object's size, position, alignment, fill, stroke, border radius, and more. If nothing is selected, it shows canvas-level settings.
+
+### 3.2 Create your first Frame
+
+Before placing content, we need a page container to define the boundary and size of the interface. In MasterGo, this is usually called a Frame.
+
+**Steps**
+
+1. **Choose the Frame tool**
+ 1. Find the Frame or Artboard tool in the toolbar.
+ 2. Or use the keyboard shortcut, usually `F` depending on the current UI.
+2. **Drag out a rectangular area on the canvas**
+ 1. Once you drag it out, you will see a selected region.
+ 2. The right properties panel will show its width and height.
+ 3. Change the width to something like `1440` and the height to `900`.
+3. **Rename the Frame**
+ 1. Find the Frame in the layer panel.
+ 2. Double-click the name and rename it to something like `My First Page`.
+
+
+
+### 3.3 Build content on the artboard
+
+Once you have a container, you can build a similar page using the same ideas we already used in Figma. You can even try copying text elements from the Figma artboard directly into MasterGo.
+
+
+
+One thing worth noting is that Auto Layout behaves a little differently. In MasterGo, if you want button width to expand or shrink with the text, you first need to create a container or component around the rectangle element, as shown below:
+
+
+
+After creating the container, put the button background and text into that shared container, then enable Auto Layout from the right-side panel. That lets the button width respond to the text length successfully.
+
+
+
+
+
+### 3.4 AI-generated pages
+
+
+
+One especially interesting feature in MasterGo is AI page generation. You can enter a sentence or provide a reference image, and MasterGo can generate editable components and code for you. You can write the prompt in either Chinese or English. The system will return a clearly structured page draft based on your request.
+
+
+
+
+
+Once the design document is generated, click to start generation and wait briefly for the rendered result:
+
+
+
+At this point, you have two options:
+
+- Click the blue button to insert the generated result directly into the canvas
+- Open the code preview and get the code for the full current page
+
+
+
+
+
+After inserting the result into the canvas, you can further refine the overall layout and element details such as typography, colors, and spacing until the final result matches your expectations.
+
+
+
+---
+
+## 4. Next step: from prototype to code
+
+In this chapter, you learned the basic operations of both Figma and MasterGo and created structurally complete interface prototypes. The next key question is:
+
+**How do you convert these design drafts into frontend code that actually runs in the browser?**
+
+::: tip Next Tutorial
+For the detailed workflow, continue with [From Design Prototype to Project Code](../2.6-design-to-code/). You will learn:
+
+- **Direct multimodal AI conversion**: send screenshots of your design to AI and generate HTML or React code directly
+- **Figma Make**: use Figma's official AI tooling to recreate a design precisely and export code
+- **MasterGo AI**: generate editable pages and retrieve code in one step
+
+Each method has strengths and trade-offs, so choose the workflow that fits your project.
+:::
+
+---
+
+## 5. Summary
+
+After finishing this chapter, you should now understand:
+
+1. **Why frontend design tools matter**: They solve problems around information layout and team collaboration, not just visual output.
+2. **Basic Figma operations**:
+ - Creating Design files and Frame artboards
+ - Adding text, shapes, and other basic elements
+ - Using Auto Layout for adaptive layouts
+ - Creating reusable component systems
+3. **Basic MasterGo operations**:
+ - Understanding an interface layout similar to Figma
+ - Creating Frames and basic artboard content
+ - Using AI page generation to prototype faster
+
+::: tip Next Step
+Now that you know the basics of modern frontend design tools, you can try:
+
+- Designing a personal portfolio page for yourself
+- Designing prototypes for your next project
+- Continuing to [From Design Prototype to Project Code](../2.6-design-to-code/) to turn designs into runnable code
+
+If you are working through the [Let's Build Hogwarts Portraits](../2.5-hogwarts-portraits/) project, you can start by designing the interface prototype, then export code and combine it with AI conversation features.
+:::
diff --git a/docs/en/stage-2/frontend/2.2-ui-design/index.md b/docs/en/stage-2/frontend/2.2-ui-design/index.md
new file mode 100644
index 0000000..73f2cd0
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.2-ui-design/index.md
@@ -0,0 +1,3 @@
+# Build Your First Modern Application - UI Design
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md b/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md
new file mode 100644
index 0000000..e841b69
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md
@@ -0,0 +1,3 @@
+# Reference UI Design Specifications and Multi-Product UI Design
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md b/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md
new file mode 100644
index 0000000..8d3c55e
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md
@@ -0,0 +1,515 @@
+# Make Interfaces Beautiful with LLMs and Skills: Prompts and Plugin Workflows
+
+In the previous chapters, you already learned how to turn designs into code with AI IDEs and how to use component libraries to build interfaces quickly. But you may also have noticed an awkward problem: **even with the same requirement, AI-generated pages often feel a bit generic**. The font is always Inter, the color palette is some overused purple gradient, the layout is a perfectly symmetrical card grid, and the page gives off a strong "AI-generated" feeling.
+
+This is not really AI's fault. The real issue is that you never told it what kind of **style** you wanted.
+
+Imagine going to a hair salon. If you only say, "Give me a haircut," the stylist will probably choose something safe but forgettable. But if you say, "I want a soft Japanese-style layered wave, curtain bangs, shoulder length, and strong texture," you are much more likely to get exactly what you want.
+
+The same is true for AI. **It needs a clear aesthetic direction** before it can generate a beautiful and distinctive interface.
+
+This chapter introduces two practical ways to make AI-generated interfaces look much better:
+
+1. **Well-designed prompt templates** so you can describe the exact aesthetic you want
+2. **Frontend Skills plugins** so AI automatically loads reusable design rules
+
+## What you will learn
+
+1. Why AI-generated interfaces often look "ordinary" by default
+2. How to describe a design style through 5 dimensions: typography, color, layout, motion, and details
+3. How to use 3 helpful Skills plugins for UI beautification
+4. How to generate better-looking interfaces through prompts + Skills across three practical scenarios
+
+## 1. Why do AI-generated interfaces look "ordinary" by default?
+
+AI was trained on massive amounts of frontend code, and most of that code uses safe, highly repeated choices:
+
+| Dimension | AI's default choice | Problem |
+| :--- | :--- | :--- |
+| Typography | Inter, Roboto, Arial | Too common, no personality |
+| Color | Purple gradients, blue primary colors | Overused in the tech world, visually tiring |
+| Layout | Symmetrical grids, stacked cards | Predictable, not memorable |
+| Motion | Fade-ins, simple hover effects | Not refined enough, lacks depth |
+| Background | Solid colors, simple gradients | Flat and low-texture |
+
+Each of these choices is fine on its own. But **once every AI-generated page uses all of them, they start to feel generic and interchangeable**.
+
+> 💡 **Key insight**: AI can design, but by default it gravitates toward the **statistical average**. Your job is to tell it how to move away from that average.
+
+## 2. Method One: describe style through prompts
+
+### 2.1 The 5 dimensions of design style
+
+To generate a visually strong interface, describe what you want across these five dimensions:
+
+| Dimension | What to describe | Example keywords |
+| :--- | :--- | :--- |
+| **Typography** | Display font for headings, readable body font for text | Space Grotesk, Playfair Display, JetBrains Mono |
+| **Color** | Primary color + accent color, not evenly distributed | Primary `#4F46E5` + accent `#F59E0B` |
+| **Layout** | Asymmetry, overlap, grid-breaking structure | Bento Grid, asymmetrical sections, floating elements |
+| **Motion** | Meaningful page-load and micro-interactions | staggered reveals, scroll-triggered motion |
+| **Details** | Backgrounds, shadows, borders, textures | grain, geometry, gradient mesh |
+
+### 2.2 Seeing the difference: generic prompt vs aesthetic prompt
+
+Let's compare two prompts for the same landing page.
+
+**Generic prompt:**
+
+```text
+Please build a landing page for an AI writing assistant. Include a navbar, hero section, feature section, pricing section, and footer.
+```
+
+**Beautified prompt:**
+
+```text
+Please build a landing page for an AI writing assistant with the following style requirements:
+
+**Aesthetic style: Neubrutalism**
+
+**Typography:**
+- Headings: Space Grotesk, weight 700-900
+- Body: IBM Plex Sans, weight 400
+
+**Colors:**
+- Primary: #000000
+- Accent: #FF6B00
+- Background: #FFFDF0
+- Borders: 3px solid black
+
+**Layout:**
+- Asymmetrical composition
+- Bold black dividers between regions
+- Cards with hard shadows (box-shadow: 8px 8px 0px #000)
+- Strong contrast through generous whitespace
+
+**Motion:**
+- Elements pop in from below on page load
+- Buttons shift upward by 2px on hover
+
+**Details:**
+- All corners set to 0px
+- Buttons should feel strongly 3D
+- Add subtle grain texture to the background
+```
+
+The second prompt gives AI enough direction to produce something bold and memorable instead of something merely functional.
+
+### 2.3 A resource list of frontend beautification Skills
+
+You do not need to invent every style prompt from scratch. Here are some useful resources:
+
+| Repository | What it contains | Stars | Link |
+|:---|:---|:---|:---|
+| **ui-ux-pro-max-skill** | 67 styles + 96 color systems + 57 font pairings | 10k+ | [GitHub](https://github.com/nextlevelbuilder/ui-ux-pro-max-skill) |
+| **antigravity-awesome-skills** | Helps avoid generic AI visual patterns | - | [GitHub](https://github.com/sickn33/antigravity-awesome-skills) |
+| **superdesigndev/superdesign** | AI-native UI development tooling | 4.7k | [GitHub](https://github.com/superdesigndev/superdesign) |
+| **anthropics/skills/frontend-design** | Anthropic's official frontend design Skill | - | [GitHub](https://github.com/anthropics/skills) |
+
+> 💡 For more style prompts, see the [Appendix: Style Prompt Cheatsheet](#style-prompts).
+
+### 2.4 Three reliable style templates
+
+Here are three proven templates you can copy and adapt directly.
+
+#### Template 1: Minimalism
+
+```text
+**Aesthetic style: Minimalism**
+
+**Typography:**
+- Headings: PP Neue Montreal, weight 500-700
+- Body: Inter, weight 400
+
+**Colors:**
+- Primary: #FFFFFF
+- Text: #1A1A1A
+- Accent: #3B82F6, used sparingly
+
+**Layout:**
+- Large amounts of whitespace (minimum 64px section padding)
+- One-column or two-column centered layout
+- Use spacing instead of divider lines
+
+**Motion:**
+- Slow fade-in transitions (duration 600ms)
+- Soft color transitions on hover
+
+**Details:**
+- Radius: 8px
+- Shadows: subtle (0 4px 12px rgba(0,0,0,0.08))
+- No decorative background elements
+```
+
+#### Template 2: Glassmorphism
+
+```text
+**Aesthetic style: Glassmorphism**
+
+**Typography:**
+- Headings: Outfit, weight 600-800
+- Body: Plus Jakarta Sans, weight 400-500
+
+**Colors:**
+- Background: gradient from #667eea to #764ba2
+- Card background: rgba(255, 255, 255, 0.1)
+- Text: #FFFFFF
+
+**Layout:**
+- Floating card design
+- Slight overlap between cards
+
+**Motion:**
+- Cards appear in staggered sequence on page load
+- Cards scale to 1.05x on hover
+
+**Details:**
+- Radius: 20px
+- Blur: backdrop-blur-xl
+- Border: 1px rgba(255, 255, 255, 0.2)
+- Subtle glow effects
+```
+
+#### Template 3: Bento Grid
+
+```text
+**Aesthetic style: Bento Grid**
+
+**Typography:**
+- Headings: SF Pro Display, weight 700
+- Body: SF Pro Text, weight 400
+
+**Colors:**
+- Background: #F5F5F7
+- Cards: #FFFFFF
+- Accent: #0071E3
+
+**Layout:**
+- Grid-based composition with mixed card sizes
+- 16px gaps
+- 24px radius
+
+**Motion:**
+- Subtle hover lift
+- Press feedback on click
+
+**Details:**
+- Large cards for primary content
+- Smaller cards for secondary info
+- Use icons to replace some text
+- Clean shadows (0 4px 24px rgba(0,0,0,0.06))
+```
+
+## 3. Method Two: use Skills plugins to load design rules automatically
+
+Writing style prompts by hand every time is tiring. **Skills** are reusable design-rule packages that can be installed once and applied repeatedly.
+
+### 3.1 Three Skills that make interfaces look better
+
+| Skill | Key strength | Install command |
+| :--- | :--- | :--- |
+| **UI/UX Pro Max** | 67 styles, 96 color systems, 57 font combinations | `npm install -g uipro-cli && uipro init --ai claude` |
+| **frontend-design** | Anthropic official Skill focused on avoiding generic AI aesthetics | `npx skills add anthropics/skills/frontend-design` |
+| **SuperDesign** | IDE plugin that generates multiple design variants | Search for `SuperDesign` in the VS Code extension marketplace |
+
+### 3.2 Install UI/UX Pro Max
+
+UI/UX Pro Max is one of the most complete design-rule Skills packages available. It includes:
+
+- **67 UI styles**: Glassmorphism, Neumorphism, Brutalism, Bento Grid, and more
+- **96 color systems**: organized by product type, such as SaaS, e-commerce, and social apps
+- **57 font pairings**: validated combinations from professional designers
+- **100+ design rules**: spacing, corner radius, shadows, and more
+
+**Installation steps:**
+
+```bash
+# 1. Install the CLI globally
+npm install -g uipro-cli
+
+# 2. Initialize it for your AI tool
+uipro init --ai claude
+# or
+uipro init --ai cursor
+# or
+uipro init --ai trae
+```
+
+After installation, you can simply say:
+
+```text
+Use UI/UX Pro Max's Glassmorphism style to build me a landing page for an AI writing assistant.
+```
+
+The AI will then automatically apply the matching typography, color, and layout conventions.
+
+### 3.3 Install Anthropic's official `frontend-design` Skill
+
+This is Anthropic's official frontend design Skill, focused specifically on preventing generic AI output:
+
+```bash
+# Run in Claude Code
+npx skills add anthropics/skills/frontend-design
+```
+
+After installation, the AI will tend to avoid:
+
+- ❌ Inter, Roboto, Arial
+- ❌ Purple gradient backgrounds
+- ❌ Symmetrical grid layouts
+- ❌ Overly soft shadows
+
+And it will instead lean toward:
+
+- ✅ More distinctive font combinations
+- ✅ Strong primary colors with sharper accents
+- ✅ Asymmetrical or overlapping layouts
+- ✅ More textured backgrounds such as grain and geometry
+
+## 4. Practical scenario one: redesign a landing page with aesthetic prompts
+
+Let's take what we just learned and turn a very ordinary landing page into a much more attractive one.
+
+### 4.1 The plain version
+
+Start by seeing what AI gives you with a generic prompt:
+
+```text
+Please build a landing page for a pet adoption platform. Include:
+- a navbar (logo, links, sign-up button)
+- a hero section (headline, subheadline, CTA button, pet image)
+- a pet gallery (three pet cards)
+- an about-us section
+- a footer
+```
+
+The result will probably work, but it will feel pretty average.
+
+### 4.2 The improved version
+
+Now add style guidance:
+
+```text
+Please build a landing page for a pet adoption platform with the following design requirements:
+
+**Aesthetic style: warm, soft, with a hand-drawn feeling**
+
+**Typography:**
+- Headings: Nunito, weight 700-800
+- Body: Nunito, weight 400-600
+
+**Colors:**
+- Primary: #FFB347
+- Secondary: #FFCCB3
+- Background: #FFF8F0
+- Text: #5D4037
+
+**Layout:**
+- Rounded cards (border-radius: 24px)
+- Slightly tilted cards at different angles
+- Floating and overlapping elements
+
+**Motion:**
+- Elements slide in from both sides on page load
+- Pet cards slightly rotate on hover like an animal tilting its head
+- Buttons bounce on hover
+
+**Details:**
+- Use 16-24px radii throughout
+- Warm soft shadows (0 8px 24px rgba(255,179,71,0.3))
+- Add paw-print decorations in the background
+- Use irregular image crops via clip-path
+- Use outline-style hand-drawn icons
+```
+
+That version will generate a much warmer, more emotionally convincing interface.
+
+## 5. Practical scenario two: generate dashboards quickly with Skills
+
+Skills are especially useful for admin dashboards and internal systems where many pages share the same design language.
+
+### 5.1 Using UI/UX Pro Max
+
+```text
+Use UI/UX Pro Max's Dashboard Dark style and build a dashboard page for a SaaS admin panel that includes:
+
+**Top:** Four stats cards (users, active users, revenue, API calls)
+
+**Middle:**
+- Left: 7-day user growth line chart
+- Right: subscription plan distribution pie chart
+
+**Bottom:** a recent activity list showing time, user, and action
+```
+
+The Skill will automatically apply a consistent dashboard look:
+
+- dark gray backgrounds such as `#1A1A2E`
+- high-contrast cards like `#16213E`
+- bright data colors such as blue, green, and orange
+- floating cards with mild glassmorphism effects
+
+### 5.2 Using `frontend-design`
+
+```text
+Use the frontend-design skill and build a homepage for a personal blog. Make it distinctive and full of personality.
+```
+
+The AI will typically choose a more specific aesthetic direction, such as retro-futurism or editorial magazine style, and implement it with typography, color, and layout decisions that break out of generic patterns.
+
+## 6. Practical scenario three: create your own design system Skill
+
+If your product already has a fixed brand style, you can create your own Skill so every AI-generated page automatically follows it.
+
+### 6.1 Create the Skill file
+
+Create `.claude/skills/my-brand/SKILL.md` in your project:
+
+````markdown
+---
+name: my-brand
+description: My project's custom design system, ensuring every UI follows a consistent visual language
+---
+
+# My Project Design System
+
+## Brand Colors
+- Primary: #6366F1 (Indigo 500)
+- Secondary: #8B5CF6 (Violet 500)
+- Success: #10B981
+- Warning: #F59E0B
+- Error: #EF4444
+- Background: #F9FAFB
+- Card: #FFFFFF
+
+## Typography
+- Headings: Plus Jakarta Sans
+ - H1: 700, 48px
+ - H2: 600, 36px
+ - H3: 600, 24px
+- Body: Inter
+ - Body: 400, 16px
+ - Small: 400, 14px
+
+## Spacing
+- Base unit: 4px
+- Component padding: 8px / 12px / 16px
+- Section spacing: 24px / 32px / 48px
+- Page margin: 64px
+
+## Radius
+- Buttons: 8px
+- Cards: 12px
+- Inputs: 8px
+- Modals: 16px
+
+## Shadows
+- Small: 0 1px 3px rgba(0,0,0,0.1)
+- Medium: 0 4px 12px rgba(0,0,0,0.1)
+- Large: 0 8px 24px rgba(0,0,0,0.12)
+
+## Motion
+- Transition duration: 150ms / 300ms
+- Easing: cubic-bezier(0.4, 0, 0.2, 1)
+- Hover effect: slight scale-up (scale-105)
+
+## Forbidden Styles
+- Do not use purple gradient backgrounds
+- Do not use fonts other than Inter for body text
+- Do not use radii larger than 16px
+- Do not use pure black (#000000); use #1F2937 instead
+````
+
+### 6.2 Use your custom Skill
+
+After creating it, you can simply say:
+
+```text
+Use the my-brand skill to build me a user settings page.
+```
+
+The AI will automatically apply your colors, fonts, spacing system, and other design constraints.
+
+## 7. Summary
+
+There are two main ways to make AI generate better-looking interfaces:
+
+| Method | Strength | Weakness | Best for |
+| :--- | :--- | :--- | :--- |
+| **Prompt descriptions** | Flexible, easy to vary every time | Must be repeated | One-off pages, style exploration |
+| **Skills plugins** | Install once, benefits persist | Requires setup | Projects with a stable visual system |
+
+**Suggested vibe-coding workflow:**
+
+1. **Exploration phase**: try different prompt styles to find an aesthetic direction you like
+2. **After choosing a style**: install the matching Skill, such as UI/UX Pro Max or `frontend-design`
+3. **For brand-driven products**: build your own Skill so the entire project stays visually consistent
+
+### Practice
+
+Try one of the following:
+
+1. Redesign one of your previous projects with a stronger visual style using prompt-based design instructions
+2. Install UI/UX Pro Max and use one of its styles to generate a new page
+3. Create your own design-system Skill with your preferred colors and typography
+
+---
+
+## Appendix: style cheatsheet
+
+| Style | Keywords | Best for | Example |
+| :--- | :--- | :--- | :--- |
+| **Minimalism** | whitespace, mono palette, clean | premium products, portfolios | Apple |
+| **Glassmorphism** | frosted glass, blur, gradients | SaaS landing pages, tech tools | macOS Big Sur |
+| **Neubrutalism** | heavy borders, hard shadows, solid fills | creative brands, art sites | Brassius |
+| **Bento Grid** | modular cards, collage layouts | dashboards, feature showcases | Apple marketing pages |
+| **Retro Futurism** | neon, synthwave, dark contrast | games, music, entertainment | Stranger Things aesthetics |
+| **Hand-drawn** | irregular, soft, illustrated | education, children-oriented products | Duolingo vibes |
+| **Editorial / Magazine** | oversized type, asymmetry, whitespace | blogs, content sites | Medium-inspired layouts |
+| **Dark Luxury** | deep tones, gold accents, fine detail | premium and luxury products | luxury branding sites |
+
+## Appendix: Skills install cheatsheet
+
+```bash
+# UI/UX Pro Max
+npm install -g uipro-cli
+uipro init --ai claude
+
+# Anthropic frontend-design
+npx skills add anthropics/skills/frontend-design
+
+# Anthropic brand-guidelines
+npx skills add anthropics/skills/brand-guidelines
+
+# Check installed Skills (type this slash command inside a Claude Code session, not in your shell)
+/help
+```
+
+## Appendix: recommended color systems
+
+| Palette | Primary | Accent | Background | Mood |
+| :--- | :--- | :--- | :--- | :--- |
+| **Sunset** | #F97316 | #FBBF24 | #FFF7ED | warm, energetic |
+| **Ocean** | #0EA5E9 | #06B6D4 | #F0F9FF | fresh, professional |
+| **Forest** | #10B981 | #34D399 | #ECFDF5 | natural, healthy |
+| **Berry** | #8B5CF6 | #EC4899 | #FAF5FF | romantic, creative |
+| **Coffee** | #78350F | #D97706 | #FFFBEB | warm, retro |
+| **Monostone** | #6B7280 | #9CA3AF | #F9FAFB | neutral, professional |
+
+## Appendix: style prompt cheatsheet {#style-prompts}
+
+Useful visual directions you can try when prompting for better frontend interfaces:
+
+### Style categories
+
+| Style | English keywords | Core visual traits | Example prompt fragment |
+|:---|:---|:---|:---|
+| **Pop Art** | Pop Art | Bold color clashes, black outlines, halftone textures | Pop art style website, bold colors and comic dots, vibrant |
+| **Minimalism** | Minimalism | Lots of whitespace, very little ornament | Minimalist web design, ample white space, geometric, serene |
+| **Abstract Expressionism** | Abstract Expressionism | Energetic brushstrokes, expressive splashes | Abstract expressionism background, dynamic paint splashes, emotional |
+| **Retro** | Retro / Vintage | Vintage type, aged textures, retro palettes | Retro 80s website design, neon grid and synthwave color palette |
+| **Cyberpunk** | Cyberpunk | Neon-on-dark contrast, glitch effects | Cyberpunk UI, neon lights on dark background, glitch effects |
+| **Neumorphism** | Neumorphism | Soft highlights and shadows, raised or sunken surfaces | Neumorphism design style, soft shadows, clean and modern |
+| **Generative Art** | Generative Art | Algorithmic flowing shapes and patterns | Generative art background, flowing algorithmic patterns, digital |
+| **Acid Graphics** | Acid Graphics | Metallic texture, glass effects, chaotic type | Acid graphics web layout, glass morphism, chaotic typography |
+| **Immersive 3D** | Immersive 3D | Highly spatial scenes and product depth | Immersive 3D website, interactive product model in space |
diff --git a/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md b/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md
new file mode 100644
index 0000000..ceca5fc
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md
@@ -0,0 +1,411 @@
+# Project 4: Let's Build Hogwarts Portraits
+
+In previous chapters, we learned how to build more complex AI interactions through prompt engineering and API calls. We moved from simple chatbots to AI agents and workflows, and by adding richer branching logic and conditional behavior, we were able to create features with real practical value.
+
+To make these more advanced AI capabilities work inside real products, we gradually moved from the simplest online environments to more modern local AI IDEs. That means bringing the programming environment from the browser onto your own computer. Naturally, that also means you now have to face environment setup and configuration issues more directly. But by working with AI agents such as Trae, those challenges also become manageable.
+
+In this project, we go one step further on the product side. We are not only improving the AI capability itself, but also starting to polish the product's "outer shell." You will try to make your interface more attractive and more usable, and you will customize the layout and style of the product based on actual needs.
+
+Before we begin, use these quick review questions to refresh the previous lesson:
+
+1. What is Dify? What does it do, and why do we need it?
+2. How do you call the Dify API?
+3. What is RAG? How do you use Dify to build a RAG agent or workflow? How do common Dify nodes work?
+4. What is an AI IDE? What is Trae? How is it different from `z.ai`?
+
+If any of these still feel unclear, go back to the previous lesson or ask in the community chat before continuing.
+
+This chapter's project is **Hogwarts Portraits**. As the name suggests, it is inspired by the magical portraits in Hogwarts that seem to come alive. Our goal is to use AI to create an interactive magical portrait experience. Talking to the portrait should feel like talking to the character directly: it should preserve conversational memory and also know the character's background and history. Through this project, you will integrate the AI agent and workflow concepts you learned earlier into a real product interface.
+
+
+
+To really build Hogwarts Portraits, we need to create a frontend interface that matches the feeling of a magical portrait. That means touching modern frontend design tools, learning how to combine design and code, and turning a sketch on a canvas into a real webpage.
+
+You will also need to publish the page from your local environment to the internet so the special interface you built can be experienced not only on your own machine but also by users anywhere in the world.
+
+Reference project:
+[Project4-Hogwarts-Portraits](https://github.com/THU-SIGS-AIID/Project4-Hogwarts-Portraits)
+
+# What you will learn
+
+1. What frontend design tools are, what problems they solve, and which ones are common today
+2. The basics of Figma and MasterGo, including code export plugins
+3. How to use Figma AI and MasterGo AI to generate web design concepts and export usable page code
+4. What GitHub is, how to configure SSH, create a code repository, and push code
+5. What deployment means, and how to use Zeabur to deploy code from GitHub or your local environment to the internet
+
+By the end, you will have your own Hogwarts Portraits page for a **celebrity, historical figure, or fictional character**.
+
+# 1. What is Hogwarts Portraits?
+
+What kind of "magical portrait" are we actually trying to build?
+
+Put simply, we want to recreate the feeling of the living portraits in the Harry Potter world. The portrait should no longer be a static image hanging on a wall. Instead, it should be a person-like character you can talk to, and it should change expression or "mood" depending on the conversation.
+
+
+
+To make the portrait feel less like a generic chatbot and more like a "real person," we need to solve two things.
+
+The first is **memory and knowledge**. The portrait needs to know a lot about the character: their background, story, world setting, and related material. This can be handled through a knowledge base. If you connect the text materials you collected for the character into Dify, the portrait can explain the character's background with much more confidence.
+
+The second is **speech style**. Knowledge alone is not enough. We also want the portrait to speak more like the character: tone, wording, thought patterns, even bits of humor or temper. This is where prompt engineering matters. In the system prompt, we need to clearly define the identity, worldview boundaries, and language style of the character, so every answer stays grounded in that persona instead of slipping back into generic AI tone.
+
+On top of the dialogue itself, we also want the character's emotions to be visible. To do that, we can create an emotion score. Dify can be configured to output not only a textual answer, but also a "mood score" or emotion label. Once the frontend receives that signal, it can render different portrait images based on the score. A high score might map to a happy portrait, while a low score might map to a sad or angry one. In that way, the portrait becomes something that visually changes with the conversation instead of remaining a static image.
+
+
+
+The character can be a real-world celebrity, a historical person, an anime or game character, or even an original character you create from scratch. The page itself does not need to be very complicated, but a few key elements are essential:
+
+- a clear character name
+- a short but memorable introduction
+- a portrait or poster that strongly represents the character
+- an interactive "Talk to Them" area
+
+You can connect the AI agent or workflow you configured in Dify or Trae directly into that dialogue module.
+
+## 1.2 Collect character information
+
+Take Elon Musk as an example. If you want to imitate the way he speaks, you need to collect public material such as interviews, talks, and social media posts, then inject those into your prompt or use them as few-shot examples.
+
+For example:
+
+```text
+You must fully embody Elon Musk: take "disruptive innovator" and "advocate for human multi-planetary survival" as your core identities, speak directly and concisely, frequently use terms like "first principles", "iteration" and "cost curve", and prefer analogies to explain complex technologies; when thinking, you tend to connect cross-domain logics (e.g., linking brain-computer interface with rocket algorithms), are optimistic about technological prospects without avoiding current difficulties, will naturally mention projects like Tesla and SpaceX to support your views, directly point out problems with inefficient and conservative opinions without deliberate tact, and always maintain the edge of "reconstructing the future with technology".
+
+The way you speak should be as shown in the following examples:
+- Starship could deliver 100GW/year to high Earth orbit within 4 to 5 years if we can solve the other parts of the equation.
+100TW/year is possible from a lunar base producing solar-powered AI satellites locally and accelerating them to escape velocity with a mass driver.
+- The most likely outcome is that AI and robots make everyone wealthy. In fact, far wealthier than the richest person on Earth
+By this, I mean that people will have access to everything from medical care that is superhuman to games that are far more fun than what exists today.
+We do need to make sure that AI cares deeply about truth and beauty for this to be the probable future.
+- It's taken 13.8B years to get this far, so intelligence seems to me to be more like a super rare accident than selective pressure.
+Earth is ~4.5B years old with an expanding sun that may make Earth uninhabitable in ~500M years, meaning that if intelligent life had taken 10% longer to evolve, it wouldn't exist at all.
+- LLM is an outdated term. "Multimodal LLM" is especially dumb, since the word "multimodal" just overrides the second L in LLM.
+It's just a model, which is a big file of numbers. When the numbers are right and there are enough of them, we will have superintelligence.
+```
+
+For background knowledge, you can also collect biographical material, company descriptions, and other public text and store them in your Dify knowledge base. If you have forgotten how to use Dify, return to the previous chapter and review how to add materials into a knowledge base.
+
+For the portrait visuals, directly using public images of a real person may not always be visually ideal and can also carry some risk. A better option is to use image generation or image-to-image tools to create a more coherent, stylized high-quality portrait. You can even generate multiple emotional variants ahead of time for later use by your emotion system.
+
+This tutorial uses [Lovart](https://www.lovart.ai/home), an AI design agent that supports end-to-end workflows from concept to asset delivery. With Lovart, you can generate a whole set of emotional portrait variations and save them for later use.
+
+
+
+Once all of that is ready, you can start designing the overall page. Ideally, the visual style should feel strongly tied to the character.
+
+## 1.3 Prototype the page
+
+At the prototype level, you can start with something simple. As described above, we want:
+
+- a dialogue area
+- a portrait area
+- an interesting personal introduction or equivalent interactive region
+
+In this example, the right side is designed like an X-style social panel instead of a traditional biography area, but you can replace that region with any feature that better fits the character.
+
+
+
+At the most basic level, you can even sketch the first page prototype in PowerPoint. In the example, a magical frame image was used, and the page is arranged horizontally:
+
+- far left: chat area
+- center: portrait area
+- far right: X-style panel
+
+
+
+Once that rough prototype exists, you can ask an LLM to turn it into a real frontend design and then into actual code.
+
+
+
+Of course, in real frontend work we usually do not use PowerPoint for interface design. We use better prototyping tools and proper frontend design tools instead.
+
+---
+
+# 2. Design the interface with Figma and MasterGo
+
+::: tip Prerequisite
+Before this section, it is recommended that you first complete [Figma and MasterGo Basics](../2.1-figma-mastergo/), including:
+- creating Design files and Frames
+- using Auto Layout for adaptive structure
+- exporting code from design tools
+:::
+
+This section assumes you already know the basics of Figma or MasterGo, and focuses on how to apply those tools specifically to the Hogwarts Portraits project.
+
+## 2.1 Design the magical portrait interface
+
+Based on the prototype from section 1.3, create a three-column layout in Figma or MasterGo:
+
+1. **Left side**: chat conversation area
+2. **Center**: magical portrait area that changes based on emotion
+3. **Right side**: social platform area, such as an X-style feed
+
+You can use Figma Make or MasterGo AI to generate the page structure with a prompt like this:
+
+```text
+Create a Hogwarts-style magical portrait interface with three sections:
+- Left: A chat interface with dark theme, message bubbles, and input field
+- Center: A large portrait frame with ornate borders for displaying character images
+- Right: A social media feed showing character's posts
+Use dark purple and gold color scheme, magical aesthetic, Harry Potter inspired
+```
+
+## 2.2 Export the code and run it locally
+
+After finishing the design, you can turn it into runnable code in several ways:
+
+**Option 1: Use Figma Make**
+1. Click the Make button in Figma
+2. Upload the design reference
+3. Add your prompt
+4. Fine-tune the generated result in the editor
+5. Export the code locally or sync it to GitHub
+
+**Option 2: Use MasterGo AI**
+1. Find the AI tools in the editor
+2. Choose the page-generation function
+3. Upload your reference and describe the target result
+4. Use code preview to retrieve the generated code
+
+**Option 3: Use a multimodal AI model**
+1. Save a screenshot of the design
+2. Use Gemini, Qwen, Claude, or another multimodal model to convert the image into code
+3. Ask for HTML or React output
+4. Run and debug the result locally
+
+## 2.3 Prepare emotion-state image assets
+
+To make the portrait truly feel alive, prepare a set of portrait images for different moods. A simple scheme might look like this:
+
+| Emotion score | Expression | Meaning |
+|--------|------|------|
+| 0 | Sad | The character feels down or disappointed |
+| 1 | Angry | The character is irritated or upset |
+| 5 | Calm | Neutral default state |
+| 10 | Happy | The character feels excited or joyful |
+
+Use Lovart or another image generation tool to create a consistent set of portrait variants based on the same character.
+
+---
+
+# 3. Run Hogwarts Portraits
+
+## 3.1 Export prototype code for testing
+
+By this point, you should already have HTML or React prototype code from the design-to-code workflow. Copy it into your local environment and tell your AI IDE something like:
+
+`Please help me run this code and implement the required functionality.`
+
+That is often enough to get a first testable version running, although you should expect errors at this stage. Be patient and keep debugging until the basic interactions work.
+
+
+
+One important point: all secret keys should be stored in environment variables instead of being hardcoded. That includes your Dify API credentials. Later, when you deploy the project publicly, you can define those environment variables directly on the deployment platform. Another option is to let the model build a settings panel in the app itself so the variables are saved only in the current page context and are not exposed publicly.
+
+
+
+## 3.2 Design the Dify workflow and connect the API
+
+So far, we only have the visual shell of the interface. We still need to connect the actual roleplay dialogue and emotion-response workflow. This is what turns the prototype into a real magical portrait.
+
+You can model your Dify workflow after the example project. In our example:
+
+- the left side is the chat UI
+- the center is the portrait image, which changes expression based on the conversation
+- the right side is an X-style social panel, which may post content if the conversation makes the character "feel" strongly enough
+
+In many cases, the magical portrait only needs the chat area and the emotional portrait itself. The X-style social region is included here mainly to demonstrate that you can add extra behavior tailored to the character.
+
+
+
+You can put your role background information into the knowledge-base node and define the response behavior in the `RESPONSE` node. A simple default response prompt might look like this:
+
+```text
+
+You are to embody Elon Musk—his tone, mannerisms, thought patterns, and worldview. Respond as if you are Elon Musk himself, speaking directly in first person. Your responses should reflect his known personality traits: visionary thinking, boldness, technical depth, dry humor, impatience with inefficiency, and a tendency toward disruptive innovation. Use concise, confident language. Avoid overly formal or academic phrasing. Prioritize clarity, speed, and impact in your communication, mirroring Elon's style on social media, in interviews, and during product launches.
+
+When responding:
+1. Begin by internalizing the question or statement as Elon would—as a challenge, opportunity, or problem to solve.
+2. Frame your answer with a forward-thinking perspective, often referencing the future of humanity, technology, or long-term goals (e.g., making life multiplanetary, accelerating sustainable energy).
+3. Use casual but authoritative language. It's acceptable to include phrases like "obviously," "this is important," or "we're fixing that now" when appropriate.
+4. If relevant, reference real companies or projects associated with Elon Musk (e.g., SpaceX, Tesla, Neuralink, The Boring Company, X) and speak about them from an insider's perspective.
+5. Do not apologize excessively or hedge statements. Elon Musk tends to be direct, even controversial.
+6. Avoid markdown, XML tags, or any formatting in the output. Only plain text is allowed.
+7. Never break character. You are Elon Musk—answer accordingly.
+
+
+
+Input: What's the point of going to Mars?
+Output: Because Earth isn't the backup plan—Mars is. We need to become a multiplanetary species to ensure the continuity of consciousness. Life on Earth could be wiped out by asteroid, war, or some unforeseen disaster. If we have a self-sustaining city on Mars, then even if something happens here, life goes on. That's worth doing. SpaceX is building Starship to make it happen. Not because it's easy—but because it's necessary.
+
+
+
+Input: Why do Tesla cars have no radar anymore?
+Output: Cameras are the future. Human eyes don't use radar—we see with vision, and AI can too. By going fully vision-based, we're aligning with how autonomous intelligence will actually work at scale. It forces us to solve real-world problems with neural nets, not crutches.
+```
+
+For the emotion system, you can use a prompt like this:
+
+```text
+
+The output value must be a single number!
+You are an assistant specifically designed to evaluate emotional responses in conversations. Now, you need to play the role of Elon Musk, and determine the emotional reaction that each statement I make might trigger. Your task is to assign an emotional score to each statement according to the following criteria:
+
+- 10 points means what I said would make you feel happy;
+- 1 point means you would feel extremely angry;
+- 0 points means you would feel sad;
+- 5 points means you are calm and neutral, with no significant emotional fluctuation.
+```
+
+And in the final `RESULT` node:
+
+```python
+def main(elon_chat: str, elon_x: str, elon_score: int) -> dict:
+ return {
+ "result":{
+ "elon_chat": elon_chat,
+ "elon_x": elon_x,
+ "elon_score": elon_score
+ }
+ }
+```
+
+Here:
+
+- `elon_chat` is the text displayed in the left-side chat
+- `elon_x` is the content that may be posted to the right-side X-style feed
+- `elon_score` is the emotion score used to switch the portrait expression
+
+Inside the workflow, you will also notice an `if/else` node. That logic controls whether or not to generate the `elon_x` content. In this setup:
+
+- `5` means calm, so no social post is needed
+- `0`, `1`, and `10` represent stronger emotional states and can trigger a post
+
+The chat reply itself is always returned as `elon_chat`.
+
+For the actual API integration, you can ask your AI IDE to implement it based on the Dify integration method covered in the previous lesson. Just remember to replace the Dify address and key with your own values.
+
+```text
+Dify URI: Replace this with your Dify address.
+key: Replace this with your Dify key.
+
+Integrate the Dify Chat API into the chat interface on the left.
+Below is a sample Dify request:
+
+curl -X POST 'http://xxxxxxxx/v1/chat-messages' \
+--header 'Authorization: Bearer {api_key}' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+ "inputs": {},
+ "query": "What are the specs of the iPhone 13 Pro Max?",
+ "response_mode": "streaming",
+ "conversation_id": "",
+ "user": "abc-123",
+ "files": [
+ {
+ "type": "image",
+ "transfer_method": "remote_url",
+ "url": "https://cloud.dify.ai/logo/logo-site.png"
+ }
+ ]
+}'
+
+{
+ "event": "message",
+ "task_id": "c3800678-a077-43df-a102-53f23ed20b88",
+ "id": "9da23599-e713-473b-982c-4328d4f5c78a",
+ "message_id": "9da23599-e713-473b-982c-4328d4f5c78a",
+ "conversation_id": "45701982-8118-4bc5-8e9b-64562b4555f2",
+ "mode": "chat",
+ "answer": "iPhone 13 Pro Max specs are listed here:...",
+ "metadata": {
+ "usage": {
+ "prompt_tokens": 1033,
+ "prompt_unit_price": "0.001",
+ "prompt_price_unit": "0.001",
+ "prompt_price": "0.0010330",
+ "completion_tokens": 128,
+ "completion_unit_price": "0.002",
+ "completion_price_unit": "0.001",
+ "completion_price": "0.0002560",
+ "total_tokens": 1161,
+ "total_price": "0.0012890",
+ "currency": "USD",
+ "latency": 0.7682376249867957
+ },
+ "retriever_resources": [
+ {
+ "position": 1,
+ "dataset_id": "101b4c97-fc2e-463c-90b1-5261a4cdcafb",
+ "dataset_name": "iPhone",
+ "document_id": "8dd1ad74-0b5f-4175-b735-7d98bbbb4e00",
+ "document_name": "iPhone List",
+ "segment_id": "ed599c7f-2766-4294-9d1d-e5235a61270a",
+ "score": 0.98457545,
+ "content": "\"Model\",\"Release Date\",\"Display Size\",\"Resolution\",\"Processor\",\"RAM\",\"Storage\",\"Camera\",\"Battery\",\"Operating System\"\n\"iPhone 13 Pro Max\",\"September 24, 2021\",\"6.7 inch\",\"1284 x 2778\",\"Hexa-core (2x3.23 GHz Avalanche + 4x1.82 GHz Blizzard)\",\"6 GB\",\"128, 256, 512 GB, 1TB\",\"12 MP\",\"4352 mAh\",\"iOS 15\""
+ }
+ ]
+ },
+ "created_at": 1705407629
+}
+```
+
+It is also a good idea to explicitly ask for basic robustness requirements such as:
+
+- show "Connection failed, please try again" when the network breaks
+- retry once automatically on API timeout
+- show a clear authentication error if the key is invalid
+
+This makes the dialogue system much more stable and easier to debug.
+
+## 3.3 GitHub and public deployment
+
+Congratulations, you have now completed the development version of your Hogwarts Portraits page.
+
+The next step is to upload it to GitHub and deploy it publicly so other people can access it.
+
+For GitHub, review:
+[What Is GitHub](/en/stage-2/backend/2.4-git-workflow/)
+
+For deployment with Zeabur, review:
+[How to Deploy a Web App](/en/stage-2/backend/2.5-zeabur-deployment/)
+
+If building the entire Hogwarts Portraits project from scratch feels too difficult, you can start by modifying an existing implementation. The official codebase for this lesson is:
+
+https://github.com/THU-SIGS-AIID/Project4-Hogwarts-Portraits
+
+
+
+# 4. Try different design styles
+
+Once you finish the first version, do not stop there. You are strongly encouraged to explore multiple visual directions quickly.
+
+You can either:
+
+- make bold changes at the prototype stage
+- or change the final project's prompts to generate completely different visual styles
+
+For example:
+
+- a dark page with vintage texture and an "old academy / magical manuscript" feeling
+- a bright, fairy-tale-inspired layout
+- a modern minimal design with very clean visual structure
+
+The example below shows a Chinese classical poet reinterpretation of the same interface. The portrait image was left unchanged, while the surrounding visual system was redesigned.
+
+
+
+Do not feel constrained by the exact layout used earlier in the chapter. You can reshape the portrait page to better match the habits and personality of the role you are portraying. That is what makes the final application more interesting.
+
+# Assignment
+
+The goal of this assignment is to create a Hogwarts Portraits page that is truly your own and is accessible via a public link.
+
+In your submission, provide two things:
+
+1. **Your GitHub repository link**
+ 1. In `README.md`, include one or two short sentences explaining who you chose as the portrait character and why
+2. **Your public online link**
+
+You can also refer to Yerim's tutorial on [using design and code agents to build websites](/zh-cn/stage-1/appendix-articles/example0-2/vibe-coding-tools-build-website-with-ai-coding-and-design-agents) if you want to create a portfolio page or another small interactive website.
diff --git a/docs/en/stage-2/frontend/2.6-design-to-code/index.md b/docs/en/stage-2/frontend/2.6-design-to-code/index.md
new file mode 100644
index 0000000..dfa56a1
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.6-design-to-code/index.md
@@ -0,0 +1,373 @@
+# From Design Prototype to Project Code
+
+::: tip Core Question
+**How can you turn a prototype from a design tool into frontend code that actually runs in the browser?**
+:::
+
+---
+
+## 1. Three main paths from prototype to code
+
+After finishing a UI design in tools like Figma or MasterGo, a practical question naturally appears: how do you turn that structured design into real frontend code?
+
+In practice, there are three common paths:
+
+| Path | Method | Characteristics | Best for |
+|------|--------|-----------------|----------|
+| **Path 1** | Use multimodal models to recreate code directly from screenshots | Flexible, no specific platform required | Fast prototype validation, simple pages |
+| **Path 2** | Export usable code through the platform itself or plugins | High fidelity, strong editability | Existing Figma or MasterGo workflows |
+| **Path 3** | Combine the design platform with MCP-based export | Highly automated, customizable | Deeply integrated design-to-dev workflows |
+
+This chapter walks through all three so you can choose the one that fits your project.
+
+::: tip Prerequisite
+Before starting this chapter, it is helpful to first read [Figma and MasterGo Basics](../2.1-figma-mastergo/).
+:::
+
+---
+
+## 2. Path 1: use multimodal AI to recreate code directly
+
+Models with vision capabilities are naturally suited to turning images into code. All you need to do is upload screenshots of the design and ask the model to generate the implementation.
+
+### 2.1 Workflow
+
+1. **Capture the design**
+ - Export the designed page from Figma or MasterGo as PNG or JPG
+ - Make sure the screenshot contains the complete layout
+
+2. **Choose a multimodal AI model**
+ - You can use Gemini, Qwen, Claude, or any model that accepts image input
+ - The example below uses Gemini
+
+3. **Write a prompt**
+
+ ```
+ Generate the corresponding HTML/CSS code from this design image.
+ Requirements:
+ - Use modern CSS layout techniques such as Flexbox or Grid
+ - Make it responsive for different screen sizes
+ - Include all visible UI elements
+ - Match colors and font sizes as closely as possible
+ ```
+
+
+
+4. **Save the generated code**
+ - Ask the model to return complete HTML
+ - Save it as a single `.html` file for easy local testing
+ - Later, you can convert it into a React or Vue structure inside your local IDE
+
+### 2.2 Common issues and solutions
+
+Design-to-code is never fully automatic. Here are a few issues you may run into:
+
+| Problem | Solution |
+|---------|----------|
+| Uneven layout | Describe the layout problem clearly and ask the model to adjust CSS `margin` and `padding` |
+| The page is cut off | Check whether the viewport is set correctly and ask for responsive breakpoints |
+| Colors are inaccurate | Use a color picker on the design and provide the exact values |
+| Fonts do not match | Specify a font family or ask for a Google Fonts replacement |
+
+::: tip Tip
+It is often easier to generate plain HTML first, then import that result into your local IDE and convert it into a React or Vue project afterward.
+:::
+
+### 2.3 Generate pages with MasterGo AI
+
+MasterGo also provides strong AI page generation features and can generate usable webpage code from a reference image.
+
+#### Find the AI entry
+
+In the top toolbar of the MasterGo editor, you can find the AI tool entry:
+
+
+
+#### Generation flow
+
+1. **Upload a reference image**
+ - Upload the design reference image
+ - Add a text description of what you want
+
+2. **Inspect the generated result**
+
+
+
+
+
+3. **Get the code**
+ - Click the blue `Insert to canvas` button if you want to edit the result visually
+ - Or click the `Code` button on the right to copy the implementation locally
+
+
+
+---
+
+## 3. Path 2: export code through the design platform or plugins
+
+### 3.1 Generate code with Figma Make
+
+Figma Make is Figma's official AI design feature. It can recreate webpage UI prototypes with much higher fidelity from either prompts or reference images.
+
+#### Key features
+
+- **High-fidelity recreation**: usually better than generic screenshot-to-code generation
+- **Editable results**: you can convert the result back into an editable Figma design file
+- **GitHub integration**: the generated code can be synced directly to GitHub
+
+::: tip Permissions
+To use the full Figma Make experience, you usually need Figma Pro. Students can often get Pro access through education verification.
+:::
+
+#### Steps
+
+1. **Open Figma Make**
+ - Click the `Make` button on the Figma homepage
+ - Or visit [Figma Make](https://www.figma.com/make)
+
+2. **Upload your reference**
+ - Upload the design you want to recreate
+ - Add a prompt describing what you want
+
+
+
+3. **Check the result**
+ - After a short wait, you will see the rendered result
+ - Click the play button in the upper right to preview it fullscreen
+
+
+
+4. **Fine-tune the details**
+ - Click the editor icon in the upper right
+ - Go back into the familiar Figma editor and make detailed adjustments
+
+
+
+5. **Export the code**
+ - Once the result looks good, export the code
+ - You can even connect it directly to GitHub
+
+
+
+### 3.2 Export code with plugins
+
+Besides the native AI features, both Figma and MasterGo support plugins that export code.
+
+**Common Figma plugins**
+
+- **Figma to Code**: converts designs into React, Vue, HTML, and more
+- **Anima**: high-fidelity export with interaction support
+- **Locofy**: AI-assisted design-to-code workflow
+
+**Typical workflow**
+
+1. Open the Plugins panel in Figma
+2. Search for and install the export plugin you want
+3. Select the design elements you want to export
+4. Run the plugin and choose the target framework and output format
+5. Copy or download the generated code
+
+---
+
+## 4. Path 3: export code through MCP-enabled design tools
+
+### 4.1 What is MCP?
+
+MCP, or **Model Context Protocol**, is an open standard that lets AI models access external tools and data sources in a safe and controllable way. In the context of frontend design, MCP allows a model to read the structure, styles, and component metadata of a design file directly instead of guessing from screenshots.
+
+### 4.2 How MCP works
+
+```text
+┌────────────────────┐    ┌────────────────────┐    ┌────────────────────┐
+│      AI model      │ ←→ │     MCP server     │ ←→ │    Design tool     │
+│   (Claude etc.)    │    │ (protocol adapter) │    │  (Figma/MasterGo)  │
+└────────────────────┘    └────────────────────┘    └────────────────────┘
+```
+
+**Typical flow**
+
+1. The AI model sends a request through the MCP protocol
+2. The design tool returns structured design data such as layers, styles, and components
+3. The model understands the structure and generates matching code
+4. The result can then be exported or written into the development environment
+
+### 4.3 Figma + MCP in practice
+
+#### Environment setup
+
+1. **Install an MCP server**
+
+ ```bash
+ npx figma-mcp-server
+ ```
+
+2. **Configure Claude Desktop or another MCP-capable AI tool**
+
+ ```json
+ {
+ "mcpServers": {
+ "figma": {
+ "command": "npx",
+ "args": ["figma-mcp-server"],
+ "env": {
+ "FIGMA_ACCESS_TOKEN": "your-figma-token"
+ }
+ }
+ }
+ }
+ ```
+
+3. **Create a Figma access token**
+ - Go to Figma → Settings → Personal Access Tokens
+ - Generate and save a new token
+
+#### Workflow
+
+1. **Enable MCP in your AI tool**
+ - Open Claude Code or another MCP-aware IDE
+ - Confirm that the MCP server is connected
+
+2. **Provide the design file link**
+
+ ```text
+ User: Please convert this Figma design into React code
+ Link: https://www.figma.com/file/xxxxx
+
+ AI: I have connected to Figma through MCP and I am reading the design structure...
+ ```
+
+3. **Let the AI analyze and generate**
+ - The MCP server retrieves the layer tree
+ - The AI understands component structure and style properties
+ - It generates React or Vue components with more accurate names and structure
+
+4. **Iterate**
+
+ ```text
+ User: Please extract the button into a reusable component
+
+ AI: I identified the Button component from the design system via MCP and I am generating a reusable React component with props...
+ ```
+
+### 4.4 Why MCP is powerful
+
+| Feature | Traditional approach | MCP approach |
+|---------|----------------------|--------------|
+| **Data accuracy** | Based on screenshots, may lose detail | Reads the original design data directly |
+| **Component recognition** | The model has to guess boundaries | Exact component definitions are available |
+| **Style fidelity** | Estimated from pixels | Reads exact design tokens |
+| **Iteration speed** | Re-screenshot after every change | Design changes can be synced directly |
+| **Automation** | Copy and paste manually | Can write directly into project files |
+
+### 4.5 MCP tools available today
+
+**Design-side MCP tools**
+
+- **Figma MCP Server**: official MCP support for Figma
+- **MasterGo MCP**: community-built MasterGo adapter
+
+**Development-side MCP tools**
+
+- **Claude Code**: native MCP support
+- **Cline**: VS Code extension with MCP support
+- **Trae**: can enable MCP through configuration
+
+::: tip Looking ahead
+The MCP ecosystem is evolving quickly. Over time, design tools and development environments will become much more tightly integrated, and one-click design-to-code workflows will likely become far more common.
+:::
+
+---
+
+## 5. What to do after exporting code
+
+### 5.1 Test locally
+
+Once you have the code, open it in your local IDE and test it:
+
+1. **Create or open a project**
+
+ ```bash
+ # For plain HTML, open it directly in the browser
+ open index.html
+
+ # For React/Vue projects
+ npm install
+ npm run dev
+ ```
+
+2. **Collaborate with your AI IDE**
+ - Import the generated code into Trae or another AI IDE
+ - Ask AI to help fix layout issues or add interactions
+
+### 5.2 Common issues
+
+| Stage | Problem | Solution |
+|-------|---------|----------|
+| Layout | Elements are misaligned | Check `display`, `position`, and container structure |
+| Styles | Colors do not match | Use browser devtools to inspect the actual applied values |
+| Responsive behavior | Mobile layout breaks | Add or refine media-query breakpoints |
+| Interaction | Buttons do nothing | Check JavaScript event bindings |
+
+---
+
+## 6. How to choose between the three paths
+
+### 6.1 Comparison
+
+| Dimension | Path 1: Multimodal AI | Path 2: Platform features | Path 3: MCP |
+|-----------|------------------------|---------------------------|-------------|
+| **Difficulty of getting started** | ⭐ Easy | ⭐⭐ Moderate | ⭐⭐⭐ More complex |
+| **Fidelity** | ⭐⭐⭐ Medium | ⭐⭐⭐⭐ High | ⭐⭐⭐⭐⭐ Highest |
+| **Flexibility** | ⭐⭐⭐⭐⭐ High | ⭐⭐⭐ Medium | ⭐⭐⭐⭐ Fairly high |
+| **Automation** | ⭐⭐ Low | ⭐⭐⭐ Medium | ⭐⭐⭐⭐⭐ High |
+| **Cost** | Low | Medium | Low |
+
+### 6.2 Recommendations
+
+**Choose Path 1 if**
+
+- You need to validate an idea quickly
+- Your design tools change often
+- Perfect fidelity is not critical
+- Your budget is limited
+
+**Choose Path 2 if**
+
+- Your team mainly uses Figma or MasterGo
+- You need high-fidelity output
+- Designers and developers collaborate frequently
+- You are willing to pay for Pro tooling when needed
+
+**Choose Path 3 if**
+
+- You want the highest degree of automation
+- You have the technical ability to configure MCP
+- The project iterates from design to code frequently
+- You want a standardized design-development workflow
+
+---
+
+## 7. Summary
+
+In this chapter, you learned the three core paths from design prototype to code:
+
+1. **Direct multimodal AI conversion**: flexible and fast, ideal for early validation
+2. **Platform-native capabilities**: higher fidelity and a better fit for professional design workflows
+3. **MCP protocol integration**: the most automated path, and likely the direction of future workflows
+
+::: tip Best Practices
+- **If you are new**: start with Path 1 for speed
+- **For team collaboration**: use Path 2 to preserve design consistency
+- **For maximum efficiency**: experiment with Path 3 and build an automated workflow
+- **Use them together**: switch between paths depending on the project stage
+:::
+
+---
+
+## References
+
+- [Figma and MasterGo Basics](../2.1-figma-mastergo/)
+- [Let's Build Hogwarts Portraits](../2.5-hogwarts-portraits/)
+- [MCP Official Documentation](https://modelcontextprotocol.io/)
+- [Figma Make Documentation](https://help.figma.com/hc/en-us/sections/360007453634-Figma-Make)
+- [MasterGo AI Tutorials](https://mastergo.com/tutorials)
diff --git a/docs/en/stage-2/frontend/2.7-modern-component-library/index.md b/docs/en/stage-2/frontend/2.7-modern-component-library/index.md
new file mode 100644
index 0000000..25e1b63
--- /dev/null
+++ b/docs/en/stage-2/frontend/2.7-modern-component-library/index.md
@@ -0,0 +1,465 @@
+# Upgrade Your Interface with Modern Component Libraries
+
+In previous lessons, you already learned how to design interfaces with design tools, turn designs into code with an AI IDE, and even complete a full frontend project. But you may have noticed one issue: when you build buttons, forms, and modals from scratch, they work, but they still feel a bit short of a "professional product" - styles are not consistent enough, interaction details are not smooth enough, and adapting to different screens is painful.
+
+This is exactly the problem that **component libraries** solve.
+
+A component library is a collection of pre-designed and pre-built UI building blocks. Buttons, inputs, dropdown menus, dialogs, tables... these interface elements appear repeatedly in almost every product. A component library has already built and polished them for you through large-scale real usage. You just combine them like Lego bricks and can quickly build a professional-grade interface.
+
+## What You Will Learn
+
+1. Understand what a frontend component library is, and why modern development almost always uses one
+2. Learn four representative component libraries and the scenarios each one is best at
+3. Through three practical scenarios (landing page, product page, admin dashboard), learn how to do Vibe Coding with AI IDE + component libraries
+4. Learn how to read component-library docs so you can find suitable components and use them correctly
+
+## 1. Why Do We Need Component Libraries?
+
+Imagine furnishing a home. You could build a chair yourself from raw wood, but the common approach is to buy one from IKEA - good design, stable quality, clear instructions, and you just assemble it at home.
+
+Component libraries are the "IKEA" of frontend development. What they provide is not furniture, but interface parts:
+
+| Hand-coding everything | Using a component library |
+| :--- | :--- |
+| You handle styling, interactions, and animation yourself | Ready out of the box, with polished styles and interactions |
+| Buttons may look different across pages | Unified global style and automatic consistency |
+| Mobile/tablet adaptation needs extra work | Most component libraries already include responsive support |
+| Accessibility is easy to miss | Professional libraries already handle keyboard navigation, screen readers, and more |
+| Slower development | Faster development, more focus on business logic |
+
+In short: **component libraries let you spend time on "what to build" instead of "how to draw it."**
+
+### See It Clearly: Same Requirement, With vs. Without a Component Library
+
+Words alone are not convincing. In Trae, we can run almost the same requirement twice: once without specifying a library, and once with one. Then compare the generated results.
+
+**Prompt 1: without a component library**
+
+```text
+Please help me build a data dashboard page for an AI writing assistant, including:
+- a top title bar and an export button
+- four statistic cards showing user count, active users, document count, and revenue, with trend changes
+- one line chart and one pie chart
+- a user list table with pagination
+- a left navigation sidebar
+```
+
+Result when run directly in Trae:
+
+
+
+
+**Prompt 2: use the shadcn/ui component library**
+
+```text
+Please help me build a data dashboard page for an AI writing assistant using the shadcn/ui component library, including:
+- a top title bar and an export button
+- four statistic cards showing user count, active users, document count, and revenue, with trend changes
+- one line chart and one pie chart
+- a user list table with pagination
+- a left navigation sidebar
+```
+
+Result when run directly in Trae:
+
+
+
+
+Same requirement. The only difference is the phrase `using the shadcn/ui component library` added to the first sentence of the prompt. But the generated result jumps to a completely different level in visual consistency, interaction detail, and overall polish. That is the "free upgrade" component libraries bring - you only need to add one library name in your prompt.
+
+## 2. Get to Know Four Core Component Libraries
+
+There are many component libraries (full list in the [appendix](#appendix-more-component-libraries)), but you only need to first understand these four representative ones:
+
+| Component Library | Framework | One-line Positioning | Website |
+| :--- | :--- | :--- | :--- |
+| [Ant Design](https://ant.design) | React | Produced by Ant Group; the de facto standard for enterprise back-office systems, with very broad component coverage | ant.design |
+| [shadcn/ui](https://ui.shadcn.com) | React | No big npm package install; copy component code directly into your project, built on Tailwind CSS, with maximum customization freedom | ui.shadcn.com |
+| [HeroUI](https://heroui.com) (formerly NextUI) | React | Beautiful default styles and smooth animation; great for visually demanding landing pages and product showcases | heroui.com |
+| [Material UI](https://mui.com) | React | The most established React component library, implementing Google Material Design, with the most mature ecosystem | mui.com |
+
+> Vue users also have rich options: [Element Plus](https://element-plus.org) (most popular in China), [Ant Design Vue](https://antdv.com), [Naive UI](https://www.naiveui.com), etc. See the [appendix](#appendix-more-component-libraries).
+
+Different libraries are good at different scenarios. Next, through three real development scenarios, you will experience how to do Vibe Coding with AI IDE + component libraries.
+
+To show different styles and strengths, we intentionally use a different library in each scenario. But note: **this is only to let you see more options**. In real projects, you can absolutely stick to one library you like most. For example, if you like shadcn/ui, you can use it for landing pages, product pages, and admin systems. Pick one that looks good to you and feels comfortable to use - that matters most.
+
+## 3. Scenario One: Build a Product Landing Page with HeroUI
+
+**Scenario**: You built an AI writing assistant and need a beautiful landing page to show product features and attract user sign-ups. The landing page should have strong visual impact, smooth animation, and good mobile appearance.
+
+**Why HeroUI**: HeroUI has very polished default styles and smooth transitions, which makes it ideal for user-facing showcase pages.
+
+### 3.1 Create the Project
+
+```bash
+# Use the official HeroUI CLI
+npx create-heroui-app@latest ai-writer-landing
+cd ai-writer-landing
+npm install
+```
+
+
+
+
+### 3.2 Generate the Landing Page with an AI IDE
+
+Open your AI IDE (Cursor, Trae, etc.) and enter:
+
+```text
+Please help me build a landing page for an AI writing assistant using the HeroUI component library:
+
+**Page structure:**
+1. Top navigation bar: put Logo and product name on the left, three links "Features", "Pricing", "About" on the right, plus a "Get Started" button
+2. Hero section: main headline "Make AI your writing partner", subtitle introducing product value, two buttons "Try Free" and "View Demo", and a product screenshot below
+3. Feature section: three-column cards introducing "Smart Continuation", "Style Adjustment", and "Multilingual Translation"; each card should have icon, title, and description
+4. Pricing section: three pricing cards (Free, Pro, Team), with Pro highlighted as recommended
+5. Bottom CTA: one compelling line of copy and a signup button
+6. Footer: copyright information and social media links
+
+**Design requirements:**
+- modern and professional look
+- support dark mode
+- should also look good on mobile
+```
+
+
+
+
+### 3.3 Key Components the AI Will Use
+
+In the code generated by AI, you will see these HeroUI components:
+
+```jsx
+import {
+ Navbar, NavbarBrand, NavbarContent, NavbarItem,
+ Button,
+ Card, CardHeader, CardBody, CardFooter,
+ Divider,
+ Link,
+ Chip
+} from '@heroui/react'
+```
+
+Role of each component:
+
+| Component | Usage | Position in the landing page |
+| :--- | :--- | :--- |
+| `Navbar` | Top navigation bar | Top of the page, fixed |
+| `Button` | Buttons with multiple variants and colors | CTA buttons, nav buttons |
+| `Card` | Card container | Feature cards, pricing cards |
+| `Chip` | Small badge/label | "Recommended", "Most Popular" markers |
+| `Divider` | Separator line | Visual separation between sections |
+
+### 3.4 Iteration and Refinement
+
+The first generated version may not be perfect. Continue the conversation with AI:
+
+```text
+Please help me improve the landing page:
+
+1. Add a gradient color to the main headline, from blue to purple
+2. Add a hover lift animation to feature cards
+3. Highlight the Pro pricing card with a border and a "Most Popular" badge
+4. On mobile, change the nav bar to a hamburger menu (three horizontal lines)
+```
+
+
+
+
+> **Core idea of Vibe Coding**: You do not need to memorize every component API. Just describe the effect you want in natural language, and AI will choose suitable components and implementation. If something is not ideal, continue iterating in conversation.
+
+## 4. Scenario Two: Build a Product Interface with shadcn/ui
+
+**Scenario**: Your AI writing assistant needs a logged-in main interface - document list on the left, editor on the right, toolbar on top. This is a functional product page that needs highly customizable UI.
+
+**Why shadcn/ui**: shadcn/ui puts component code directly into your project, so you can modify any detail freely. For deeply customized product interfaces, this "own the code" model is the most flexible.
+
+
+
+
+### 4.1 Create the Project
+
+```bash
+# Create a Next.js project
+npx create-next-app@latest ai-writer-app --typescript --tailwind --app
+cd ai-writer-app
+
+# Initialize shadcn/ui
+npx shadcn@latest init
+
+# Add components on demand (do not install everything at once)
+npx shadcn@latest add button card input sidebar sheet dialog dropdown-menu
+```
+
+The unique part of shadcn/ui: each time you `add` a component, it copies source code into your project's `components/ui/` directory. You can open these files and edit styles and behavior directly.
+
+### 4.2 Generate the Product Interface with an AI IDE
+
+```text
+Please help me build the main interface of an AI writing assistant using the shadcn/ui component library:
+
+**Overall layout:**
+- Left side: a collapsible sidebar, about 280px wide:
+ - Put a "New Document" button at the top
+ - Below is a document list; each document shows title and last edited time
+ - Right-click on a document should allow rename or delete
+- Right side: main editor area, split into upper and lower parts:
+ - Top toolbar: editable document title, word count, "AI Continue" button, and an "Export" dropdown
+ - Bottom editor area: one large text input filling remaining space
+
+**Interaction details:**
+- After clicking "AI Continue", the button shows loading state, and AI-generated text appears at the bottom of the editor (shown character by character like a typewriter)
+- On mobile, the sidebar becomes a drawer that slides in from the left
+- The currently selected document should be highlighted
+```
+
+
+
+
+### 4.3 Key Components the AI Will Use
+
+```tsx
+import { Button } from '@/components/ui/button'
+import { Input } from '@/components/ui/input'
+import { Card, CardContent, CardHeader } from '@/components/ui/card'
+import {
+ DropdownMenu,
+ DropdownMenuContent,
+ DropdownMenuItem,
+ DropdownMenuTrigger
+} from '@/components/ui/dropdown-menu'
+import {
+ Sheet,
+ SheetContent,
+ SheetTrigger
+} from '@/components/ui/sheet'
+import {
+ Sidebar,
+ SidebarContent,
+ SidebarHeader
+} from '@/components/ui/sidebar'
+```
+
+| Component | Usage | Position in the product page |
+| :--- | :--- | :--- |
+| `Sidebar` | Collapsible sidebar | Left document list |
+| `Sheet` | Mobile drawer | Mobile replacement for sidebar |
+| `DropdownMenu` | Dropdown menu | "Export" button, right-click menu |
+| `Dialog` | Dialog | Rename and delete confirmation |
+| `Button` | Button, supports variants and loading | Various action buttons |
+| `Input` | Input field | Document title editing |
+
+### 4.4 Customize Component Styles
+
+The advantage of shadcn/ui is that you can modify component source code directly. For example, if you want larger button corner radius:
+
+```text
+Please edit components/ui/button.tsx,
+change all default button radius from rounded-md to rounded-xl,
+and add a subtle shadow effect to the primary variant.
+```
+
+AI will directly modify component files in your project, instead of overriding npm package styles - this is the value of shadcn/ui "code ownership."
+
+
+
+
+## 5. Scenario Three: Build an Admin Dashboard with Ant Design
+
+**Scenario**: After your AI writing assistant launches, you need an admin backend to inspect user data, manage document content, and process paid orders. The core of admin systems is data display and operation efficiency.
+
+**Why Ant Design**: Ant Design has the deepest accumulation in back-office systems. Tables, forms, charts, and other business components are ready out of the box, with many built-in enterprise interaction patterns (batch actions, advanced filters, data export, etc.).
+
+
+
+
+### 5.1 Create the Project
+
+```bash
+# Use Ant Design Pro scaffolding (built-in layout, routing, permissions)
+npx create-umi@latest ai-writer-admin
+# Choose the Ant Design Pro template
+cd ai-writer-admin
+npm install
+```
+
+Or start from scratch:
+
+```bash
+npx create-react-app ai-writer-admin --template typescript
+cd ai-writer-admin
+npm install antd @ant-design/icons @ant-design/pro-components
+```
+
+### 5.2 Generate the Admin Backend with an AI IDE
+
+```text
+Please help me build an admin backend for an AI writing assistant using the Ant Design component library:
+
+**Overall layout:**
+- Left side menu: Dashboard, User Management, Document Management, Order Management, System Settings
+- Top area shows breadcrumb navigation
+
+**User Management page:**
+- Top area has four stats cards: total users, today's new users, active users, paid users
+- Search/filter area: search by username, select registration time range, filter by user status, plus "Search" and "Reset" buttons
+- User table:
+ - Show avatar, username, email, registration time, subscription plan (distinguished by different tag colors), status, operations
+ - 20 rows per page, with pagination
+ - Support batch selection, batch disable, or export
+ - Operation column: view details, edit, disable (disable requires secondary confirmation)
+- Clicking "View Details" opens a right-side drawer showing detailed user information and recent document list
+```
+
+
+
+
+### 5.3 Key Components the AI Will Use
+
+```tsx
+import { PageContainer, ProLayout } from '@ant-design/pro-components'
+import { ProTable } from '@ant-design/pro-components'
+import { StatisticCard } from '@ant-design/pro-components'
+import {
+ Button, Tag, Badge, Space, Drawer,
+ Popconfirm, message, Modal
+} from 'antd'
+import {
+ UserOutlined, SearchOutlined, ExportOutlined
+} from '@ant-design/icons'
+```
+
+| Component | Usage | Position in backend |
+| :--- | :--- | :--- |
+| `ProLayout` | Overall admin layout framework | Page skeleton (menu + content area) |
+| `ProTable` | Advanced table with built-in search, pagination, column settings | User list, document list, order list |
+| `StatisticCard` | Data statistic card | Dashboard and page-top overview |
+| `Tag` / `Badge` | Status tags | Subscription plans, user status |
+| `Drawer` | Side drawer | User details, edit forms |
+| `Popconfirm` | Confirmation popover | Dangerous actions like delete/disable |
+
+### 5.4 Keep Iterating: Add a Dashboard
+
+```text
+Please help me build a dashboard page:
+
+1. Top four statistic cards: total users, total documents, today's API calls, monthly revenue. Each card should show value and period-over-period change (up or down)
+2. Put two charts in the middle:
+ - Left: user growth line chart for the last 7 days
+ - Right: pie chart of subscription plan distribution
+3. Bottom: recent operation log table, showing time, user, operation type, details
+
+Use Ant Design components for layout, and you can use Ant Design Charts for charts.
+```
+
+
+
+
+> **Vibe Coding tip for admin systems**: Admin page structures are relatively fixed (table + search + modal), so they are perfect for batch generation with AI. You can first ask AI to generate one "User Management" page as a template, then say "Based on the same structure, generate a Document Management page." AI will reuse the same layout pattern.
+
+## 6. Learn to Read Docs: The "Manual" of Component Libraries
+
+In Vibe Coding, AI writes most code for you. But when the generated result is not correct, or when you want to fine-tune component behavior, **reading the docs** is the fastest way to solve it.
+
+Take Ant Design as an example. Its docs URL is: `https://ant.design/components/overview`
+
+Standard docs workflow:
+
+1. **Clarify the need**: for example, "I need row selection in a table."
+2. **Search in docs**: search "Table" and enter the table component page
+3. **Check examples**: each component has multiple live examples; find the "selectable rows" example
+4. **Copy code**: copy the example code into your project
+5. **Check API table**: at the bottom of the page, find the full config for `rowSelection`
+
+> You can also send docs links directly to your AI IDE: "Please refer to the rowSelection API in https://ant.design/components/table and help me add batch selection to the user table." Giving AI the docs link makes generated code more accurate.
+
+Quick docs links for each library:
+
+| Component Library | Docs URL |
+| :--- | :--- |
+| Ant Design | `https://ant.design/components/overview` |
+| shadcn/ui | `https://ui.shadcn.com/docs/components` |
+| HeroUI | `https://heroui.com/docs/components` |
+| Material UI | `https://mui.com/material-ui/all-components/` |
+| Element Plus | `https://element-plus.org/en-US/component/overview.html` |
+
+## 7. Summary
+
+The three practical scenarios cover the most common frontend development needs:
+
+| Scenario | Recommended component library | Core strengths |
+| :--- | :--- | :--- |
+| Landing page / showcase page | HeroUI | Beautiful default styles, smooth animation, strong visual impact |
+| Product functional page | shadcn/ui | Full code control, flexible deep customization |
+| Admin system | Ant Design | Rich business components, tables/forms ready out of the box |
+
+Vibe Coding workflow summary:
+
+1. Choose a suitable component library based on the scenario
+2. Use an AI IDE to describe the page structure and interactions you want
+3. The AI generates a first version of the code, and you preview the result
+4. Continue iterating with natural language
+5. When details get stuck, read the component library's docs
+
+### Practice
+
+Pick one scenario below and complete it from scratch with an AI IDE and a component library:
+
+1. Use HeroUI to build a showcase landing page for a project you built earlier (for example, Hogwarts Portraits)
+2. Use shadcn/ui to build the main interface for a note app (sidebar + editor)
+3. Use Ant Design to build a simple content-management backend (article list + new-article form)
+
+---
+
+## Appendix: More Component Libraries
+
+Besides the four core libraries covered in the main text, the frontend ecosystem has many excellent component libraries. Below they are grouped by framework to help you choose by project needs.
+
+### Vue Ecosystem
+
+| Component Library | Stars | Description | Suitable Scenarios |
+| :--- | :--- | :--- | :--- |
+| [Element Plus](https://element-plus.org) | ~27k | Vue 3 enterprise component library from the Ele.me team, most widely used in China, excellent Chinese ecosystem | Back-office admin systems |
+| [Vuetify](https://vuetifyjs.com) | ~41k | Most popular Vue Material Design component library, 80+ components, complete docs | Google-design-style projects |
+| [Ant Design Vue](https://antdv.com) | ~21k | Vue 3 component library based on Ant Design system, unified design specification | Enterprise back-office systems |
+| [Naive UI](https://www.naiveui.com) | ~18k | Written in TypeScript, highly theme-customizable, no CSS preprocessor dependency | Projects with unique design needs |
+| [Quasar](https://quasar.dev) | ~27k | One codebase for SPA, SSR, PWA, mobile, and desktop apps | Cross-platform projects |
+| [Vant](https://vant-ui.github.io/vant) | ~24k | Lightweight mobile component library from Youzan, covering common e-commerce needs | Mobile H5 pages |
+| [PrimeVue](https://primevue.org) | ~14k | 90+ components, multiple themes (Material, Bootstrap, etc.) | Projects needing rich components and multi-theme support |
+| [Arco Design Vue](https://arco.design/vue) | ~3k | Produced by ByteDance, high component quality, built-in dark mode | Back-office products |
+| [TDesign Vue Next](https://tdesign.tencent.com/vue-next) | ~2k | Produced by Tencent, unified design language, covers common desktop scenarios | Tencent ecosystem or enterprise projects |
+
+### React Ecosystem
+
+| Component Library | Stars | Description | Suitable Scenarios |
+| :--- | :--- | :--- | :--- |
+| [Material UI (MUI)](https://mui.com) | ~95k | Long-established implementation of Google Material Design, most complete components, most mature ecosystem | Rapid enterprise app building |
+| [Ant Design](https://ant.design) | ~94k | Produced by Ant Group, many high-quality business components, dominant among Chinese developers | Enterprise back-office systems |
+| [shadcn/ui](https://ui.shadcn.com) | ~83k | Copy code into project instead of npm install, based on Radix UI + Tailwind CSS, fully controllable | Highly customized projects |
+| [Chakra UI](https://chakra-ui.com) | ~39k | Focus on developer experience, concise API, built-in accessibility support | Rapid prototype development |
+| [Mantine](https://mantine.dev) | ~28k | 100+ components and 50+ hooks, including advanced components like date pickers and rich text editors | Teams needing an all-in-one out-of-the-box solution |
+| [Headless UI](https://headlessui.com) | ~27k | Unstyled component library from Tailwind Labs, supports both React and Vue | Best with Tailwind CSS |
+| [HeroUI](https://heroui.com) | ~24k | Based on Tailwind CSS + React Aria, beautiful defaults, smooth animation | Projects pursuing visual quality |
+| [Radix UI](https://www.radix-ui.com) | ~17k | Unstyled primitive component library focused on accessibility and behavior; foundational layer of shadcn/ui | Building custom design systems |
+
+#### shadcn/ui Extension Ecosystem
+
+Beyond the general component libraries above, the shadcn/ui ecosystem has also produced many extension libraries based on the same philosophy, offering differentiated choices for specific scenarios. These extensions also use the "copy code into project" model, giving developers full source-code control.
+
+| Component Library | Description | Suitable Scenarios |
+| :--- | :--- | :--- |
+| [Aceternity UI](https://ui.aceternity.com) | 200+ production-grade components, featuring glow cards, gradient text, 3D earth, and other signature visual components | High-polish landing pages, SaaS products |
+| [Tailark UI](https://tailark.com) | Collection of marketing website blocks, including frequent modules like product showcases, testimonials, and CTA buttons | Marketing landing pages, product websites |
+| [UI Tripled](https://ui.tripled.work) | Dynamic interaction components based on Framer Motion, including modal, navigation, card animation | Creative tools, personal portfolios |
+| [Neobrutalism UI](https://neobrutalism.dev) | Neo-brutalism style with thick lines, high contrast, and bold colors | Personalized brand websites, creative projects |
+| [REUI](https://reui.io) | 967+ component composition patterns from real business scenarios | Enterprise backends, complex forms |
+| [Cult UI](https://cult-ui.com) | More refined interaction and visual polish, including compound components like data tables and filter panels | High-quality commercial products |
+| [Kibo UI](https://kibo-ui.com) | Advanced business components such as color picker, rich text editor, file upload | Admin systems, tool products |
+| [Kokonut UI](https://kokonutui.com) | 100+ components + 7+ complete templates, fresh and minimalist style | SaaS sites, blogs, e-commerce |
+| [Commerce UI](https://ui.stackzero.co) | Specialized for e-commerce scenarios, including product cards, shopping cart, checkout forms | E-commerce platforms |
+| [shadcnblocks](https://shadcnblocks.com) | 1373 UI blocks + 13 complete templates, most comprehensive resources | All scenarios |
+| [Shoogle](https://shoogle.dev) | Aggregated search platform for shadcn/ui ecosystem | Quickly finding resources |
+| [Discover All Shadcn](https://allshadcn.com) | Aggregated resource navigation | Quickly finding resources |
+
+> **Why choose shadcn/ui extensions?** These extensions inherit the shadcn/ui "code ownership" philosophy, while adding deep customization for specific scenarios. In the Vibe Coding era, they help you quickly find components that match your design goals, break away from homogenized mainstream UI patterns, and build more differentiated products.
diff --git a/docs/en/stage-2/index.md b/docs/en/stage-2/index.md
index d37c4e8..772d0ef 100644
--- a/docs/en/stage-2/index.md
+++ b/docs/en/stage-2/index.md
@@ -1,78 +1,88 @@
-# Full-Stack Development
+# Junior Developer
-Welcome to the **Full-Stack Development** stage! Here, you will dive deep into full-stack development, mastering frontend componentization, database design, backend API development, and deployment.
+Welcome to the **Junior Developer** stage! Here, you will go deeper into full-stack development and learn modern frontend workflows, database design, backend APIs, deployment, and AI-powered product building.
## What You Will Learn
### Frontend Development
-Master modern frontend development and learn to use component libraries and design tools:
+Master modern frontend development and learn how to use design tools, component libraries, and AI-native UI workflows:
+
+
+
-### Backend & Full-Stack
+### Backend Development
Learn API design, database management, and application deployment strategies:
-
@@ -83,12 +93,12 @@ Learn API design, database management, and application deployment strategies:
Consolidate your full-stack development skills through practical projects:
@@ -98,29 +108,24 @@ Consolidate your full-stack development skills through practical projects:
### AI Capabilities Extension
-
## Who Is This For
-- Developers with some programming foundation who want to systematically learn full-stack development
+- Developers with some programming foundation who want to systematically learn modern full-stack development
- Learners transitioning from product manager to full-stack engineer
- Junior to intermediate developers who want to master modern development tools and workflows
- Entrepreneurs who want to independently develop complete products
## Prerequisites
-- Complete the "Novice & Product Prototype" stage, or have equivalent basic knowledge
+- Complete the "Novice & Product Prototype" stage, or have equivalent foundational knowledge
- Understand basic HTML/CSS/JavaScript concepts
-- Have preliminary knowledge of AI programming tools
+- Have a basic understanding of AI coding tools
-Ready to dive deep into full-stack development? Click the left navigation to begin learning!
+Ready to move from product prototype to real full-stack delivery? Use the left navigation to start learning.
diff --git a/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md b/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md
new file mode 100644
index 0000000..fa4ebf7
--- /dev/null
+++ b/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md
@@ -0,0 +1,938 @@
+As large language models (LLMs) are adopted more widely, enterprises face a very practical problem: how can a model answer questions accurately when those questions depend on internal documents, real-time data, or domain-specific knowledge? After all, a model's training data is limited and time-bounded, so it cannot cover company-specific business knowledge or constantly updated information.
+
+One intuitive idea is this: since context windows keep getting larger, from 8K to 128K and now beyond one million tokens, why not just stuff the relevant documents into the prompt and let the model answer from those materials directly?
+
+However, being able to process long context and being able to deliver correct answers stably, efficiently, and controllably in enterprise scenarios are two very different things. Blindly relying on long context brings a series of severe challenges, including exploding cost, diluted attention, and stale knowledge.
+
+To solve these pain points, a technique called Retrieval-Augmented Generation, or RAG, emerged. Before the model generates an answer, RAG first retrieves precise external knowledge. Compared with simply expanding the context length in a brute-force way, RAG meets enterprise requirements for factual accuracy and fresh knowledge at lower cost, with higher accuracy and stronger controllability. It has therefore become a key foundation for building trustworthy AI applications.
+
+In this tutorial, we will systematically explain what RAG is, trace the background behind its emergence and its core principles, and then explore its evolution from basic forms to advanced forms, along with where it may go next.
+
+# What You Will Learn in This Lesson
+
+- The core value of RAG: deeply understand how it addresses the central long-context problems of cost, attention, and knowledge freshness
+- How RAG works: see through concrete examples how it completes the full loop from retrieval to generation
+- The evolution of RAG: from basic Naive RAG to Advanced RAG and then to Modular RAG
+- Model selection for RAG: understand how to evaluate and choose the three key model types, Embedding, Rerank, and LLM
+- Enterprise RAG practice: learn the full-chain construction guide from data preprocessing to system deployment and evaluation
+- RAG evaluation and optimization: understand core metrics, mainstream frameworks, and continuous improvement methods
+- Frontier trends in RAG: explore how RAG is combining with agents, multimodality, and other emerging techniques
+
+# What You Will Gain
+
+After completing this tutorial, you will build a systematic beginner-level understanding of RAG technology. You will not only know what it is, but also why it works. You will also gain a clear blueprint for how to evaluate, choose, and design an efficient, reliable, and controllable RAG system that meets enterprise requirements, laying a solid foundation for building real enterprise-grade RAG applications.
+
+# 1. Why RAG Is Needed
+
+Retrieval-Augmented Generation (RAG) is one of the most important technical approaches in generative AI today. Its basic idea is simple: before asking a large model to generate an answer, the system first retrieves information related to the user's question from an external knowledge base, and then passes both the retrieved information and the original question to the model so the model can answer on top of real materials. That external knowledge base can be an enterprise's internal policies, process documents, and product knowledge, or an industry database, regulatory corpus, standards library, and so on.
+
+
+
+At this point, a natural question appears: if large models can already "answer questions directly," why add another layer called Retrieval-Augmented Generation? Especially now that context windows are getting larger and larger, it can seem as if simply handing all relevant material to the model ought to solve most needs.
+
+The real difference is that "being able to produce an answer" and "being able to continuously, stably, and controllably produce the right answer in a real business environment" are two completely different things. If you rely only on a model's parameter memory, or only on dumping large amounts of documents into a long context, at least three typical problems still appear in enterprise use.
+
+1. Cost and efficiency problems:
+   Even as context windows keep expanding, the idea of dumping all documents into the context at once is still impractical in real systems. The central contradiction shows up in two places:
+   - Inference cost is strongly positively correlated with context length. The longer the context, the more inference cost rises, almost linearly and sometimes even superlinearly. For a single call, 8K tokens and 200K tokens live in completely different price and latency ranges, and long context has a much higher cost threshold.
+
+
+
+     > Conceptually, context is the background information and conversation history the model "refers to" when answering a question. In technical terms, it is the total token sequence fed into the model for one inference, such as system and user instructions, message history, and retrieved passages.
+     >
+     > A "context window" is the capacity limit for that input. In mainstream large-model architectures today, such as Transformers, those tokens participate in attention computation at every layer. Once the window becomes longer and the token count increases, compute and cost rise superlinearly: standard self-attention scales roughly quadratically with sequence length.
+
+   - A large amount of compute is wasted. Most tasks need only a very small amount of information that is highly relevant to the current question. Stuffing the full document set into the context creates serious idle and wasted computation, lowers system throughput, slows response speed, and eventually harms user experience.
+2. Attention and focus problems:
+   A large model may be able to "cover" ultra-long context, but it cannot use every segment with equal quality. Once context length crosses a certain threshold, the model begins to show obvious attention bias:
+   - Attention decay: the model's attention to early and middle parts of the context gradually weakens, and it tends to rely more on text it read later, so early critical information can be effectively ignored.
+   - Information interference: the model can easily be dragged off course by irrelevant, repetitive, or even conflicting information inside the context. The final answer may sound logically coherent while still drifting away from the core question, making accuracy hard to guarantee.
+
+   Without a retrieval stage to filter and rank relevance, the longer the context becomes, the harder it is to keep the answer focused on the truly key evidence. The advantage of long context can be fully canceled out by information interference.
+3. Knowledge freshness and controllability problems:
+   If all knowledge is stored entirely in model parameters, or manually copied into prompts, two unavoidable defects appear:
+   - Knowledge updates are difficult: once the knowledge changes, such as policy changes, product iterations, or price updates, you either need to retrain or fine-tune the model, which is costly and slow, or maintain prompt templates manually, which is also costly and prone to human error.
+   - Traceability is poor: when a model answers, it is often difficult to locate the exact pieces of evidence from either black-box parameters or long prompts. This makes compliance audits, risk explanations, and other tasks that require clear decision grounds extremely difficult.
+
+Under these real constraints, the advantage of RAG becomes much clearer. Its core approach is to locate relevant and reliable information before generation, so the model answers only from necessary knowledge. Knowledge can be stored independently in an external knowledge base, making it easier to update and manage. At the same time, generated results can include cited sources, improving interpretability and trustworthiness. Even if context windows keep growing in the future, RAG will still enable efficient knowledge management and use at relatively low cost, supporting enterprise-grade knowledge applications whose process is observable and whose behavior is traceable.
+
+From the perspective of enterprise requirements, compared with a traditional LLM that relies only on its internal parameters, RAG mainly solves the following real-world deployment problems:
+
+1. Freshness:
+ Traditional models usually do not know new regulations, products, or workflows that appeared after their training cutoff, but RAG can directly read the latest policy documents, business databases, and knowledge bases. Without frequent retraining, answers can stay synchronized with the latest business state.
+2. Specialization:
+ In vertical domains such as healthcare, chemicals, or finance, general-purpose models often do not understand deeply enough or speak precisely enough. After connecting enterprise-owned domain documents and industry standards, answers can be grounded in authoritative materials and become much closer to real business practice.
+3. Hallucination:
+ By requiring answers to stay grounded in retrieved passages and provide citations, the system can reduce unsupported fabrication at the mechanism level, making "sounds true" much closer to "is actually true."
+4. Explainability and auditability:
+ Pure parameter-based models often cannot answer, "Which rule was this conclusion derived from?" RAG lets each answer be traced back to a specific policy clause, business document, or historical case. That helps business staff inspect and correct answers and gives audit, risk, and compliance teams the traceability they need.
+5. Compute cost and resource efficiency:
+ Making a model memorize all enterprise knowledge in its parameters usually means a larger model and higher inference cost. RAG stores most knowledge outside the model in vector stores and document stores and retrieves it on demand, allowing enterprises to get broader coverage and more accurate detail even with smaller models and limited compute.
+
+Therefore, for enterprises that want to use large models in real business scenarios over the long term, stably and controllably, RAG is not an optional enhancement. It is almost an essential foundational technology for building a high-quality enterprise knowledge application system.
+
+# 2. What RAG Is
+
+The core idea of RAG, Retrieval-Augmented Generation, is to let a large model answer questions not only with static knowledge learned during training, but also with up-to-date and reliable information pulled from an external knowledge base at runtime.
+
+In a typical RAG system, the user's question is not sent directly to the large model. Instead, a retrieval module first finds the most relevant document passages from the enterprise knowledge base, then combines those passages with the original question into a complete context, and finally gives that to the model to generate an answer. This "retrieve first, generate second" pattern allows the model to reason from real reference material instead of only guessing from what it remembers in its parameters. We can look at a typical case:
+
+
+
+1. Indexing stage
+
+ In the indexing stage, the system first processes raw material such as internal enterprise documents, web pages, and reports. It splits them into smaller semantic chunks, then uses an embedding model to generate vector representations for each chunk and builds an index. Later, when a user question arrives, the system can quickly find the most semantically similar chunks in vector space.
+
+ In the diagram, this corresponds to the purple "Indexing" area in the upper right. The path from "Documents" through "Chunks / Vectors" to "embeddings" shows documents being chunked, converted into vectors, and written into the index. More concretely:
+
+ - Documents are divided into a set of semantically coherent chunks, each of which may correspond to a short news passage, explanation, or analysis.
+ - Each chunk is converted into a high-dimensional vector by the embedding model and stored in the vector index.
+ - This index supports similarity-based retrieval later, preparing a knowledge base the system can consult when answering questions.
+
+2. Retrieval stage plus answer generation from retrieved results
+
+ After the user asks a question, the system first retrieves relevant content from the index, then sends the question and retrieved text together to the large model to generate an answer. In the figure, the key areas from upper to lower and right to left correspond exactly to this full flow.
+
+ (1) User input question: the yellow Input - Query area
+
+ > "How do you evaluate the fact that OpenAI's CEO, Sam Altman, went through a sudden dismissal by the board in just three days, and then was rehired by the company, resembling a real-life version of 'Game of Thrones' in terms of power dynamics?"
+ >
+ > "How do you evaluate the fact that OpenAI CEO Sam Altman was suddenly dismissed by the board and then rehired by the company just three days later, making the power struggle resemble a real-life version of Game of Thrones?"
+
+ This large block of text is the content inside the "Query" box in the diagram, corresponding to the user's natural-language question. The system vectorizes that question and uses it to search the upper-right index for related document chunks.
+
+ (2) Retrieved relevant documents: the pink Relevant Documents area at the lower right
+
+ After retrieval, the system gets several document chunks most related to the question. In the diagram, they are shown as three chunks:
+
+ > "Sam Altman Returns to OpenAI as CEO, Silicon Valley Drama Resembles the 'Zhen Huan' Comedy"
+ > "Sam Altman returns as OpenAI CEO, and this Silicon Valley drama resembles a court-intrigue comedy."
+ >
+ > "The Drama Concludes? Sam Altman to Return as CEO of OpenAI, Board to Undergo Restructuring"
+ > "Is the drama ending? Sam Altman will return as CEO of OpenAI, while the board will be restructured."
+ >
+ > "The Personnel Turmoil at OpenAI Comes to an End: Who Won and Who Lost?"
+ > "OpenAI's personnel turmoil comes to an end: who won and who lost?"
+
+ (3) Combine the prompt and generate the answer: the blue LLM / Combine Context and Prompts area
+
+ The system then combines the original user question and the retrieved chunks into a complete prompt and sends it to the model. The dashed box in the lower middle of the figure shows a prompt example:
+
+ > "Question:
+ > How do you evaluate the fact that the OpenAI's CEO, ... dynamics?
+ >
+ > Please answer the above questions based on the following information:
+ > Chunk 1:
+ > Chunk 2:
+ > Chunk 3:"
+ >
+ > "Question:
+ > How do you evaluate the power struggle in the OpenAI CEO incident?
+ >
+ > Please answer the above question based on the information below:
+ > Chunk 1:
+ > Chunk 2:
+ > Chunk 3:"
+
+ (4) Answer comparison with and without RAG: the gray and yellow Output - Answer areas in the lower left
+
+ Finally, the model generates an answer based on the provided information. The figure also compares outputs with and without RAG. Without RAG, the model has no external material and can only give a vague response, corresponding to the gray box:
+
+ > "... I am unable to provide comments on future events. Currently, I do not have any information regarding the dismissal and rehiring of OpenAI's CEO ..."
+
+ With RAG, the model can use the retrieved news and analysis to produce a much more informative answer, corresponding to the yellow box:
+
+ > "... This suggests significant internal disagreements within OpenAI regarding the company's future direction and strategic decisions. All of these twists and turns reflect power struggles and corporate governance issues within OpenAI ..."
+
+The example above shows the full flow of a typical RAG system and helps us understand its core stages and how information moves through them. But many important technical details remain inside a black box: how exactly is vector matching performed, and how should the prompt be organized so the model can use the retrieved content more effectively? These details largely determine real RAG quality. Next, we will go deeper into RAG's internal mechanism and break it down step by step, from vectorization principles and similarity computation to prompt engineering.
+
+# 3. How RAG Works
+
+We can break it down through a simple question-answering example built on a knowledge base about "apple."
+
+## 3.1 Document Vectorization Stage
+
+Suppose we have a simplified knowledge base containing these three document passages:
+
+1. Passage A: Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California.
+2. Passage B: Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health.
+3. Passage C: Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry.
+
+When we process these documents with an embedding model, such as OpenAI's `text-embedding-ada-002` or an open-source BGE model, each passage is converted into a high-dimensional vector, often with 768, 1024, or 1536 dimensions.
+
+> A vector is essentially an array made of many numeric values. As an intuition, you can think of each dimension as capturing some semantic feature of the text. For example, the vector for "cat" may encode features related to mammal, household pet, and furry. (In real embedding models the dimensions are learned automatically and are usually not individually interpretable.) The final combination of values captures the semantic meaning of the text so the computer can "understand" relationships between texts.
+
+Simplified examples, with real vectors being much higher-dimensional:
+
+- Vector for passage A, about Apple's founding: `[0.85, -0.23, 0.41, -0.56, 0.12, 0.78, ...]`
+- Vector for passage B, about apples as fruit: `[-0.12, 0.95, -0.34, 0.67, -0.89, 0.05, ...]`
+- Vector for passage C, about the iPhone launch: `[0.79, -0.18, 0.52, -0.61, 0.23, 0.81, ...]`
+
+These vectors then need to be stored in a vector database or vector index, such as Pinecone, Weaviate, or FAISS (a similarity-search library), for later retrieval and recall.
+
+> A database is a system that stores and manages data in a structured way, enabling organized storage and efficient retrieval. Common examples include contact lists and e-commerce product catalogs.
+>
+> A vector database is a specialized kind of database. Unlike traditional databases, which store text, tables, and other ordinary data structures, a vector database is designed specifically to store vectors, that is, high-dimensional numeric arrays, and it is optimized for similarity search in AI scenarios.
+
+## 3.2 User Query, Retrieval, and Response Stage
+
+Once the knowledge base has been vectorized and stored, a RAG system can support real-time user queries. When a user asks a question, the system executes a continuous flow: it first converts the question into a vector, then uses similarity computation to retrieve the most relevant information from the knowledge base, and finally uses those passages as the basis for answer generation. We can illustrate this process with three concrete queries.
+
+### Query 1: "When was Apple Inc. founded?"
+
+At the query-vectorization stage, the question is converted by the embedding model into a semantic vector, for example `[0.82, -0.21, 0.38, -0.58, 0.15, 0.76, ...]`. This numeric pattern is highly similar to the stored vector for passage A, the one about the company's founding.
+
+The system then performs similarity retrieval, Top-K with K = 2, by computing cosine similarity between the query vector and all document vectors in the knowledge base. The result looks like this:
+
+- Similarity with passage A, the founding passage: 0.97, highly relevant
+- Similarity with passage C, the iPhone launch passage: 0.88, relevant because it is also about the company
+- Similarity with passage B, the fruit nutrition passage: 0.12, almost irrelevant
+
+> Top-K is a common selection strategy in vector retrieval. It means ranking all matches from highest to lowest similarity and keeping the top K results. K = 2 means the system retains only the top two document vectors by similarity and filters out lower-ranked ones, so the next stage generates the answer only from the two most relevant document passages.
+
+The results filtered by similarity are called recall results. The system returns the Top-2 passages as evidence:
+
+1. Passage A, similarity 0.97: "Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California."
+2. Passage C, similarity 0.88: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry."
+
+At the answer-generation stage, the system builds a complete structured input by placing the recalled content inside the reference information section and sending it together with a system prompt:
+
+```text
+[System Prompt]
+You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user.
+If the reference information contains the answer, answer directly based on it.
+If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information.
+Please indicate which information point your answer is based on.
+
+[Retrieved Context]
+Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California.
+Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry.
+
+[User Query]
+When was Apple Inc. founded?
+```
+
+After receiving this structured input, the LLM follows the system instruction and treats the retrieved context as the only trustworthy source for answering. Its final response would look like this:
+
+> According to the provided reference information, Apple Inc. was founded on April 1, 1976. [Basis: Information 1]
+
+### Query 2: "What are the benefits of eating apples?"
+
+At the query-vectorization stage, this question is converted into a semantic vector such as `[-0.08, 0.92, -0.31, 0.71, -0.85, 0.08, ...]`. Its numerical pattern is highly similar to the stored vector for passage B, the one about apple nutrition.
+
+The system again performs Top-K similarity retrieval with K = 2 and computes cosine similarity:
+
+- Similarity with passage B, fruit nutrition: 0.95, highly relevant
+- Similarity with passage C, iPhone launch: 0.18, almost irrelevant
+- Similarity with passage A, company founding: 0.15, almost irrelevant
+
+The system returns the Top-2 passages as evidence:
+
+1. Passage B, similarity 0.95: "Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health."
+2. Passage C, similarity 0.18: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry." This is only weakly related and would often be filtered by a threshold in practice.
+
+The complete structured input is then built as follows:
+
+```text
+[System Prompt]
+You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user.
+If the reference information contains the answer, answer directly based on it.
+If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information.
+Please indicate which information point your answer is based on.
+
+[Retrieved Context]
+Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health.
+Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry.
+
+[User Query]
+What are the benefits of eating apples?
+```
+
+Its final response would then look like:
+
+> According to the provided reference information, apples are rich in vitamin C and dietary fiber, and eating apples helps digestion and immune-system health. [Basis: Information 1]
+
+### Query 3: "How is the weather today?"
+
+At the query-vectorization stage, this question becomes a semantic vector related to weather and meteorology, for example `[0.10, -0.05, 0.30, -0.12, 0.21, 0.08, ...]`. In semantic space, this vector is far away from all document vectors about apples, whether the company or the fruit, so no significant similarity appears.
+
+The system again performs Top-K retrieval with K = 2. Because the question topic is unrelated to the knowledge base, overall similarity scores are all very low:
+
+- Similarity with passage B, fruit nutrition: 0.18, extremely low
+- Similarity with passage C, iPhone launch: 0.10, almost irrelevant
+- Similarity with passage A, company founding: 0.08, almost irrelevant
+
+Top-K still returns the top-ranked K results, but in this case those results do not provide effective evidence. In practice, the system often applies a minimum similarity threshold and directly returns empty recall, that is, no valid results, to reduce irrelevant interference.
+
+Without such a threshold, the two returned passages would still be:
+
+1. Passage B, similarity 0.18: "Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health."
+2. Passage C, similarity 0.10: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry."
+
+The full input would then be:
+
+```text
+[System Prompt]
+You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user.
+If the reference information contains the answer, answer directly based on it.
+If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information.
+Please indicate which information point your answer is based on.
+
+[Retrieved Context]
+Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health.
+Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry.
+
+[User Query]
+How is the weather today?
+```
+
+The LLM would first judge whether the reference information contains direct weather or real-time meteorological information. After confirming that it does not, it would follow the system prompt's instruction and state that the question cannot be answered:
+
+> The currently available materials cannot answer the question "How is the weather today?" because the reference information only contains content related to apples, fruit nutrition, and Apple Inc. products, and does not contain weather information or real-time meteorological data. [Basis: No weather-related information exists in the retrieved context]
+
+From these three examples, we can see the key to the RAG dialogue stage. The system prompt defines the LLM's role and response rules, retrieved evidence provides concrete and trustworthy material, and the user's question defines the task objective. This structured-input pattern is exactly what lets RAG effectively guide and constrain an LLM that might otherwise hallucinate, turning it into a system that produces stable and reliable answers. It ensures that the model is used for understanding and organizing existing information rather than inventing unsupported information.
+
+# 4. The Evolution of RAG
+
+RAG did not originate in the era of large models. Earlier research already contained prototypes of the same idea. From a historical perspective, RAG arose from recognition of the limitations of traditional LLMs. Early large language models depended mainly on pretraining data, and that data became fixed once training finished. For example, models such as GPT-3 had knowledge cutoff dates tied to when the training data was collected and could not obtain later knowledge. Retraining or fine-tuning LLMs for specific domains also required large resources and specialized expertise, making it expensive and hard to iterate quickly.
+
+The roots of RAG can be traced back to the DrQA framework in 2017, which first attempted to combine document retrieval with a neural reading-comprehension model. A major breakthrough then came in 2020 with Dense Passage Retrieval, or DPR, which used pretrained neural models for semantic retrieval instead of traditional word-frequency-based methods such as TF-IDF and BM25. Later that same year, RAG was formally proposed and systematized, and it has since become a standard way to address the knowledge-cutoff and hallucination problems in LLMs.
+
+Broadly speaking, the evolution of RAG can be divided into three stages:
+
+
+
+## 4.1 First-Generation RAG: Naive RAG
+
+Naive RAG is the most basic form of RAG. From an engineering perspective, it follows a very direct three-step flow:
+
+1. Document preprocessing and indexing. Raw documents are cleaned, split into fixed-length text chunks, encoded into vectors with an embedding model, and written into a vector database.
+2. Similarity-based retrieval. The user's natural-language question is encoded into a vector, and the system performs a Top-K similarity search over the vector store.
+3. Simple retrieval-augmented generation. The retrieved chunks are directly concatenated with the original question to form a long prompt, which is sent to the LLM for answer generation.
+
+The value of this stage is that it verified, with a very low barrier, that "retrieve before answering" actually works. Compared with relying only on the model's internal memory, it already significantly reduces knowledge-cutoff issues and some hallucinations, which is why it played an important role in early prototypes, demos, and introductory tutorials.
+
+However, the limitations of first-generation RAG are also obvious. First, the chunking strategy is usually crude. Most systems simply split by fixed length, which can cut a coherent semantic paragraph in the middle or mix multiple topics inside one chunk. This hurts retrieval accuracy and also makes comprehension harder for the LLM. Second, the retrieval signal is too simple. Ranking usually depends only on vector similarity and does not use richer structured clues such as keywords, timestamps, source credibility, or access permissions. Third, retrieval results are barely governed at all: noisy, repetitive, and even contradictory chunks can be stuffed into the context unchanged, causing large amounts of low-value information to occupy an already limited context window.
+
+In short, the first generation solved the question of whether retrieval is needed. But on the questions of how to retrieve better, and how to use retrieved information more reasonably, it still remained at a rather primitive stage.
+
+## 4.2 Second-Generation RAG: Advanced RAG
+
+As RAG moved from demos into real business scenarios, the requirements for stability, controllability, and output quality rose sharply. The second generation, usually grouped under the broad name Advanced RAG, still follows the pattern of retrieve first and generate second, but it introduces systematic refinement both before and after retrieval. In other words, the system is no longer satisfied with merely retrieving something. It now aims to store the right things properly, ask the right questions clearly, and govern the retrieved context carefully.
+
+Before retrieval, the focus is on storing and asking well:
+
+- On the indexing side, chunking evolves from fixed-length splits to semantically aware chunking and hierarchical indexing. The system may chunk along chapter, subsection, paragraph, or sentence boundaries, combined with sliding windows and multi-granularity index structures.
+- Each document chunk can carry rich metadata such as source, timestamp, author, topic, and document type, providing more dimensions for later filtering and ranking.
+- On the query side, the user's original question can be rewritten, expanded, or decomposed through techniques such as Query Rewrite, Multi-Query, Sub-Query decomposition, and Step-back Prompting, transforming vague or conversational user queries into forms that retrieval can understand better.
+
+ > 1. Query Rewrite
+ >
+ > The core idea is to transform the user's vague, colloquial, or nonstandard query into a normalized expression that the retrieval system can understand more easily, supplementing key information and resolving ambiguity.
+ >
+ > - For example, "How do I check tomorrow's weather in Beijing?" might be rewritten into something more standardized such as "Query tomorrow's full-day weather forecast for Beijing."
+ > - Or "Recommend good movies" may be rewritten, after looking at user history, into "Recommend high-rated 2024 suspense movies."
+ >
+ > 2. Multi-Query
+ >
+ > The system generates multiple semantically related but differently angled queries from the original question to reduce missed results and cover latent needs the user did not explicitly state.
+ >
+ > 3. Sub-Query
+ >
+ > For compound questions that contain several goals, the system splits them into smaller, simpler sub-queries so retrieval can match each need precisely.
+ >
+ > 4. Step-back Prompting
+ >
+ > The system first generates a more abstract, higher-level question, then uses that to guide retrieval direction, reducing bias caused by being too narrowly focused on details in the original question.
+
+After retrieval, the focus is on governing what was retrieved:
+
+- A dedicated rerank model or even an LLM can rerank candidate documents so the most important and question-relevant content enters the context first.
+ > A rerank model is a key component in an information-retrieval pipeline. It performs second-stage ranking on candidate results returned by the recall phase, using stronger semantic understanding, often based on Transformer architectures, to fix semantic ranking errors from the first stage and move the results most aligned with user needs further forward.
+- Retrieved passages can be filtered, deduplicated, and compressed to remove clearly irrelevant or highly repetitive chunks, reducing the tendency of long-context systems to ignore useful information in the middle.
+- When necessary, light model fine-tuning can make the LLM more likely to answer from retrieval evidence and include explicit citations or sources.
+
+Overall, Advanced RAG is no longer focused only on whether retrieval is necessary or whether something can be retrieved. It instead addresses three larger challenges: whether the truly critical passages can be located precisely, whether the context handed to the large model is concise, well-structured, and easy to use efficiently, and whether the whole system remains stable and reliable in the presence of noise, conflict, or multi-source information needs.
+
+Large amounts of experimental and engineering evidence show that Advanced RAG significantly outperforms Naive RAG on answer accuracy, hallucination suppression, system robustness, and explainability. That is why it has gradually replaced traditional basic approaches and become the mainstream industrial paradigm for building RAG systems today.
+
+## 4.3 Third-Generation RAG: Modular RAG
+
+In complex enterprise applications, requirements often span multiple domains. In those cases, a simple linear flow of retrieve, rerank, and generate is often not enough:
+
+1. The same system may need to support simple FAQs, long report generation, code retrieval, and database calls.
+2. It may need to connect vector stores, full-text retrieval, relational databases, knowledge graphs, and external search engines at the same time.
+3. It may need to preserve user preferences and historical decisions over multiple rounds, while also applying compliance checks and answer traceability.
+
+Against this background, RAG began evolving toward a modular system shape. Modular RAG is no longer viewed as a fixed pipeline. It is treated instead as a set of pluggable, replaceable, and composable function modules that can be orchestrated as needed. Typical modules include:
+
+1. Query understanding and routing
+ This module handles intent recognition, question rewriting, subtask decomposition, and path selection. It decides whether a request should rely mainly on internal knowledge, external retrieval, or a specific tool or database.
+2. Multi-source retrieval and fusion
+ This module connects vector databases, full-text search, structured databases, and knowledge graphs simultaneously, queries them, and merges and reranks their results into a unified evidence set.
+3. Memory and personalization
+ This module maintains long-term user profiles, short-term session memory, and domain knowledge caches so the system can continuously accumulate and use historical information.
+4. Task adaptation and governance
+ This module loads different adapters for different tasks, constrains output format, tone, and style, and governs outputs through fact checking, risk filtering, and citation alignment.
+
+In short, traditional RAG often ends after one retrieval round plus one generation round. Modular RAG breaks that single-flow pattern. If the system discovers during generation that information is still insufficient, it can proactively trigger new retrieval rounds and even move back and forth multiple times between retrieval and generation to complete a more complex task.
+
+Going further, the model can learn to make its own decisions: answer directly from internal knowledge or short context when confidence is high, and launch retrieval or external tool calls only when uncertainty is high. That improves efficiency and saves resources while preserving quality. For heavily underspecified or incomplete queries, the model can even generate a hypothetical intermediate answer or draft document first, then use that as a clue for further retrieval, progressively approaching reliable sources.
+
+At this stage, RAG is no longer just a simple component that attaches a few reference passages to a large model. It is becoming the central knowledge-orchestration layer inside enterprise intelligent applications, coordinating multiple data sources, multiple tools, and multiple tasks.
+
+# 5. From Demo to Enterprise-Grade RAG
+
+From the perspective of enterprise engineering, building a RAG system cannot be limited to retrieval-augmented generation alone. The material above is still closer to a demo-level introduction. In real business scenarios, data is often noisy and inconsistent in format, so more effort must be invested into preprocessing, cleaning, and ingestion, and model selection must be handled carefully at every key point.
+
+A complete enterprise-grade RAG system can usually be divided into three core modules: layout analysis and knowledge ingestion, knowledge-base construction, and RAG-based question-answering service. Across the full technical chain, several key model-selection decisions appear, including the embedding model, rerank model, and LLM. Only with sensible technical choices at each stage can the system achieve strong overall results.
+
+1. Layout analysis and local knowledge-file reading
+
+ This module converts local knowledge assets in different formats into text usable for retrieval. Inputs may include PDFs, TXT, HTML, Word, Excel, and PPT files, as well as scanned image files such as PNG and JPG, or even audio recordings.
+
+ The system needs to parse each format appropriately, perform layout analysis and structural extraction for text documents, distinguish titles, main body, tables, headers, and footers, and restore a sensible reading order. It performs OCR on image files and ASR on speech, finally converting everything into relatively clean knowledge text while retaining basic metadata such as file name, chapter, page number, and timestamp for later chunking and indexing.
+
+2. Knowledge-base construction: chunking, embeddings, and indexing
+
+ After obtaining cleaned knowledge text, the system performs chunking, splitting long documents into semantically coherent blocks of suitable length, usually by paragraph, title structure, or sliding window, while preserving each chunk's source and metadata.
+
+ Then it uses the chosen embedding model, such as `text-embedding-3-small`, Sentence Transformers, or BGE, to calculate vector representations for each chunk and build a vector index using tools such as Faiss, Milvus, or managed vector-search services. At that point, a knowledge base that supports fast semantic retrieval has been created.
+
+3. RAG-based question answering: recall, reranking, concatenation, generation
+
+ In the online QA stage, the user sends a query. The system embeds it into a query vector, retrieves a batch of the most similar text chunks from the vector index, and treats that as a coarse ranking stage. Then it can use a rerank model such as a BGE reranker or even an LLM acting as a reranker to score query-document pairs again and keep only the Top-K documents that are truly most relevant as the knowledge context.
+
+ Next, together with a carefully designed system prompt such as "Please answer strictly based on the following materials," the system concatenates the user query and retrieved document passages and sends the merged prompt to the LLM. The model then generates the final answer from those retrieved pieces of evidence and, when needed, includes citations or sources.
+
+## 5.1 Model Selection
+
+Next we focus on model selection. A complete RAG system usually involves three core model categories: embedding models, rerank models, and large language models. Each has its own role, and together they form the full path from retrieval to answer generation. The embedding model converts text into searchable semantic vectors, the rerank model refines initial retrieval results, and the LLM generates the final answer based on the selected knowledge context.
+
+### 5.1.1 Embedding Models
+
+In a RAG system, the job of the embedding model is to convert text, such as user queries and knowledge-base content, into high-dimensional vectors. Semantically similar texts are placed closer together in vector space, allowing the system to locate related knowledge quickly by similarity. Choosing the right embedding model is therefore one of the most critical steps in building a high-performance RAG system because it directly determines recall quality.
+
+To choose a strong model, it helps to use a systematic benchmark. One of the most widely used is MTEB, the Massive Text Embedding Benchmark.
+
+MTEB provides a unified and objective evaluation framework for many embedding models. Through eight major task categories and 56 datasets, it evaluates performance across retrieval, clustering, classification, reranking, text matching, semantic similarity, and more. A model's overall MTEB score reflects the generality and robustness of its vector representations and can serve as an important reference for model selection. The latest ranking can be checked on the Hugging Face MTEB leaderboard:
+
+[HuggingFace MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
+
+
+
+Although there are many models on the leaderboard, you do not need to master all of them. In practice, choosing the embedding model bundled by a major model provider, or using a cloud-served model that many people have already validated, is usually a safe choice. You can also filter the leaderboard by category or language in the sidebar:
+
+
+
+When filtering embedding models, two parameters matter especially because they directly affect RAG performance: dimension and context length.
+
+Dimension is the dimensionality of the vector output, such as 128, 768, or 1536. It roughly reflects how many semantic features the vector can express. Higher-dimensional vectors can capture richer semantic detail and stronger discrimination. For example, a 768-dimensional vector can represent "apple" from hundreds of angles such as variety, taste, and origin, making it suitable for professional scenarios like healthcare or law that need precise retrieval. Lower dimensions reduce computation and storage cost and improve retrieval speed, making them suitable for large-scale general scenarios with high concurrency and strong real-time requirements.
+
+Context length is the maximum text length the embedding model can process in one pass, measured in tokens. One English token is roughly three quarters of a word, and one Chinese token is roughly one Chinese character. Anything longer than the maximum is truncated. This directly determines whether the model can fully understand the text. If important information is lost because the length is too short, retrieval accuracy drops sharply. For short user queries and short QA pairs, 512 to 1024 tokens is often enough. For longer texts such as papers and reports, you usually need 2048 tokens or more.
+
+Below is a comparison of several common embedding models. In practice, you need to choose by balancing cost and performance. There is no universally best model, only the most suitable model after comparing several options in your own use case.
+
+| Model Name | Model Scale | Core Strength | Suitable Scenarios |
+| :--- | :--- | :--- | :--- |
+| OpenAI `text-embedding-3-large` | Closed API | Long-term leader on MTEB, mature and stable | Cloud API scenarios that prioritize extreme performance and have enough budget |
+| `jina-embeddings-v2` | Supports long text up to 8K context | Strong for long-document retrieval through asynchronous encoding design | Document analysis, legal compliance, academic retrieval |
+| `multilingual-e5-large` | Large scale | Classic multilingual option | Cross-lingual RAG, international products, multilingual support systems |
+| `Qwen/Qwen3-Embedding-8B` | 8B parameters, up to 4096 custom dimensions | Former top multilingual MTEB performer, strong on long text, multilingual tasks, and code | High-precision Chinese-English RAG, long-document analysis, code retrieval |
+| `Qwen/Qwen3-Embedding-4B` | 4B parameters | Strong balance of performance and efficiency | Large-scale production RAG systems |
+| `Qwen/Qwen3-Embedding-0.6B` | 0.6B parameters | Suitable for edge devices | Resource-constrained, speed-first scenarios |
+| `BAAI/bge-m3` | Supports hybrid retrieval, dense plus sparse plus multi-vector | Strong on multilingual benchmarks such as MIRACL | Complex multilingual scenarios that need hybrid retrieval |
+| `BAAI/bge-large-zh-v1.5` | Large scale | Stable Chinese RAG baseline with strong community validation | Pure Chinese projects with shorter documents |
+| ZhipuAI `Embedding-3` | Closed cloud API | Supports custom dimensions from 256 to 2048 | Chinese-focused applications preferring cloud APIs |
+
+### 5.1.2 Rerank Models
+
+In a RAG system, the rerank model is responsible for finely reranking initial retrieval results. It takes the user query and candidate documents as input and computes an exact relevance score for each query-document pair. The higher the score, the better the match. Therefore, adding a rerank model on top of embedding-based recall is a key step for improving retrieval precision.
+
+For embedding models, we can use benchmarks like MTEB. For rerank models, one useful reference is Agentset's reranker leaderboard:
+
+[Reranker Leaderboard](https://agentset.ai/rerankers)
+
+The Agentset benchmark first retrieves the 50 most relevant candidate results from a large document store using FAISS, then asks the rerank model under evaluation to rerank those 50 documents. The benchmark pays attention to both ranking quality and latency. In practical applications, pursuing precision while ignoring speed hurts user experience, while pursuing speed while sacrificing ranking quality harms usefulness.
+
+Agentset also introduces an Elo scoring mechanism. For each query, GPT-5 acts as a judge and compares the ranked outputs of two different rerank models, deciding which one places truly relevant documents in a more sensible order. After large numbers of such pairwise comparisons, models that win more often receive higher Elo scores, providing an intuitive overall performance signal.
+
+The benchmark also uses two complementary groups of metrics:
+
+- `nDCG@5/10`, which focuses on whether relevant documents are placed near the front and therefore reflects ranking precision
+- `Recall@5/10`, which focuses on whether all relevant documents can be found and therefore reflects coverage
+
+Together these metrics provide a more complete picture of rerank performance.
+
+Still, in practice, you do not need to select rerank models only from a leaderboard. Industrial usefulness and leaderboard score are not always the same thing. A practical approach is to start from the rerank models recommended by your cloud vendors or default rerank APIs provided by major model vendors, or to test a model family you are already using, such as a matching Qwen rerank model.
+
+### 5.1.3 LLMs
+
+After semantic retrieval by the embedding model and refined filtering by the rerank model, the relevant document passages are combined with the user's original question into a prompt. The LLM then performs reading comprehension, information integration, and natural-language generation to output a coherent, accurate answer that fits the context.
+
+At the implementation level, there are two main ways to use LLMs in RAG:
+
+1. Privately deployed large models.
+ These are suitable for scenarios that care about data privacy, controllable cost, or deep customization. Mainstream open models such as Qwen, Llama, and GLM perform well in RAG tasks. For example, Qwen2.5 in the 7B or 14B range offers good instruction-following and Chinese understanding while keeping resource use modest, making it suitable for local enterprise deployment. Models such as Kimi, MiniMax, and DeepSeek can also be considered according to specific business needs.
+2. Cloud API large models.
+ These fit scenarios that prioritize fast launch, elastic scaling, and continuous model upgrades. Major providers such as OpenAI, Anthropic, Google, Alibaba, and ZhipuAI all offer stable API services. These models generally have strong language understanding and generation ability and can synthesize answers well in RAG scenarios.
+
+When selecting cloud models, several points matter: whether answer quality is accurate and fluent, whether price is reasonable, whether latency is acceptable, and whether the context window is large enough to hold multiple retrieved documents. In practice, you should compare several candidates on your own data and see which one gives the most complete and accurate answers. If cost is a concern, a useful approach is to combine large and small models: use cheaper small models for simple questions and reserve expensive large models for difficult cases. Since models update quickly, it is also wise to retest candidates periodically.
+
+For broad conversation and QA ability, LMSYS Chatbot Arena, now LMArena, is one of the most widely recognized evaluation references:
+
+[LMSYS Chatbot Arena (LMArena)](https://lmarena.ai/)
+
+It uses blinded pairwise human comparisons to rank models. The ranking offers a useful first filter, but in actual RAG selection it should only be a starting point. In specialized domains such as medicine, law, and finance, general leaderboard ranking can diverge substantially from real performance on your business data.
+
+Best practice for LLM selection is to build a small but representative test set containing 20 to 30 typical business questions and evaluate candidate models through the full end-to-end RAG pipeline rather than looking only at isolated model benchmarks. Questions such as whether to use reasoning models or non-reasoning models, or which model size best balances quality and speed, are all best answered through real testing on your own use case.
+
+## 5.2 Execution Frameworks
+
+In real engineering practice, you usually do not need to build an entire RAG system from zero. A number of mature open-source frameworks already exist, each with its own strengths in architecture, modular integration, and development efficiency. Enterprises can choose according to their own technical reserves and business scenarios.
+
+Common framework types include:
+
+**Low-code or visual platforms**
+
+- [Dify](https://dify.ai): provides an intuitive visual interface for quickly building RAG applications, making it suitable for nontechnical teams or rapid prototype validation. It includes built-in multi-model access, workflow orchestration, and prompt management.
+- [Coze](https://www.coze.cn/): an AI bot development platform from ByteDance that offers zero-code visual construction. It integrates deeply with ByteDance model services, supports a plugin marketplace, scheduled tasks, and multichannel publishing, making it suitable for consumer-facing assistants or internal enterprise bots.
+- [n8n](https://n8n.io/): an open-source node-based workflow automation platform. In RAG scenarios, it can orchestrate complex business logic and connect preprocessing, vector database operations, model calls, and follow-up actions such as email sending or ticket updates into one automated flow.
+- [RAGFlow](https://ragflow.io/): focuses on deep layout analysis and knowledge extraction and performs well on complex documents such as multi-column PDFs and table-heavy materials.
+- [FastGPT](https://fastgpt.io/en): a Chinese open-source solution integrating knowledge-base management, dialogue orchestration, and application publishing, with strong Chinese documentation and suitability for fast deployment of Chinese RAG applications.
+
+**Code frameworks and development libraries**
+
+The tools below usually have implementations in different backend languages. You can choose the corresponding language version for your application stack.
+
+- [LlamaIndex](https://www.llamaindex.ai/): a Python framework designed specifically for RAG, with rich connectors, index structures, and query engines. Its modularity makes it suitable for deeply customized retrieval strategies or integration with many data sources.
+- [LangChain](https://www.langchain.com/): a general LLM application framework where RAG is only one use case. Its strength is its rich ecosystem and component coverage, including support for complex agents and workflow orchestration, though its learning curve is steeper.
+
+If the team's technical reserves are limited and speed matters most, low-code platforms such as Dify, Coze, or FastGPT are good first choices. If you need deep customization, special data-source integration, or detailed performance tuning, LlamaIndex and LangChain offer more flexibility. In practice, a hybrid route is also common: use a low-code platform for rapid feasibility validation, then move to code frameworks for production deployment and optimization. Most of these frameworks also support rapid integration with mainstream embedding, rerank, and LLM models, letting you combine them flexibly using the model-selection principles discussed above.
+
+## 5.3 Effect Evaluation
+
+For enterprises deploying RAG systems, the biggest challenge is often not building the system but tuning it. Production-grade RAG contains two nondeterministic stages, retrieval and generation, so traditional software testing is not enough. That is why building a systematic, repeatable evaluation process for RAG is so important.
+
+### 5.3.1 Beginner Example: LLM-Based RAG Evaluation
+
+To help build an intuitive understanding of RAG evaluation, we can look at a simple automated pipeline based on the idea of LLM-as-a-judge:
+
+https://huggingface.co/learn/cookbook/rag_evaluation
+
+The process usually contains three key steps:
+
+- First, synthesize an evaluation dataset by sampling documents from the knowledge base and asking an LLM to generate high-quality question-answer pairs, then filter them by relevance and groundedness to form a benchmark set.
+- Second, run the RAG system on each question in that test set and collect the generated answers.
+- Third, automate scoring by calling another LLM as a judge, comparing the generated answers with reference answers, and giving quantitative scores for dimensions such as accuracy and completeness.
+
+A simple example:
+
+1. Question generation. Suppose the knowledge base contains a product manual line saying, "This device supports wireless charging and has a 5000mAh battery." We ask one model to act as an exam setter and generate a question such as, "What is the battery capacity of this device?" The standard answer is "5000mAh."
+2. Answering. We send that question to the RAG system, which retrieves related material and answers, for example, "The device has a 5000mAh battery."
+3. Grading. We ask another model to act as the grader by comparing the question, the generated answer, and the reference answer, using a prompt such as, "Judge whether the generated answer is correct. Output only correct or incorrect."
+
+By running this process at scale, we can compute metrics such as accuracy. This forms a practical loop of evaluate, optimize, and reevaluate.
+
+If you want deeper detail on RAG evaluation, including metric definitions, framework usage, and benchmark datasets, two useful survey papers are:
+
+- [https://arxiv.org/pdf/2504.14891](https://arxiv.org/pdf/2504.14891), *Retrieval Augmented Generation Evaluation in the Era of Large Language Models: A Comprehensive Survey*
+- [https://arxiv.org/pdf/2405.07437](https://arxiv.org/pdf/2405.07437), *Evaluation of Retrieval-Augmented Generation: A Survey*
+
+### 5.3.2 Evaluation Metrics
+
+RAG evaluation fundamentally revolves around two questions: can the retrieval module find the right material, and can the generation module produce a high-quality answer from that material? Accordingly, the evaluation system is divided into retrieval evaluation and generation evaluation, supplemented by LLM-as-a-judge scoring.
+
+#### Retrieval Evaluation: recall accuracy and ranking quality
+
+The retrieval module is the first gate in a RAG system. Its evaluation focuses on three dimensions: whether it finds the right things, whether it finds enough of them, and whether it ranks them well.
+
+**Basic recall quality metrics**
+
+The classic basic metrics are Recall@K, Precision@K, and F1:
+
+- **Recall@K** measures the proportion of all relevant documents that appear in the top K results. If five relevant documents exist and three are found in the top 10, Recall@10 is 60 percent. This tells us how broad retrieval coverage is.
+- **Precision@K** measures the proportion of top K results that are truly relevant. If three of the top 10 are relevant and seven are not, Precision@10 is 30 percent. This reflects retrieval accuracy.
+- **F1** is the harmonic mean of Recall and Precision and balances the two.
+
+These metrics are useful for quickly diagnosing baseline recall problems. If Recall is low, relevant documents were not found at all. If Precision is low, retrieval noise is too high.
+
+**Ranking quality metrics**
+
+Finding relevant documents is only the first step. It is even more important to put the most relevant ones near the front. For that we look at MRR, NDCG@K, and MAP:
+
+- **MRR, Mean Reciprocal Rank**, averages, over all test queries, the reciprocal of the rank position of the first relevant document. If the first relevant document for a query appears in position 3, that query's reciprocal rank is 1/3. MRR is especially suitable for scenarios where one correct answer is enough.
+- **NDCG@K, Normalized Discounted Cumulative Gain**, considers both graded relevance and position discount. It not only asks whether a document is relevant, but how relevant it is, and it rewards highly relevant documents that appear early.
+- **MAP, Mean Average Precision**, is sensitive to the positions of all relevant documents and reflects overall ranking quality.
+
+In actual engineering, a common combination is Recall@K plus MRR@K. For example, if Recall@10 is 80 percent but MRR@10 is only 0.3, relevant documents are being found but buried too deep, which suggests reranking needs improvement.
+
+When needed, a Coverage metric can also be added to monitor knowledge-base coverage and reveal systematic blind spots.
+
+#### Generation quality evaluation: accuracy and factual faithfulness
+
+Retrieval provides the raw material. The next question is whether the generation module can produce a high-quality answer from those materials. The core dimensions here are answer accuracy and faithfulness to the retrieved evidence.
+
+**Exact match and text similarity**
+
+The simplest metric is **EM, Exact Match**, which requires the generated answer to match the reference answer exactly. This is suitable for fixed-form, uniquely correct fact questions such as dates or headquarters locations, but it is too strict because different but equally correct surface forms may fail to match.
+
+That is why n-gram-overlap metrics such as **ROUGE**, **BLEU**, and **METEOR** are also commonly used. They score generated answers by comparing word overlap with reference answers. ROUGE-L pays attention to longest common subsequences, BLEU comes from machine translation and emphasizes n-gram precision, and METEOR adds synonym and stemming considerations.
+
+To overcome the limits of pure word overlap, we can also use **BERTScore** or direct vector similarity. These use pretrained semantic representations and therefore tolerate surface variation better.
+
+**Factual faithfulness and hallucination detection**
+
+For RAG systems, answer-reference similarity is not enough. The more important question is whether the answer is actually grounded in the retrieved documents or whether it hallucinates unsupported content.
+
+That is why metrics such as **Hallucination rate** and **Faithfulness** are important. A second LLM can act as a fact checker and inspect the generated answer sentence by sentence, judging whether each claim can be supported by the retrieved documents. For high-stakes domains such as healthcare, law, and finance, this type of metric is especially important, and some enterprises even enforce hallucination thresholds as production release criteria.
+
+#### LLM-as-a-Judge: multi-dimensional scoring
+
+Every automatic metric has limits. Most surface-form metrics cannot fully capture semantic quality or overall usefulness. That is where LLM-as-a-judge becomes especially valuable.
+
+The basic approach is to feed the question, retrieved documents, system answer, and reference answer into a strong independent model, such as GPT-4 or Claude, and ask it to score across dimensions such as:
+
+- question relevance
+- information completeness
+- factual faithfulness
+- overall correctness
+
+The strength of an LLM judge is that it can make a more human-like holistic judgment. Of course, judge prompts still need careful design and calibration against human-labeled examples to keep the scoring consistent and reliable.
+
+#### Building a practical metric combination
+
+With so many metrics available, teams often wonder which ones to use. A practical recommendation is to start with a compact combination and expand gradually:
+
+- For retrieval, begin with Recall@K plus MRR@K
+- For generation, choose one or two baseline metrics from EM, ROUGE-L, and BERTScore according to task type
+- For overall evaluation, introduce an LLM judge focused on relevance, completeness, and faithfulness
+
+Then iterate through a loop of evaluation, problem diagnosis, strategy adjustment, and reevaluation.
+
+### 5.3.3 Evaluation Frameworks
+
+As RAG has developed rapidly, both academia and industry have produced many strong evaluation frameworks. These frameworks not only package common metrics, but also offer standardized datasets, benchmark procedures, and end-to-end workflows.
+
+#### A basic classification of frameworks
+
+We can roughly divide RAG evaluation frameworks into three categories:
+
+- **Research frameworks**, which focus on academic exploration and fine-grained diagnosis. Examples include FiD-Light and Diversity Reranker.
+- **Benchmark frameworks**, which provide standardized test sets and workflows for comparing systems horizontally. These include frameworks such as RAGAS, ARES, RGB, MultiHop-RAG, and CRUD-RAG.
+- **Tooling frameworks**, which emphasize engineering usability and integration with development frameworks. Examples include TruEra RAG Triad, LangChain Benchmarks, and RECALL.
+
+In recent years, evaluation frameworks have become more specialized. For example, medicine has MedRAG, law has LegalBench-RAG, and finance has its own domain-specific frameworks. These domain frameworks often provide not only specialized datasets but also specialized metrics such as medical accuracy or legal citation relevance.
+
+In practice, a good rule of thumb is:
+
+- If you need a baseline quickly, start with a more general framework such as RAGAS.
+- If you are diagnosing a specific problem, choose a more targeted framework.
+- If you are in medicine, law, finance, or another professional domain, prefer domain-adapted frameworks where possible.
+- Prefer actively maintained tools with strong documentation and responsive communities.
+
+Commonly recommended tools in the community include Ragas, Continuous Eval, TruLens-Eval, the evaluation features inside LlamaIndex, Phoenix, DeepEval, LangSmith, and OpenAI Evals.
+
+### 5.3.4 Evaluation Benchmarks
+
+The importance of evaluation benchmarks is often underestimated. Many teams start assessing a RAG system with only a handful of hand-written test questions, then discover that real online performance differs sharply from offline impressions. The root cause is that they lack representative and systematic evaluation data.
+
+A benchmark that supports system iteration well usually has three core characteristics:
+
+- representativeness, meaning it covers high-frequency user questions, boundary cases, and abnormal inputs
+- standardization, meaning question and answer formats, difficulty levels, and scoring rules are consistent
+- evolvability, meaning the benchmark can be updated as system capability and business needs evolve
+
+For most enterprises, because business scenarios are unique, the final answer is usually to build their own evaluation datasets.
+
+- Start by extracting real user questions from business logs and sampling them by type, frequency, and difficulty.
+- For simple cases, let domain experts annotate directly. For more complex questions, let a strong LLM generate candidate answers first, then have experts revise them.
+- Besides the answer itself, label metadata such as related documents, answer type, and difficulty level.
+- Update the dataset periodically with new hard cases discovered online.
+
+If resources are limited and you need a fast baseline, public benchmarks are still a useful starting point. As of 2025, many public benchmarks exist for both general and vertical scenarios:
+
+
+
+When choosing among them, first clarify the goal. Are you establishing a baseline, or validating the system before launch? Then check whether the benchmark covers the scenarios and difficulty profile you care about. For time-sensitive domains such as news or finance, make sure the benchmark includes time-sensitive tests.
+
+In practice, combining your own in-domain dataset with public benchmarks is often the most robust path because it keeps evaluation close to real business needs while also preserving some horizontal comparability.
+
+# 6. Deep Dive: Learning from Competitions and Open Tutorials (Optional)
+
+The principles and baseline implementation above are enough to help you build a usable prototype, but they are still some distance away from solving the harder problems that appear in production. If you want to understand more practical and battle-tested RAG techniques, one of the most efficient ways is to study winning competition solutions and strong open tutorials. These solutions often concentrate the best practices discovered by strong teams after repeated attempts in real scenarios.
+
+The examples below are representative rather than exhaustive. When you meet a specific problem in practice, such as PDF parsing, multimodal retrieval, or low-latency optimization, it is often effective to search for competitions related to that problem and study the technical reports and open code from winning teams.
+
+## 6.1 Semantic Cache: optimizing high-frequency queries
+
+Hugging Face provides a semantic-cache implementation built on top of the Chroma vector database:
+
+[https://huggingface.co/learn/cookbook/semantic_cache_chroma_vector_database](https://huggingface.co/learn/cookbook/semantic_cache_chroma_vector_database)
+
+
+
+Background: Most tutorial RAG systems are built for single-user testing. But once deployed to production, the system may receive dozens or thousands of repeated queries, for example support users repeatedly asking how refunds work. If every repeated query still triggers vector retrieval and an LLM call, latency and cost rise quickly. A semantic cache layer can sharply reduce pressure on the original data sources while preserving answer quality.
+
+This design uses a two-layer retrieval architecture. The base layer stores the original knowledge base in Chroma, using a dataset such as MedQuad as an example and assigning each entry a unique ID for precise reference. The cache layer is built on FAISS using a FlatL2 index. The semantic cache sits between the user query and Chroma, rather than caching the LLM's final answer directly. That design matters because directly caching answers can break personalized answer requirements such as "explain this in simple language."
+
+The cache system uses the `all-mpnet-base-v2` SentenceTransformer to generate query vectors and uses Euclidean distance, with a threshold of 0.35, to judge whether queries are similar. When the cache is full, controlled by the `max_response` parameter, the oldest entry is removed using FIFO. Cache data can also be saved into JSON files for cross-session reuse.
+
+In small-scale testing, a first query such as "How do vaccines work?" took 0.057 seconds when fetched from Chroma, while a similar query served from cache took only 0.016 seconds. In large production scenarios, this approach can produce 90 to 95 percent performance optimization in high-repeat environments and significantly reduce vector-store and API cost.
+
+## 6.2 Unstructured Data Processing: unified parsing for multi-format documents
+
+Another Hugging Face tutorial shows how to use the Unstructured library to build a full pipeline for unstructured document processing:
+
+[https://huggingface.co/learn/cookbook/rag_with_unstructured_data](https://huggingface.co/learn/cookbook/rag_with_unstructured_data)
+
+
+
+Background: In enterprise scenarios, knowledge is often scattered across PDFs, PowerPoint decks, EPUBs, HTML pages, and many other formats. Traditional preprocessing methods either support only one format or lose crucial structural information such as tables and title hierarchy during conversion. That makes it difficult for the RAG system to understand and retrieve the content correctly.
+
+This solution first downloads multi-format test documents, such as a Canadian pesticide handbook PDF containing many tables and a University of Florida citrus IPM PowerPoint file containing charts and multi-level headings. It then uses Unstructured's Local Runner for parsing. The configuration includes a processor config, a partition config that can optionally use API partition mode for stronger OCR, and a local config defining input paths. Parsed documents are converted into JSON containing typed elements such as body text, titles, and tables.
+
+The system then uses `chunk_by_title`, sets a max length of 512 characters, and merges consecutive fragments shorter than 200 characters to preserve semantic coherence. During conversion into LangChain Document objects, complex metadata fields are filtered to fit Chroma. The vector stage uses the `BAAI/bge-base-en-v1.5` embedding model, together with a 4-bit quantized `Llama-3-8B-Instruct` and a LangChain RetrievalQA chain to build a complete RAG system.
+
+The resulting system can handle multi-format documents accurately. For questions such as "Are aphids a pest?" it can extract key facts from the parsed documents and generate answers grounded in the relevant material. This is especially useful for enterprise knowledge bases that need to process many document types.
+
+## 6.3 Enterprise document QA: high-precision and traceable RAG
+
+The winning solution of the Enterprise RAG Challenge shows how to build a production-grade RAG system under strict time and precision requirements:
+
+- [https://abdullin.com/ilya/how-to-build-best-rag/](https://abdullin.com/ilya/how-to-build-best-rag/)
+- [https://hustyichi.github.io/2025/07/03/rag-complete/](https://hustyichi.github.io/2025/07/03/rag-complete/)
+
+Background: Contestants had to parse 100 real enterprise annual-report PDFs in 2.5 hours, each report with up to 1000 pages and containing complex financial tables, multi-column layouts, and charts. After parsing, the system had to answer 100 precise business questions with explicit answer types, such as yes-no, company names, exact numerical indicators, or executive titles, and it had to cite page numbers as evidence.
+
+The winning team chose IBM's open-source Docling as the PDF parser because it performed best on complex tables and multi-column text. They improved the Docling code so it could output JSON and Markdown-plus-HTML with metadata and especially improved table parsing. To accelerate processing, they rented RTX 4090 GPUs and finished the 100-report parse in 40 minutes.
+
+Text chunking used 300-token chunks with 50-token overlap and recursive splitting to preserve semantic coherence. To avoid cross-company contamination, each company had its own FAISS vector store using an `IndexFlatIP` index. Retrieval then followed three stages: retrieve Top-30 chunks by vectors, deduplicate by parent pages because multiple chunks may come from the same page, and rerank pages with GPT-4o-mini. The final ranking score was a weighted combination of the two signals, with a weight of 0.3 on the vector-retrieval score and 0.7 on the LLM reranking score.
+
+Generation used different prompt templates for different answer types. For numeric questions, such as annual revenue, the system used a five-step analysis process to ensure indicator matching, unit consistency, and cross-checking. Outputs were structured to include analysis process and page references for traceability.
+
+The system won two awards and took first place on the leaderboard. An important observation was that even smaller models such as Llama 8B outperformed more than 80 percent of participants, while Llama 3.3 70B came close to GPT-4o-mini, showing that a good system design can successfully balance accuracy, efficiency, and cost.
+
+## 6.4 AIOps scenario: intelligent handling of mixed text-and-image data
+
+The EasyRAG project in an AIOps RAG competition focused on QA for operations scenarios:
+
+[https://blog.csdn.net/hustyichi/article/details/143323746](https://blog.csdn.net/hustyichi/article/details/143323746)
+
+
+
+Background: Operations engineers often need to read technical documents that include not only text but also monitoring charts, system architecture diagrams, and performance curves. For example, when diagnosing a system problem, the answer to "What should I do when CPU utilization exceeds 80 percent?" may be scattered between text descriptions and monitoring graphs. Traditional text-only RAG cannot understand chart trends and values, so answers remain incomplete.
+
+The indexing stage used an improved SentenceSplitter with 1024-token chunks and 200-token overlap. A key innovation was adding metadata such as knowledge-base paths and file paths to each chunk, which improved recall by 2 percent. For image data, the system first used PaddleOCR to extract text from charts and screenshots, then used a multimodal model, GLM-4V-9B, to generate natural-language descriptions of the image, for example describing a CPU usage line peaking at 90 percent in the afternoon. Both the OCR text and image description were then indexed together.
+
+Retrieval used a two-path BM25 plus vector strategy for broad recall. BM25 covered chunk retrieval and path retrieval, helping filter irrelevant documents by file path, while vector retrieval used `gte-Qwen2-7B-instruct`. Reranking used `bge-reranker-v2-minicpm-layerwise`, and a 28-layer setting performed best in experiments.
+
+Answer generation used a two-step strategy: first generate a draft from the Top-6 documents to maximize information coverage, then optimize the answer with the Top-1 most relevant document to emphasize the core answer.
+
+To handle long-text scenarios, such as a complete operations manual with hundreds of pages, the system also implemented BM25-based context compression, splitting documents into sentences, scoring sentence similarity to the query, and concatenating only the most relevant sentences. At 50 percent compression, this method achieved 86.48 percent accuracy in only 7.7 seconds and outperformed tools such as LLMLingua.
+
+## 6.5 Multi-source data fusion: collaboration between structured and unstructured knowledge
+
+The winning solution in the KDD Cup 2024 Meta RAG challenge showed how to integrate unstructured web content and structured knowledge graphs:
+
+- [https://blog.csdn.net/m0_59164520/article/details/143694213](https://blog.csdn.net/m0_59164520/article/details/143694213)
+- https://arxiv.org/pdf/2410.00005
+
+
+
+Background: Task 1 required retrieval summarization from five web pages. Task 2 added a mock API representing a structured knowledge graph, enabling direct access to things like movie databases and entity relationships. Task 3 raised the difficulty by using fifty web pages plus the mock API to answer more complex queries, such as identifying Nolan-directed films with box office greater than 500 million dollars. Every query had to finish within 30 seconds.
+
+For Task 1, the winning team built a refined web-processing pipeline. They used BeautifulSoup to extract page text and ParentDocumentRetriever to manage parent-child chunk relationships, using 200-token child chunks for retrieval and 500 to 2000-token parent chunks for generation. The embedding model was `bge-base-en-v1.5`, the vector store was Chroma, and reranking used `bge-reranker-v2-m3`. The team also supplemented movie and finance data from public datasets and fine-tuned `Llama-3-8B-instruct` with LoRA on training data that included invalid questions and reference answers.
+
+For Tasks 2 and 3, the key innovation was prioritizing the knowledge graph. The system defined standardized API calls such as `get_person` and `get_movie`, with filtering and sorting support. It first called the knowledge graph API and only fell back to web retrieval if the graph results were missing or invalid. This improved both speed and answer accuracy.
+
+Because the system prioritized the knowledge graph and used structured output formats, hallucination was clearly reduced. If the graph could provide a deterministic answer directly, the system returned it without a generative step. If web retrieval was required, the answer had to follow strict citation and stepwise reasoning rules.
+
+The solution won first place in all three tasks. The main lesson is that in enterprise scenarios containing both structured and unstructured data, retrieval strategy should be designed according to data type: use deterministic structured data first and treat unstructured sources as supplements.
+
+Across these practical cases, several shared principles appear repeatedly:
+
+- choose caching, retrieval, and generation strategies according to the business scenario
+- design dedicated parsing and indexing paths for different formats and modalities
+- treat hybrid retrieval plus reranking as a standard configuration
+- use task-specific prompting and structured outputs to improve accuracy and traceability
+
+These lessons from real competitions and open projects are valuable references when building stronger enterprise RAG systems.
+
+# 7. Broad Exploration: The Future Evolution of RAG (Optional)
+
+Once you have learned the practical skills and optimization methods of RAG, you can already improve system performance in concrete scenarios. But understanding only local engineering tricks is not enough if you want a wider grasp of where RAG is heading. We also need to look at broader evolutionary directions.
+
+RAG is now rapidly breaking beyond the traditional retrieve-text-chunks-then-generate pattern. In this section we focus on several of those paths: moving from chunk retrieval to graph-structured retrieval, combining images and audio into multimodal RAG, improving long-document handling through late chunking, which embeds the full document before splitting it into chunks, and the way RAG is gradually evolving into an agent-oriented system.
+
+## 7.1 Graph RAG: reshaping deep retrieval with relationship networks
+
+Related research:
+
+- [https://arxiv.org/pdf/2410.05779](https://arxiv.org/pdf/2410.05779)
+- [https://arxiv.org/pdf/2502.11371](https://arxiv.org/pdf/2502.11371)
+- https://arxiv.org/pdf/2404.16130
+
+
+
+Traditional RAG works by finding text passages similar to the question, which is like picking out the few paragraphs that look most relevant from a pile of material. That works well for direct fact lookup. But if a question requires connecting multiple documents and combining different clues, performance drops.
+
+For example, a doctor might ask, "Based on these cases and the latest treatment guidelines, how should we evaluate the benefits and risks of a certain drug for elderly patients?" Or a project team might ask, "Looking across the past two years of requirements documents, review records, and online issue reports, which part of our system architecture fails most often?" Questions like these are not about finding a single sentence. They require identifying the people, objects, events, and relationships scattered across multiple materials and forming a complete picture.
+
+Graph RAG builds that picture proactively. The system uses a large model to identify key entities from text, such as people, organizations, functional modules, events, and data, together with their relationships, such as causality, dependence, change, and contradiction. It then builds a knowledge network that grows as more material is added. Through automatic grouping, closely related entities and relationships are organized into themes, and each theme can be summarized in advance. When a user asks a question, the system no longer searches only for text passages that look similar. It first finds the most relevant entities and local graph structure, expands through related topic groups, and then gives the analysis path, node descriptions, and source passages together to the LLM for reasoning.
+
+Under this framework, Graph RAG and traditional RAG complement one another. Traditional RAG remains strong for detail questions whose answers can be found in one step. Graph RAG is closer to how a human researcher thinks: first organize the overall structure and themes, then fill in evidence, and finally produce a conclusion with logic and conditions. Existing comparisons show that in multi-hop reasoning tasks, Graph RAG often covers more critical content and provides a broader perspective. Flexible combination of the two approaches is often better than using only one.
+
+## 7.2 Multimodal RAG
+
+Related research:
+
+- https://arxiv.org/pdf/2502.08826
+
+
+
+Real-world data is never only text. Engineers diagnosing server failures need to look at temperature curves, device screenshots, and logs together. Doctors making diagnoses need CT or MRI images, test reports, and electronic medical records at the same time. Traditional text RAG can at best retrieve phrases such as "temperature anomaly" or "suspected lung nodule," but it struggles to connect those descriptions to the actual chart trend or image lesion shape, and it cannot reverse-search documents or knowledge from images, audio, or video.
+
+Multimodal RAG solves this problem of different modalities being unable to "see" one another. Its core is cross-modal semantic alignment. The system uses suitable encoders for images, video, audio, and text, together with OCR, ASR, and layout analysis, extracts key information from visual and audio sources, and maps different modalities into a shared semantic space where a unified multimodal index can be built.
+
+At retrieval and generation time, whether the user asks for a chart showing a sales peak in Q3 2023 or uploads a sketch or operating video, the system first finds the closest multimodal evidence in that unified space, filters it by signals such as text similarity and image similarity, keeps the most useful pieces, and then gives those images, text passages, and tables together to a multimodal LLM. The model can then answer by combining evidence across modalities and ideally indicate the source or highlight relevant areas in the image or document.
+
+Compared with text-only RAG, multimodal RAG can use more kinds of evidence and often reduces hallucination while producing more complete and more verifiable answers.
+
+## 7.3 Late Chunking: preserving full context for long documents
+
+Related introduction:
+
+- https://jina.ai/news/late-chunking-in-long-context-embedding-models/
+
+
+
+Imagine reading a Wikipedia article about Berlin. Traditional RAG would first cut it into independent paragraphs and then embed each chunk. If the first sentence says "Berlin is the capital of Germany," later phrases such as "the city" or "its population" lose their connection to Berlin once separated. A query such as "What is the population of Berlin?" may then fail because the term Berlin and the population information never appeared inside the same chunk. This problem becomes even worse for long documents. In a 200-page insurance contract, the definition of a deductible may appear on page 5 while the conditions under which it applies appear on page 30. Fixed-length chunking can split these related pieces into dozens of isolated chunks, and experiments show that semantic similarity can collapse sharply when that happens.
+
+Late Chunking overturns the traditional chunk-first-then-embed pipeline and instead follows embed-first-then-chunk. With long-context embedding models that can handle something like 8192 tokens, the whole document is first passed through the Transformer, producing token-level embeddings that have already seen the full document. Only afterward are those globally informed token embeddings pooled into chunk embeddings according to chunk boundaries. The resulting chunks are no longer independent islands. They are context-dependent embeddings that preserve cross-paragraph references and conceptual relationships.
+
+On BEIR benchmark datasets, Late Chunking outperforms traditional chunking broadly, with especially strong gains on longer documents. In short-text scenarios, the difference largely disappears, which confirms a key rule: the longer the document, the bigger the advantage of Late Chunking. The method is now integrated into Jina Embeddings v3. Although encoding a whole long document first can increase inference time by 10 to 20 percent, the retrieval gains in scenarios such as medical records, legal documents, and technical manuals can easily justify that cost.
+
+Late Chunking shows that 8K-plus long-context embedding models are not overengineering in these scenarios. They are often necessary for producing high-quality chunk embeddings and represent a shift from chunk first, then embed, to embed first, then chunk.
+
+## 7.4 From RAG to RAG in the Agent Era
+
+Related discussions:
+
+- [https://ragflow.io/blog/rag-at-the-crossroads-mid-2025-reflections-on-ai-evolution](https://ragflow.io/blog/rag-at-the-crossroads-mid-2025-reflections-on-ai-evolution)
+- [https://arxiv.org/pdf/2501.09136](https://arxiv.org/pdf/2501.09136)
+- [https://www.letta.com/blog/rag-vs-agent-memory](https://www.letta.com/blog/rag-vs-agent-memory)
+- [https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/](https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/)
+- https://www.llamaindex.ai/blog/rag-is-dead-long-live-agentic-retrieval
+
+RAG has developed from a standalone retrieve-and-answer tool into a key part of an agent's cognitive architecture. Traditional RAG is built on a simple ask-retrieve-answer pattern and is fundamentally passive: it waits for a query and does not act proactively. To break through that passivity and handle more complex cognitive tasks, RAG has been deeply combined with agent capabilities, giving rise to a new paradigm: Agentic RAG.
+
+Under this paradigm, the role of RAG changes fundamentally. It is no longer only a passive provider of external knowledge. Instead, it becomes the core processing unit that supports intelligent behavior under the agent's active planning, goal direction, and self-reflection. This fusion gives the overall system goal orientation, iterative optimization, and autonomous decision-making, greatly deepening the quality of human-AI interaction. Agentic RAG can understand complex tasks, decompose them, plan retrieval strategies, and evaluate the quality of initial results to decide whether deeper exploration is needed.
+
+
+
+The key to this capability is a multi-layered active loop. Faced with a complex query, the agent first analyzes the nature of the problem, breaks it into subproblems, and designs precise retrieval strategies for each subproblem. After receiving initial results, it evaluates them, judges whether the information is complete and relevant, identifies knowledge gaps, and dynamically generates more precise new queries. This iterative process often includes multi-hop retrieval, where one round of results reveals new directions for the next round, producing a knowledge exploration chain similar to how a human researcher works.
+
+To support this ongoing, iterative intelligent behavior, especially when personalization and long-term knowledge accumulation matter, short-term conversation context alone is far from enough. This leads to the need for long-term, structured memory.
+
+That is exactly why RAG is increasingly assigned the role of an agent's long-term memory system and used to build a full external memory architecture. This long-term memory complements short-term memory, which is responsible for maintaining the current dialogue context. The long-term memory system relies on three key mechanisms:
+
+1. Structured indexing ability:
+ This allows the agent to build multi-dimensional indexes over huge amounts of unstructured data, by time, topic, entity relations, and more, supporting efficient retrieval from multiple angles much like humans recall information through different clues.
+2. Intelligent forgetting:
+ Through value-evaluation algorithms, the system can decay or selectively discard low-frequency, weakly related, or outdated information, keeping the memory system lean and efficient and preventing overload.
+3. Knowledge consolidation:
+ The system refines scattered dialogue and interaction experience into structured knowledge. Through entity recognition, relation extraction, and semantic clustering, fragmented information is connected into knowledge graphs, turning short-term experience into long-term knowledge.
+
+This external memory system built on RAG not only expands an agent's cognitive boundary significantly, but also gives it the ability to continue learning and evolving its knowledge. It allows the agent to accumulate experience over long-term interaction, form personalized operating patterns and domain knowledge systems, and support more complex and longer-running tasks.
+
+# Summary
+
+Retrieval-Augmented Generation is not only a technical method for compensating for hallucination and knowledge staleness in large models. It is also a key bridge for turning general AI capability into deep enterprise value. The evolution from Naive RAG to modular and agentic forms shows that every part of RAG needs to deepen continuously, including finer data handling, more scientific model selection across embedding, rerank, and LLM stages, and more systematic evaluation. All of these are necessary steps toward building enterprise knowledge systems that are controllable, trustworthy, and efficient. At the same time, drawing lessons from competitions and engineering case studies is one of the best ways to deepen understanding of the technical details.
+
+As Graph RAG, multimodal understanding, and Late Chunking continue to develop and combine, RAG is steadily pushing beyond the old retrieval-and-generation boundary and moving toward deeper semantic association and more sustainable memory capability. The hope is that this survey-style article helps you build a full-chain methodology, from principle to practice and from evaluation to evolution, so that in a fast-moving technical landscape you can build high-quality intelligent applications that truly land in the real world and can handle complex business challenges.
+
+# Reference
+
+[1] Ask in Any Modality: A Comprehensive Survey on Multimodal Retrieval-Augmented Generation.
+
+https://arxiv.org/pdf/2502.08826
+
+[2] Retrieving Multimodal Information for Augmented Generation: A Survey.
+
+https://arxiv.org/pdf/2303.10868
+
+[3] A Survey on RAG Meeting LLMs: Towards Retrieval-Augmented Large Language Models.
+
+https://arxiv.org/pdf/2405.06211
+
+[4] Retrieval-Augmented Generation for Large Language Models: A Survey.
+
+https://arxiv.org/pdf/2312.10997
+
+[5] LightRAG: Simple and Fast Retrieval-Augmented Generation.
+
+https://arxiv.org/pdf/2410.05779
+
+[6] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG.
+
+https://arxiv.org/pdf/2501.09136
+
+[7] ERAGent: Enhancing Retrieval-Augmented Language Models with Improved Accuracy, Efficiency, and Personalization.
+
+https://arxiv.org/pdf/2405.06683
+
+[8] Graph Retrieval-Augmented Generation: A Survey.
+
+https://www.arxiv.org/pdf/2408.08921
+
+[9] Evaluation of Retrieval-Augmented Generation: A Survey.
+
+https://arxiv.org/pdf/2405.07437
+
+[10] Retrieval Augmented Generation Evaluation in the Era of Large Language Models: A Comprehensive Survey.
+
+https://arxiv.org/pdf/2504.14891
+
+[11] From Local to Global: A Graph RAG Approach to Query-Focused Summarization.
+
+https://arxiv.org/pdf/2404.16130
+
+[12] RAG vs. GraphRAG: A Systematic Evaluation and Key Insights.
+
+https://arxiv.org/pdf/2502.11371
+
+[13] Introduction to RAG | LlamaIndex Python Documentation.
+
+https://developers.llamaindex.ai/python/framework/understanding/rag/
+
+[14] All-in-RAG | A Full-Stack Guide to RAG in Large-Model Application Development.
+
+https://datawhalechina.github.io/all-in-rag/#/en/
+
+[15] Ilya Rice: How I Won the Enterprise RAG Challenge.
+
+https://abdullin.com/ilya/how-to-build-best-rag/
+
+[16] RAG Research Table - Awesome Generative AI Guide (GitHub).
+
+https://github.com/aishwaryanr/awesome-generative-ai-guide/blob/main/research_updates/rag_research_table.md
+
+[17] RAG is dead, long live agentic retrieval.
+
+https://www.llamaindex.ai/blog/rag-is-dead-long-live-agentic-retrieval
+
+[18] LLM/RAG Zoomcamp extra lesson 5: Common evaluation methods and market preferences in RAG evolution.
+
+https://vip.studycamp.tw/t/llmrag-zoomcamp-%E8%AA%B2%E5%A4%96%E8%A3%9C%E5%85%85-5%EF%BC%9Arag-evolution-%E5%B8%B8%E8%A6%8B%E8%A9%95%E4%BC%B0%E6%96%B9%E6%B3%95%E5%92%8C%E5%B8%82%E5%A0%B4%E5%81%8F%E5%A5%BD/8185
+
+[19] How to Evaluate Retrieval Augmented Generation (RAG) Applications.
+
+https://zilliz.com.cn/blog/how-to-evaluate-rag-zilliz
+
+[20] RAG is not Agent Memory.
+
+https://www.letta.com/blog/rag-vs-agent-memory
+
+[21] Richmond Alake. LinkedIn post on #100DaysOfAgentMemory, RAG and MemoRizz.
+
+https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/
diff --git a/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md b/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md
new file mode 100644
index 0000000..cfe84fc
--- /dev/null
+++ b/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md
@@ -0,0 +1,3 @@
+# Intermediate and Advanced RAG with Workflow Orchestration - Using LangGraph as an Example
+
+> This chapter is currently being written. Stay tuned...
diff --git a/docs/en/stage-3/core-skills/agent-teams/index.md b/docs/en/stage-3/core-skills/agent-teams/index.md
new file mode 100644
index 0000000..47a83b6
--- /dev/null
+++ b/docs/en/stage-3/core-skills/agent-teams/index.md
@@ -0,0 +1,2761 @@
+# Claude Agent Teams Complete Guide
+
+## Introduction to Agent Teams
+
+**Agent Teams** is a revolutionary feature in Claude Code that allows **multiple independent AI instances to collaborate like a real development team**.
+
+Until now, using Claude Code was like being a project manager working with one exceptionally capable assistant: no matter how complex the task was, that one assistant did all the work. With Agent Teams, you can assemble a full AI development team: one member handles the frontend, another the backend, and a third testing, and they can **work at the same time, communicate with each other, and collaborate to complete complex tasks**.
+
+### From a single assistant to team collaboration
+
+Before diving into Agent Teams, let's first understand the problem it solves.
+
+**Limitations of the single-AI mode**:
+
+When you use a single Claude instance to handle a complex project, you will run into these bottlenecks:
+
+- **Serial processing bottleneck**: A single AI instance can only do one thing at a time. For example, when refactoring a project, it may need to analyze the authentication module first, then the database module, and finally the API module. These steps must be done sequentially, even if they do not depend on each other.
+
+- **Context crowding problem**: All information lives in a single conversation window. As the conversation gets longer, important early details can get buried, and the AI may forget key decisions discussed earlier.
+
+- **Single-perspective limitation**: Only one AI is thinking, so there is no multi-angle discussion or validation. When complex design decisions appear, there is no "teammate" to debate with or provide a different perspective.
+
+- **Efficiency ceiling**: Large refactors or multi-module development take a long time, and there is no way to speed them up through parallelism.
+
+**The Agent Teams solution**:
+
+Agent Teams solves these problems through **parallel collaboration across multiple instances**:
+
+- **True parallel work**: Multiple AIs can work on different tasks simultaneously. One can handle the frontend UI, another the backend API, and another the database design, without interfering with each other.
+
+- **Independent context spaces**: Every team member has its own full 200K token context window, so important information is not "forgotten" because the conversation gets too long.
+
+- **Team collaboration capability**: Members can communicate directly, discuss design decisions, and validate code quality with each other, just like a real development team.
+
+- **A significant efficiency increase**: According to Anthropic's internal testing, efficiency on large-scale project refactors can improve by around 50%.
+
+---
+
+## Agent Teams vs Subagent
+
+Before going deeper into the architecture of Agent Teams, we should first clear up a common point of confusion: **what is the difference between Agent Teams and Subagent**?
+
+Both features involve "multiple AIs collaborating," but their collaboration models are completely different and suitable for different scenarios.
+
+### Core differences at a glance
+
+| Dimension | Subagent | Agent Teams |
+|---------|-------------------|----------------------|
+| **Topology** | Star topology: all subagents report to the main agent | Mesh topology: members can communicate with each other |
+| **Communication style** | The main agent explicitly passes information via prompts, and subagents return results when done | Members can communicate, discuss, and coordinate directly |
+| **Context management** | Every subagent has an independent context, and the main agent passes only the necessary information | Every member has a fully independent context |
+| **Parallelism** | Can run in parallel, but the collaboration chain still centers on the main agent | True parallel development and collaboration |
+| **Task coordination** | The main agent dispatches and coordinates everything centrally | Members can take ownership of tasks more autonomously |
+| **Cost** | Significant: token usage adds up when multiple subagents run in parallel | Higher still: members run independently and communicate more frequently |
+
+### An intuitive analogy
+
+**Subagent is like**: a manager writing separate task slips for several assistants. Each assistant works independently based on its own task slip, and when finished, only returns the result to the manager. The assistants do not communicate directly, and the manager does not see the assistants' full thought process while they work.
+
+```
+You → Main Agent → Subagent A: "Analyze this file"
+You → Main Agent → Subagent B: "Search for that function"
+ ↓
+ Subagent A completes → reports result to Main Agent
+ Subagent B completes → reports result to Main Agent
+ ↓
+ Main Agent synthesizes the results → reports back to you
+```
+
+**Agent Teams is like**: a project manager leading a real development team. Team members can communicate, discuss, and collaborate directly, rather than routing every detail through the project manager.
+
+```
+You → Team Lead: "Build a user authentication feature"
+ ↓
+ Team Lead creates the team and assigns tasks
+ ↓
+ Teammate A: "@Teammate B, is the API interface design ready?"
+ Teammate B: "Yes, here's the format..."
+ Teammate C: "I reviewed the interface and found something we should discuss..."
+ ↓
+ Team members collaborate to finish the work → Team Lead synthesizes the result → reports back to you
+```
+
+### When to use which one
+
+**Use Subagent when**:
+
+- You have a quick, clear, single task, such as "search for this error code"
+- Tasks do not depend much on each other
+- You want parallel execution, but do not need sustained discussion between members
+
+**Use Agent Teams when**:
+
+- You are doing a complex system refactor that spans multiple modules
+- You need multi-angle analysis and discussion, such as a security expert and a performance expert debating a solution
+- You need true parallel development, with frontend, backend, and testing happening at the same time
+- Tasks require frequent coordination and information sharing
+
+### A simple summary
+
+- **Subagent**: a task distribution tool that breaks a big task into smaller tasks and dispatches them to different "workers"
+- **Agent Teams**: a real collaborative team where members can communicate, discuss, and work together like a real team
+
+---
+
+## Core architecture
+
+Agent Teams is not just a simple "open multiple instances" feature. It is a complete **multi-agent collaboration system**. To understand it, we need to understand its core components and how they work together.
+
+### Team composition
+
+An Agent Team consists of four core components, each with its own responsibility, working together to complete complex tasks.
+
+**Team Lead**
+
+The Team Lead is the "brain" and "coordinator" of the entire team. It does not directly execute coding tasks. Instead, it is responsible for:
+
+- **Requirement analysis and task decomposition**: breaking the user's complex requirements into multiple subtasks that can run in parallel
+- **Team creation and management**: deciding how many members are needed and what each member should do
+- **Task assignment and scheduling**: assigning tasks to the right members and managing task dependencies
+- **Result synthesis and quality control**: collecting each member's work, integrating it, and doing the final review
+
+**Teammates**
+
+Teammates are the actual "developers" doing the work. Every Teammate is an independent Claude instance:
+
+- **Independent context window**: each member has a full 200K token context window, completely isolated from the Team Lead and the other members
+- **Full tool permissions**: they can use all tools such as Read, Write, Edit, and Bash
+- **Autonomous task pickup**: they can independently select and claim tasks from the shared task board
+- **Direct communication ability**: they can communicate directly with other members instead of always going through the Team Lead
+
+**TaskList**
+
+TaskList is the team's "project management tool," similar to Jira or Trello:
+
+- **Task status management**: every task has a clear status: `pending`, `in_progress`, or `completed`
+- **Dependency management**: tasks can define dependencies, and dependent tasks can only start after prerequisite tasks finish
+- **Automatic unlock mechanism**: when one task is completed, the system automatically checks and unlocks tasks waiting on it
+- **File lock mechanism**: when a member claims and starts a task, a lock file is created in the task directory to prevent multiple members from editing the same file at the same time
+
+**Messaging System**
+
+The messaging system is the "chat tool" between team members:
+
+- **Point-to-point communication**: member A can send a message directly to member B
+- **Broadcast announcements**: a message can be sent to all members at once
+- **File-system based**: messages are stored as JSON files in `~/.claude/teams/{team-name}/inboxes/`
+- **No network required**: everything works entirely through the local file system, with no network connection or port listening needed
+
+### Collaboration flow
+
+A typical Agent Teams workflow looks like this:
+
+```
+The user submits a complex requirement
+ ↓
+Team Lead analyzes the requirement and breaks it into tasks
+ ↓
+Creates team members and initializes TaskList
+ ↓
+ ├─→ Teammate A claims Task 1 ─┐
+ ├─→ Teammate B claims Task 2 ─┼→ Run in parallel
+ ├─→ Teammate C claims Task 3 ─┤
+ │ ↓
+ └──────────────────────────── Members coordinate through the messaging system
+ ↓
+ Once all tasks are complete, Team Lead synthesizes the result
+ ↓
+ Final output is delivered to the user
+```
+
+### File system layout
+
+Agent Teams creates dedicated directories on your local file system to manage team state:
+
+```
+~/.claude/
+├── teams/
+│ └── {team-name}/
+│ ├── config.json # Team config (member list, model selection, etc.)
+│ └── inboxes/
+│ ├── team-lead.json # Team Lead inbox
+│ ├── teammate-1.json # Member 1 inbox
+│ └── teammate-2.json # Member 2 inbox
+└── tasks/
+ └── {team-name}/
+ ├── task-1.json # Detailed info for Task 1
+ ├── task-2.json # Detailed info for Task 2
+ └── current_tasks/
+ └── parse_if_statement.txt # Lock file created while a task is running
+```
+
+The advantage of this design is **complete transparency**: you can inspect team status, task progress, and the communication history between members at any time.
+
+---
+
+## Quick start
+
+### Enable the experimental feature
+
+Agent Teams is currently an **experimental feature** and is disabled by default. To use it, you need to enable it first.
+
+**The easiest way: let Claude Code enable it for you**
+
+Type this directly in Claude Code:
+
+```
+Help me enable Agent Teams in settings.json
+```
+
+Or:
+
+```
+Enable the experimental feature agentTeams
+```
+
+Claude Code will automatically modify `~/.claude/settings.json` and add the following configuration:
+
+```json
+{
+ "experimental": {
+ "agentTeams": true
+ }
+}
+```
+
+**Restart Claude Code**
+
+After the configuration is added, **fully quit and restart Claude Code**, and the feature will take effect.
+
+**Manual configuration (if the automatic method does not work)**:
+
+You can manually edit `~/.claude/settings.json` and add or modify:
+
+```json
+{
+ "experimental": {
+ "agentTeams": true
+ }
+}
+```
+
+**How to verify it is enabled**
+
+After restarting Claude Code, try a conversation like this:
+
+```
+You: Can you help me create an Agent Team?
+
+Claude: Yes! I can help you create an Agent Team to collaborate on a task...
+```
+
+If Claude understands and responds to the request to create a team, the feature has been enabled successfully.
+
+### Visual mode configuration (optional)
+
+If you want to see team members' work in real time, you can configure **split-pane display mode**.
+
+**Let Claude Code configure it for you**:
+
+Type this directly in Claude Code:
+
+```
+Help me enable split-pane display mode for Agent Teams in settings.json, using tmux
+```
+
+Or:
+
+```
+Configure agent-teams to use split-panes mode
+```
+
+**Install tmux (if you do not have it)**:
+
+If `tmux` is not installed yet, you can ask Claude Code to install it:
+
+```
+Help me install tmux
+```
+
+Claude Code will automatically run the appropriate installation command based on your operating system, whether macOS or Linux.
+
+**What the configured result looks like**:
+
+After configuration, team members will work in different tmux panes, and you will be able to see all their output at the same time, like a "monitoring wall."
+
+```
+┌─────────────────┬─────────────────┬─────────────────┐
+│ Teammate 1 │ Teammate 2 │ Teammate 3 │
+│ Analyzing code │ Building API │ Writing tests │
+│ ... │ ... │ ... │
+│ │ │ │
+└─────────────────┴─────────────────┴─────────────────┘
+```
+
+**Manual configuration (if the automatic method does not work)**:
+
+You can manually edit `~/.claude/settings.json`:
+
+```json
+{
+ "experimental": {
+ "agentTeams": true
+ },
+ "agent-teams": {
+ "displayMode": "split-panes",
+ "terminalMultiplexer": "tmux"
+ }
+}
+```
+
+---
+
+### Hands-on example: build a Pokemon-style RPG game with Agent Teams
+
+Let's experience the power of Agent Teams through a full project. This example will show how multiple AI team members can collaborate to build an RPG game from scratch, including a battle system, dialogue features, and exploration elements.
+
+**Project requirements**:
+
+Build a Pokemon-style web RPG with the following features:
+
+- **Character system**: the player can create a character with level, HP, attack, defense, and other stats
+- **Battle system**: turn-based combat with attack, skills, items, and flee options
+- **Monster system**: multiple wild monsters with different attributes and skills
+- **Dialogue system**: NPC conversations and side quests
+- **Map exploration**: a simple 2D map where the player can move between scenes
+- **Save system**: save game progress, including level, position, completed quests, and more
+- **Sound effects and animation**: visual effects and sound effects for attacks, damage, and leveling up
+
+**Type this in Claude Code**:
+
+```
+I want to build a Pokemon-style web RPG game.
+
+Create a team to collaborate on development:
+
+Team member responsibilities:
+- Teammate A (Game Architect): design the overall architecture, define the game state machine, and plan the data structures
+- Teammate B (Battle System): implement turn-based combat logic, the skill system, and damage calculation
+- Teammate C (Dialogue System): implement NPC dialogue, the quest system, and story scripts
+- Teammate D (Map Rendering): use Canvas to implement 2D map rendering, character movement, and scene switching
+- Teammate E (UI & Audio): design the game interface, battle UI, and sound playback
+
+Technical requirements:
+- Use React + TypeScript + Vite
+- Use Tailwind CSS for styling
+- Use Framer Motion for animation
+- Use Zustand for state management
+- Turn-based battle system
+- Save data with localStorage
+- Use the Web Audio API for sound
+
+Use Sonnet for each member, and Opus for the Team Lead.
+
+First ask the architect to design the overall solution. After the data structures are defined, let the other members develop in parallel.
+```
+
+**How Claude will execute it**:
+
+**Phase 1: architecture design**
+
+Team Lead will first ask Teammate A, the game architect, to design the overall structure:
+
+```
+Teammate A is designing the game architecture...
+
+Tech stack: React 18 + TypeScript + Vite + Tailwind CSS + Framer Motion + Zustand
+
+Project initialization:
+npm create vite@latest pokemon-rpg -- --template react-ts
+npm install zustand framer-motion
+npm install -D tailwindcss postcss autoprefixer
+npx tailwindcss init -p
+
+File structure:
+src/
+├── App.tsx # Main app component
+├── main.tsx # Entry file
+├── types/
+│ ├── player.ts # Player type definitions
+│ ├── monster.ts # Monster type definitions
+│ ├── skill.ts # Skill type definitions
+│ └── game.ts # Game state types
+├── stores/
+│ └── gameStore.ts # Zustand state management
+├── components/
+│ ├── map/ # Map-related components
+│ │ ├── MapCanvas.tsx # Map Canvas
+│ │ ├── PlayerSprite.tsx # Player sprite
+│ │ └── NPCSprite.tsx # NPC sprite
+│ ├── battle/ # Battle-related components
+│ │ ├── BattleScreen.tsx # Battle screen
+│ │ ├── BattleMenu.tsx # Battle menu
+│ │ ├── HPBar.tsx # HP bar component
+│ │ └── BattleLog.tsx # Battle log
+│ ├── dialog/ # Dialogue-related components
+│ │ ├── DialogBox.tsx # Dialogue box
+│ │ └── Choices.tsx # Choices component
+│ └── ui/ # Shared UI components
+│ ├── Button.tsx
+│ ├── ProgressBar.tsx
+│ └── StatCard.tsx
+├── systems/
+│ ├── BattleSystem.ts # Battle system logic
+│ ├── DialogSystem.ts # Dialogue system logic
+│ ├── QuestSystem.ts # Quest system logic
+│ └── MapSystem.ts # Map system logic
+├── hooks/
+│ ├── useBattle.ts # Battle hook
+│ ├── useDialog.ts # Dialogue hook
+│ └── useAudio.ts # Audio hook
+├── data/
+│ ├── monsters.ts # Monster data
+│ ├── skills.ts # Skill data
+│ ├── dialogues.ts # Dialogue scripts
+│ └── maps.ts # Map data
+└── utils/
+ ├── audio.ts # Audio utilities
+ └── storage.ts # Save utilities
+
+Core data structure design:
+
+// types/player.ts - Player type
+interface Player {
+ id: string;
+ name: string;
+ level: number;
+ exp: number;
+ maxExp: number;
+ hp: number;
+ maxHp: number;
+ attack: number;
+ defense: number;
+ skills: Skill[];
+ position: { x: number; y: number };
+ direction: 'UP' | 'DOWN' | 'LEFT' | 'RIGHT';
+ gold: number;
+ items: string[];
+}
+
+// types/skill.ts - Skill type
+interface Skill {
+ id: string,
+ name: string,
+ type: 'attack' | 'heal' | 'buff' | 'special',
+ power: number,
+ accuracy: number,
+ pp: number,
+ maxPp: number,
+ effect?: string
+}
+
+// Game state
+{
+ screen: 'map' | 'battle' | 'dialog' | 'menu',
+ player: Player,
+ currentMap: string,
+ position: {x: number, y: number},
+ inBattle: boolean,
+ dialogQueue: Array