diff --git a/docs/.vitepress/config.mjs b/docs/.vitepress/config.mjs index 22a45fa..25fdc64 100644 --- a/docs/.vitepress/config.mjs +++ b/docs/.vitepress/config.mjs @@ -411,20 +411,36 @@ const stage2SidebarEn = [ collapsed: false, items: [ { - text: 'Using Lovart for Assets', - link: '/zh-cn/stage-2/frontend/2.0-lovart-assets/' + text: 'Build Your Own Asset-Production Agent with Lovart', + link: '/en/stage-2/frontend/2.0-lovart-assets/' }, { text: 'Figma & MasterGo Basics', - link: '/zh-cn/stage-2/frontend/2.1-figma-mastergo/' + link: '/en/stage-2/frontend/2.1-figma-mastergo/' }, { - text: 'UI Design for Modern Apps', - link: '/zh-cn/stage-2/frontend/2.2-ui-design/' + text: 'Build Your First Modern App - UI Design', + link: '/en/stage-2/frontend/2.2-ui-design/' }, { - text: 'UI Design Specs & Multi-Product', - link: '/zh-cn/stage-2/frontend/2.3-multi-product-ui/' + text: 'UI Design Guidelines and Multi-Product UI', + link: '/en/stage-2/frontend/2.3-multi-product-ui/' + }, + { + text: 'Make Interfaces Beautiful with LLMs and Skills', + link: '/en/stage-2/frontend/2.4-llm-skills-beautiful/' + }, + { + text: "Let's Build Hogwarts Portraits", + link: '/en/stage-2/frontend/2.5-hogwarts-portraits/' + }, + { + text: 'From Design Prototype to Project Code', + link: '/en/stage-2/frontend/2.6-design-to-code/' + }, + { + text: 'Upgrade Your UI with Modern Component Libraries', + link: '/en/stage-2/frontend/2.7-modern-component-library/' } ] }, @@ -433,32 +449,28 @@ const stage2SidebarEn = [ collapsed: false, items: [ { - text: 'Backend 1: What is API', - link: '/zh-cn/stage-2/backend/2.1-what-is-api/' + text: 'Learn Git and GitHub', + link: '/en/stage-2/backend/2.4-git-workflow/' }, { - text: 'Backend 2: Database & Supabase', - link: '/zh-cn/stage-2/backend/2.2-database-supabase/' + text: 'From Database to Supabase', + link: '/en/stage-2/backend/2.2-database-supabase/' }, { - text: 'Backend 3: AI-Assisted API Code', - link: '/zh-cn/stage-2/backend/2.3-ai-interface-code/' + 
text: 'Backend API Design and Development', + link: '/en/stage-2/backend/2.3-ai-interface-code/' }, { - text: 'Backend 4: Git Workflow', - link: '/zh-cn/stage-2/backend/2.4-git-workflow/' + text: 'Ship Your Product Prototype', + link: '/en/stage-2/backend/2.5-zeabur-deployment/' }, { - text: 'Backend 5: Deployment', - link: '/zh-cn/stage-2/backend/2.5-zeabur-deployment/' + text: 'From IDEs to CLI AI Coding Tools', + link: '/en/stage-2/backend/2.6-modern-cli/' }, { - text: 'Backend 6: Modern CLI Tools', - link: '/zh-cn/stage-2/backend/2.6-modern-cli/' - }, - { - text: 'Backend 7: Stripe Payment', - link: '/zh-cn/stage-2/backend/2.7-stripe-payment/' + text: 'Integrate Stripe and Other Billing Systems', + link: '/en/stage-2/backend/2.7-stripe-payment/' } ] }, @@ -468,7 +480,7 @@ const stage2SidebarEn = [ items: [ { text: 'AI 1: Dify & Knowledge Base', - link: '/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/' + link: '/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/' } ] }, @@ -478,15 +490,15 @@ const stage2SidebarEn = [ items: [ { text: 'Hogwarts Portraits Project', - link: '/zh-cn/stage-2/frontend/2.4-hogwarts-portraits/' + link: '/en/stage-2/frontend/2.5-hogwarts-portraits/' }, { text: 'Assignment 1: Full-Stack App', - link: '/zh-cn/stage-2/assignments/2.1-fullstack-app/' + link: '/en/stage-2/assignments/2.1-fullstack-app/' }, { text: 'Assignment 2: UI Library & Trae', - link: '/zh-cn/stage-2/assignments/2.2-modern-frontend-trae/' + link: '/en/stage-2/assignments/2.2-modern-frontend-trae/' } ] } @@ -494,76 +506,102 @@ const stage2SidebarEn = [ const stage3SidebarEn = [ { - text: 'Claude Code In-Depth', + text: 'Core Skills', collapsed: false, items: [ { - text: 'Claude Code 快速上手核心指南', - link: '/zh-cn/stage-3/core-skills/basics/' + text: 'Claude Code Quickstart Core Guide', + link: '/en/stage-3/core-skills/basics/' }, { - text: 'Claude Code MCP 完全指南', - link: '/zh-cn/stage-3/core-skills/mcp/' + text: 'MCP and Claude Code Complete Guide', + link: 
'/en/stage-3/core-skills/mcp/' }, { - text: 'Claude Code Skills 完全指南', - link: '/zh-cn/stage-3/core-skills/skills/' + text: 'Claude Code Skills Complete Guide', + link: '/en/stage-3/core-skills/skills/' }, { - text: 'Long Running Tasks', - link: '/zh-cn/stage-3/core-skills/long-running-tasks/' + text: 'Making Claude Code Work on Long-Running Tasks', + link: '/en/stage-3/core-skills/long-running-tasks/' }, { - text: 'Claude Agent Teams 完全指南', - link: '/zh-cn/stage-3/core-skills/agent-teams/' + text: 'Claude Agent Teams Complete Guide', + link: '/en/stage-3/core-skills/agent-teams/' }, { - text: 'Claude Code Superpowers 工程级开发', - link: '/zh-cn/stage-3/core-skills/superpowers/' + text: 'Claude Code Superpowers for Production-Grade Development', + link: '/en/stage-3/core-skills/superpowers/' }, { - text: 'Claude Code 工作流最佳实践', - link: '/zh-cn/stage-3/core-skills/workflow/' + text: 'AI-Assisted Development Workflow', + link: '/en/stage-3/core-skills/workflow/' }, { - text: 'Claude Code 手机远程开发', - link: '/zh-cn/stage-3/core-skills/mobile-development/' + text: 'Claude Code Remote Development on Mobile', + link: '/en/stage-3/core-skills/mobile-development/' }, { - text: 'Claude Agent SDK 完全指南', - link: '/zh-cn/stage-3/core-skills/claude-agent-sdk/', - items: [ - { - text: 'Spec Coding:规范驱动开发', - link: '/zh-cn/stage-3/core-skills/claude-agent-sdk/spec-coding' - } - ] + text: 'Claude Agent SDK Complete Guide', + link: '/en/stage-3/core-skills/claude-agent-sdk/' + }, + { + text: 'From Vibe Coding to Spec Coding', + link: '/en/stage-3/core-skills/spec-coding/' } ] }, { - text: 'Multi-Platform', + text: 'Cross-Platform Development', collapsed: false, items: [ { - text: 'Advanced 3: WeChat Mini Program', - link: '/zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/' + text: 'How to Choose the Right Platform for Your App', + link: '/en/stage-3/cross-platform/3.1-choose-platform/' }, { - text: 'Advanced 4: WeChat Mini Program (Backend)', - link: 
'/zh-cn/stage-3/cross-platform/3.4-wechat-miniprogram-backend/' + text: 'How to Build a Simple WeChat Mini Program', + link: '/en/stage-3/cross-platform/3.3-wechat-miniprogram/' }, { - text: 'Advanced 5: Android App - Compose', - link: '/zh-cn/stage-3/cross-platform/3.5-android-app/' + text: 'How to Build a WeChat Mini Program with a Backend', + link: '/en/stage-3/cross-platform/3.4-wechat-miniprogram-backend/' }, { - text: 'Advanced 6: iOS App - SwiftUI', - link: '/zh-cn/stage-3/cross-platform/3.6-ios-app/' + text: 'How to Build an Android App with Jetpack Compose', + link: '/en/stage-3/cross-platform/3.5-android-app/' }, { - text: 'Advanced 7: Personal Website & Blog', - link: '/zh-cn/stage-3/personal-brand/3.7-personal-website-blog/' + text: 'How to Build an iOS App with SwiftUI', + link: '/en/stage-3/cross-platform/3.6-ios-app/' + }, + { + text: 'How to Build a PWA Local App', + link: '/en/stage-3/cross-platform/3.8-pwa-local-app/' + }, + { + text: 'How to Build a Browser AI Assistant Extension', + link: '/en/stage-3/cross-platform/3.9-browser-ai-extension/' + }, + { + text: 'How to Build a Cross-Platform Electron Desktop App', + link: '/en/stage-3/cross-platform/3.10-electron-voice-to-text/' + }, + { + text: 'How to Quickly Build and Mint an NFT', + link: '/en/stage-3/cross-platform/3.11-nft-minting/' + }, + { + text: 'How to Build a VS Code Extension', + link: '/en/stage-3/cross-platform/3.12-vscode-extension/' + }, + { + text: 'How to Build an Industrial Qt Desktop App', + link: '/en/stage-3/cross-platform/3.13-qt-industrial-hmi/' + }, + { + text: 'How to Build Your Personal Website and Academic Blog', + link: '/en/stage-3/personal-brand/3.7-personal-website-blog/' } ] }, @@ -572,12 +610,12 @@ const stage3SidebarEn = [ collapsed: false, items: [ { - text: 'AI Advanced 1: RAG Introduction', - link: '/zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/' + text: 'What Is RAG and How It Works', + link: '/en/stage-3/ai-advanced/3.a1-rag-introduction/' }, { - text: 
'AI Advanced 2: LangGraph & Advanced RAG', - link: '/zh-cn/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/' + text: 'Advanced RAG and Workflow Orchestration with LangGraph', + link: '/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/' } ] } @@ -2121,12 +2159,12 @@ Sitemap: ${siteUrl}/sitemap.xml }, { text: 'Full-Stack Development', - link: '/zh-cn/stage-2/', + link: '/en/stage-2/', activeMatch: '/en/stage-2/' }, { text: 'Advanced Development', - link: '/zh-cn/stage-3/', + link: '/en/stage-3/', activeMatch: '/en/stage-3/' }, { diff --git a/docs/en/stage-1/1.0-finding-great-idea/index.md b/docs/en/stage-1/1.0-finding-great-idea/index.md index e58ac5f..3fc9cbe 100644 --- a/docs/en/stage-1/1.0-finding-great-idea/index.md +++ b/docs/en/stage-1/1.0-finding-great-idea/index.md @@ -433,154 +433,233 @@ The reconstructed product is called "Diabetics Eat with Peace of Mind," position The ordinary idea is aggregating news from various platforms, saving the trouble of opening them one by one. But Toutiao, Tencent News, etc., already do this well. -Xiao Ming horizontally segmented and found the investment research group: They need to track industry dynamics, but information is scattered; Need to quickly filter valuable information from massive news; Need to correlate news with stock price movements. +Xiao Ming then did horizontal segmentation and found that financial analysts have a special need: they must track dynamics in specific industries, but information is too fragmented. He further deep-dived into their scenarios: in the morning they check overnight U.S. market moves and exchange-rate changes; during the day they track announcements and industry news for portfolio companies; in the afternoon they research potential targets and need large amounts of sector information. -The reconstructed product is called "Investment Research Intelligence Officer," positioned as an "intelligent assistant for investment researchers." 
+The reconstructed product is called "Investment Research Intelligence Officer," positioned as an "information radar and decision assistant" for financial professionals. + +--- + +#### Example 3: From "Campus Second-Hand Platform" to "Graduation Clearance Assistant" + +The ordinary idea is a campus second-hand marketplace. But Xianyu and Zhuanzhuan are already very mature. + +After horizontal segmentation, Xiao Ming found that graduates have a special need: they have too many things, and selling one by one is too troublesome. Deep-diving into their scenarios: they must leave campus within a week before graduation and do not have time to sell slowly; they do not know who needs their items; bargaining, delivery, and payment collection are all too cumbersome. + +The reconstructed product is called "Graduation Clearance Assistant," positioned as a "move-out asset manager" for graduates. + +--- + +### Act 2 Summary: Xiao Ming's Takeaways + +Through Act 2, Xiao Ming understood: + +**1. Start from yourself** +- You yourself are a user, so you naturally understand that group's pain points +- Interest is the best starting point, and passion is the best driving force + +**2. Segment users horizontally** +- Do not serve "everyone"; find "the segment with the strongest pain" +- The more specific the segment, the more opportunity and stronger willingness to pay + +**3. Deep-dive scenarios vertically** +- Describe the complete user journey: before use, during use, after use +- Find emotional touchpoints: fear, anxiety, helplessness, loneliness... + +**4. 
Reconstruct value** +- Upgrade from a "feature" to a "solution" +- Upgrade from a "tool" to an "assistant/manager/companion" + +--- + +📦 **Act Output:** +- Found one precise target user group (postpartum moms) +- Understood users' complete scenarios and real emotions +- Reconstructed the product concept with a clear differentiation strategy --- ## Act 3: AI Dialogue Refinement -After finding the direction, Xiao Ming didn't rush to code. He knew that a good idea also needs a clear execution plan. +Xiao Ming now had a clear product direction: a postpartum mom recovery assistant. But he still did not know exactly how to execute it, where to start, or how difficult the technical implementation would be. -He decided to use AI to help him refine the product concept. +He decided to use AI to help refine the concept and turn the idea into an actionable plan. -### Using AI to Refine Product Concepts +### Round 1: Present the raw idea -Xiao Ming opened ChatGPT and entered the following prompt: - -``` -I want to create a "Postpartum Mom Recovery Assistant" product. -Target users: Postpartum moms within 1 year -Core pain point: Want to recover figure but no time, don't know how to exercise safely -Main features: -1. Fragmented training (10-15 minutes per session) -2. Postpartum-specific courses -3. AI movement correction -4. Psychological support community - -Please help me refine: -1. Is this product concept complete? What's missing? -2. What should be the MVP (Minimum Viable Product) features? -3. What are the potential risks and challenges? -4. How to validate market demand? -``` - -AI gave detailed suggestions: - -**1. Product Concept Completeness Analysis:** -- Missing: User acquisition channels, retention mechanisms, monetization path -- Suggestion: Add referral mechanism, achievement system, tiered membership - -**2. 
MVP Features:** -- Core: 10 postpartum recovery courses + simple movement guidance -- Don't need: AI movement correction (high technical difficulty, can be added later) -- Don't need: Community (need user base first) - -**3. Potential Risks:** -- Medical compliance risk: Postpartum recovery involves medical knowledge -- User trust: Why would users trust your courses? -- Competition: Keep might add postpartum features - -**4. Market Validation Methods:** -- Create a landing page, collect interested users' emails -- Interview 20 postpartum moms, understand their real needs -- Pre-sell course packages, test willingness to pay - -### Using AI to Generate Business Plans - -Xiao Ming continued to ask AI: - -``` -Based on the above analysis, please help me generate a simple business plan, including: -1. Product positioning and target users -2. Core features and MVP scope -3. Business model and monetization path -4. Market validation plan -5. Risk assessment and countermeasures -``` - -AI generated a structured business plan that Xiao Ming could directly use to communicate with potential partners or investors. +Xiao Ming told AI his idea: "I want to build a postpartum mom recovery assistant app to help postpartum moms recover their figure. But I worry about technical implementation difficulty, especially the AI movement-correction feature. Can one person do this?" --- -## Summary: The Complete Methodology for Finding Great Ideas +### Round 2: AI helps plan the MVP -Through Xiao Ming's story, we learned a complete methodology: +AI replied: "Great question! Let's create an MVP (Minimum Viable Product) plan." -### 1. Establish Judgment Criteria -- Real needs = Users willing to pay + willing to change behavior + significant loss without solution -- Priority: Pain points > Delight points > Itch points +AI suggested a three-phase plan. Phase 1 is 1-2 months, and the goal is demand validation. 
Do not build a full app yet; first build a WeChat mini program that only provides postpartum-specific courses (videos) and a check-in community. Use manual guidance for movement feedback: users upload videos and coaches reply. The goal is to validate whether postpartum moms are willing to pay. -### 2. Discover Pain Points -- Start from yourself and people around you -- Horizontal segmentation: Find specific user groups -- Vertical deep dive: Understand complete user scenarios +Phase 2 is 3-6 months. If Phase 1 validates successfully, then add core features. Develop AI movement correction, possibly with existing SDKs such as Baidu human-pose recognition. Add personalized plans and optimize community functions. -### 3. Validate Needs -- Talk to real users -- Analyze existing alternatives -- Test willingness to pay -- Estimate market size -- Consider competitive barriers +Phase 3 is 6-12 months, for scaling. Build an independent app, cooperate with postpartum recovery institutions, and expand to pregnancy and pre-pregnancy scenarios. -### 4. Refine Product Concept -- Use AI to help refine ideas -- Define MVP scope -- Develop business plan -- Plan market validation +For technical implementation, AI suggested using existing AI capabilities for movement correction, such as Baidu or Tencent APIs, instead of building from scratch, to reduce cost. In the early stage, use a hybrid mode of manual plus AI, then transition gradually. -### Key Takeaways +For startup cost, mini program development is about 10,000-20,000 RMB if outsourced, or do it yourself with AI IDE. For content creation, cooperate with postpartum recovery coaches. For promotion, start from mom groups and Xiaohongshu. -1. **Direction is more important than effort** — Wrong direction, the more you do, the more wrong -2. **Real needs are voted by users** — Not imagined by product managers -3. **Segmentation is key** — Serving a small group well is better than pleasing everyone poorly -4. 
**Validate early** — Don't invest heavily before validating demand -5. **AI is your assistant** — Use AI to refine ideas, but the final judgment is yours +Validation KPI: 100 users in the first month and a 10% paid conversion rate. If reached, demand is real and you continue investing; if not, adjust direction in time. --- -In the next chapter, we'll take our validated ideas and start learning how to use AI IDE to turn them into interactive product prototypes. +### Round 3: Xiao Ming raises concerns -## Final Act: Your Action Plan - -### Memory Mantra - -**Start from one person, one concrete task, one entry point; segment broadly, drill deeply; refine with AI; validate in five steps before building.** - -Interpretation: - -- **One person**: Start from a user group you naturally understand -- **One task**: Focus on one concrete problem, not everything at once -- **One entry point**: Find a sharp initial wedge -- **Horizontal segmentation**: Identify the user subgroup with strongest willingness to pay -- **Vertical deep dive**: Understand the full user scenario and journey -- **AI dialogue refinement**: Use AI to sharpen product concept and execution plan -- **Five-step validation**: Verify demand before investing heavily +After thinking, Xiao Ming asked again: "I am worried about content production. Postpartum recovery courses require professional guidance, and I do not have a medical background. How can I ensure course quality?" --- -### Post-class Exercise +### Round 4: AI provides solutions -Pick one small but real annoyance from your daily life and apply this chapter's method: +AI replied: "This is a common concern for technical founders, but there are solutions." + +Solution 1 is to collaborate with professionals. Work with postpartum recovery coaches or hospital nutrition doctors. You provide the technology and product; they provide professional content. Use a revenue-sharing model. + +Solution 2 is to integrate high-quality existing content. 
In the early stage, use public postpartum recovery videos, such as from YouTube or Bilibili. Do content review and curation first, then produce original content later. + +Solution 3 is user-generated content. Encourage experienced moms to share their recovery methods, and use community voting to filter quality content, gradually building a content ecosystem. + +AI recommended starting with Solution 1. Specific operation: find 2-3 postpartum recovery coaches and discuss cooperation; you build the product, they provide courses. Revenue split: you take 70%, they take 30%. This ensures quality while reducing cost. Early-stage course production cost can be zero with revenue sharing; you only invest development time and promotion cost. + +::: tip My suggestion: start with Solution 1 + +Specific actions: +1. Find 2-3 postpartum recovery coaches +2. Propose cooperation: you build the product, they provide courses +3. Revenue split: you take 70%, they take 30% +4. This ensures content quality and lowers cost + +**Early-stage cost:** +- Course production: 0 RMB (revenue-share model) +- You only need to invest development time and promotion cost + +::: + +--- + +### Round 5: Xiao Ming asks about promotion + +Xiao Ming had one final concern: "Will promotion cost be high? How do I find postpartum moms?" + +--- + +### Round 6: AI provides promotion strategy + +AI replied: "This group is actually easy to find, and promotion cost can be very low." + +The most recommended channel is Xiaohongshu. Search keywords like "postpartum recovery" and "diastasis recti," find related creators, and DM for collaboration. Offer "1 month free trial" in exchange for promotion, with near-zero cost. + +Second is mom groups. Search WeChat groups and QQ groups. Join under the name of "postpartum recovery experience sharing," share useful content, do not post direct ads, and gradually build trust.
+ +You can also cooperate with hospital obstetrics departments by providing free postpartum recovery guidance and distributing flyers in hospitals, with only a few hundred RMB printing cost. Or cooperate with maternal-and-infant stores, place promotional materials, and provide trial cards with purchases, with only trial-card production cost. + +Validation metrics: in the first month, 100 users and 10 paid users (10% conversion rate), total promotion cost under 1000 RMB, and customer acquisition cost under 10 RMB per user. If these metrics are met, demand is real and you can continue investing. + +--- + +### Final: Xiao Ming now has a clear plan + +After 6 rounds of dialogue, Xiao Ming finally had a clear plan. + +Phase 1 (1-2 months): build a WeChat mini program, cooperate with 2-3 postpartum recovery coaches (revenue share), provide only postpartum-specific courses (videos) and a check-in community, and use manual movement guidance. Target: 100 users and 10% paid conversion. + +Phase 2 (3-6 months): if Phase 1 validates successfully, continue investing. Add AI movement correction, personalized plans, and optimize community features. + +Phase 3 (6-12 months): develop an independent app, cooperate with postpartum recovery institutions, and expand to pregnancy and pre-pregnancy phases. + +Startup cost is very low: development done by yourself using AI IDE (0 RMB), content with coach revenue sharing (0 RMB in early stage), and promotion via Xiaohongshu plus mom groups (under 1000 RMB). Total cost under 1000 RMB. + +--- + +### The 5-step method for AI dialogue refinement + +From this case, Xiao Ming summarized a standard AI dialogue workflow (see Appendix E for details). + +**Step 1: Present the raw idea.** Describe your initial idea, even if rough. Tell AI your concerns, such as heavy competition or unclear differentiation. + +**Step 2: Ask AI to plan the MVP.** What should the minimum viable product include? How many phases? What are the goals in each phase? 
How difficult is implementation? + +**Step 3: Raise your concerns.** Technical difficulty? Content production cost? Promotion cost? User acquisition difficulty? Tell AI all your concerns. + +**Step 4: Ask AI for concrete solutions.** AI will provide specific suggestions for your concerns. Compare options and choose the best one. Estimate costs. + +**Step 5: Finalize the plan.** Organize a clear action plan and set validation metrics. If targets are not met, adjust in time. + +**Prompt template:** +```text +I want to build a [product concept], +but I am worried about [your concern]. +Please help me: +1. Plan an MVP +2. Give concrete technical implementation suggestions +3. Estimate cost +4. Set validation metrics +``` + +--- + +### Act 3 Summary: Xiao Ming's Takeaways + +Through Act 3, Xiao Ming understood three things. + +**First, use AI dialogue to refine product concepts.** Do not expect one conversation to produce a perfect answer; iterate through multiple rounds. Tell AI your observations, experiences, and feedback from people around you. If AI suggestions are unreasonable, point it out in time. Always end with a concrete action plan. + +**Second, MVP core principles.** Keep it minimal, and only build the core function. Make it verifiable, so you can quickly validate whether demand is real. Keep it low cost, and validate with the smallest possible investment. + +**Third, validation metrics.** Paid conversion > 10% means demand is real and worth investment. Paid conversion 5-10% means demand exists but needs refinement. Paid conversion < 5% means demand does not hold and direction should be adjusted. + +--- + +📦 **Chapter Output:** +- A clear MVP plan +- A known technical implementation path +- Defined validation metrics + +--- + +## Final Act: Your Action + +### Memory mantra + +**Start from one person, one thing, one entry point. 
Segment horizontally, dig vertically, refine through AI dialogue, and only build after five-step validation.** + +**Explanation:** +- **One person:** Start from yourself because you naturally understand this group +- **One thing:** Focus on one concrete thing and do not be greedy +- **One entry point:** Find a sharp entry point, and the more segmented, the better +- **Horizontal segmentation:** Find users with the strongest willingness to pay +- **Vertical deep dive:** Understand users' complete journey +- **AI dialogue:** Refine product concepts with AI dialogue +- **Five-step validation:** Use the five-step method to validate that demand is real + +--- + +### Post-class exercise + +Choose one small annoyance from your daily life and expand it using this chapter's method: ::: tip Exercise Task -**1. Describe the annoyance in one sentence** -- Example: "I want to build a budgeting app to help users track spending." +**1. Describe this annoyance** (in one sentence) +- Example: "I want to build a bookkeeping app to help users record spending." -**2. Horizontal segmentation: list 3 user groups with different needs** +**2. Horizontal segmentation: find 3 user groups that may have different needs** - Example: small business owners, parents of overseas students, freelancers -**3. Choose one user group and perform a vertical deep dive** -- Example: parents of overseas students worry about spending but cannot see detailed usage +**3. Select one group, then deep-dive vertically: describe their complete scenario and real emotions** +- Example: scenario of overseas-student parents - they want to know how much their child spends abroad, but the child does not tell them -**4.
Reconstruct product concept: evolve from "one feature" into "one solution"** +- Example: "Overseas Spending Steward" - not just bookkeeping, but giving parents confidence and visibility into overseas spending -**5. Evaluate your idea with the validation checklist (see Appendix F)** +**5. Evaluate your idea with the validation checklist** (see Appendix F) -Share your analysis with the community and discuss with others. +**Share your analysis in the community and discuss with other learners!** ::: @@ -588,251 +667,420 @@ Share your analysis with the community and discuss with others. ## Appendix: SOP Methodology -### Appendix A: 5-Step Demand Validation Method +### Appendix A: 5-Step judgment method for need analysis -When you have an idea, how do you quickly judge whether it is worth building? +When you have an idea, how can you quickly judge whether it is worth investing in? -**Step 1: User validation (find 10 target users)** +**Step 1: User validation - find 10 target users** -Do **not** ask: "Would you use my product?" +**Do not ask:** "Will you use my product?" (false-positive rate is around 90%) -Ask: -1. "How do you solve this problem today?" (real behavior) -2. "How many times did this bother you in the last week?" (frequency) -3. "How much time/money do you currently spend to solve it?" (willingness to pay) -4. "If a solution requires behavior change, would you do it?" (switching cost) +**Ask instead:** +1. "How do you currently solve this problem?" (understand real behavior) +2. "How many times did this problem bother you in the last week?" (understand frequency) +3. "How much money/time did you spend to solve it?" (understand willingness to pay) +4. "If there is a solution but it requires changing habits, are you willing?" 
(understand change cost) -Evaluation hints: -- If 3+ users say "this hurts me every day," it may be a pain point -- If users say "interesting, but not urgent," it's likely an itch point -- If users already use alternatives but remain unsatisfied, there is a real opening +**Decision criteria:** +- If more than 3 users say "this gives me a headache every day" - it may be a pain point +- If users say "interesting, but not urgent" - most likely an itch point +- If users say "I currently use XX, but I'm not satisfied" - there is an opportunity -**Key question:** what alternative are users using now? +**Key question:** what method do users currently use to solve this problem? -| Alternative Type | What It Means | Opportunity | +| Alternative Type | Description | Opportunity Assessment | |------------|------|---------| -| No alternative | Users endure silently | Big opportunity, but requires market education | -| Very manual workaround | Excel/manual/multi-person patchwork | Good opportunity | -| Tool stitching | A + B + C tools combined | Good integration opportunity | -| Mature product, dissatisfied | Existing gap in experience | Opportunity with differentiation | -| Mature product, satisfied | Need disruptive edge | Low probability opportunity | +| **No alternative** | Users silently endure | Big opportunity, but market education is required | +| **Using clumsy methods** | Excel, manual work, multi-person collaboration | Good opportunity, users want better solutions | +| **Combining multiple tools** | Tool A + Tool B + Tool C | Good opportunity, integration has value | +| **Using mature products** | But users are unsatisfied | Opportunity exists, but differentiation is needed | +| **Using mature products** | Users are satisfied | Very small opportunity unless there is disruptive innovation | ::: tip What is "disruptive innovation"? -It is not just "better features"; it is usually a simpler/cheaper way to serve ignored users, then expanding upward over time.
+ +**Simple definition:** not making products incrementally better, but serving previously overlooked user groups with a simpler/cheaper approach. + +**Examples:** +- Traditional phones -> smartphones (not just more functions, but a completely different interaction model) +- Traditional taxis -> Didi/Uber (not better cars, but on-demand ride calling anywhere) +- Traditional bookstores -> e-books (not more books, but easier carrying and purchasing) + +**Key point:** disruptive innovation often starts from low-end markets or new user groups, and then gradually moves upward. + ::: -**Most direct validation method: pre-order/deposit** +**Cases:** +- Diabetics currently control diet by "experience + guessing" (very clumsy method) -> big opportunity +- Ordinary dieters use Bohe Health (mature product, medium satisfaction) -> opportunity for vertical segmentation +- Students use WeChat groups for second-hand trading (multiple tools stitched together) -> opportunity for integration -1. Build a simple landing page -2. Add a pre-order or reservation button -3. Measure how many users actually pay (even a small deposit counts) +**Most effective method: presale or deposit** -Decision threshold: -- Deposit conversion > 10%: demand is likely real -- Deposit conversion 5%-10%: demand exists but product needs refinement -- Deposit conversion < 5%: concept likely not working yet +**Steps:** +1. Create a simple landing page and describe your product concept +2. Put a "presale" or "reservation" button +3. See how many people are willing to pay (even 1 RMB counts) -Simple market formula: +**Decision criteria:** +- Users willing to pay deposit > 10%: demand is real and worth doing +- 5%-10%: demand exists but needs refinement +- < 5%: demand may not be valid, or product concept has issues +**Note:** many people say "I will buy." The people who actually pay are your real target users. 
+ +**Simple formula:** ```text -Market size = target user count × willingness to pay × average order value +Potential market size = target user count × willingness to pay × average order value ``` -**Early-stage moat reality:** -- Most early products do not start with a strong moat -- That's normal; speed of execution matters first -- Build traction first, then strengthen barriers +**Case: campus second-hand trading platform** +- Target users: 40 million college students in China +- With second-hand trading demand: 50% = 20 million +- Willing to use platform: 10% = 2 million +- Annual transaction frequency: 2 times +- Platform commission: 5% +- Average order value: 100 RMB +- Potential market size = 2,000,000 × 2 × 100 × 5% = 20 million RMB/year + +**Decision criteria:** +- Market size > 1 billion RMB: large track, worth pursuing +- 100 million-1 billion RMB: medium/small track, possible but ceiling is visible +- < 100 million RMB: niche market, suitable for side business or a small-and-beautiful business + +**Key question:** if the product succeeds, what if others copy it? + +**Common moat types:** + +| Moat Type | Description | Example | +|-----------|------|------| +| **Network effects** | More users -> more product value | WeChat, Didi | +| **Data accumulation** | More data -> better algorithm | Toutiao, Douyin | +| **Brand cognition** | Occupying user mindshare | Coca-Cola, Nike | +| **Scale effects** | Larger scale -> lower costs | JD logistics, Amazon | +| **Technical patents** | Core technology barriers | Huawei, DJI | +| **Switching costs** | High migration cost for users | Enterprise software, operating systems | + +**Early-stage reality:** +- Most early projects do not have clear moats +- But that is fine; the key is to **move fast** +- Occupy market first, then build barriers --- -### Appendix B: Horizontal Segmentation Method +### Appendix B: Horizontal user-segmentation method -Do not target "all X users." 
Choose one specific segment with sharper pain and clearer willingness to pay. +Do not try to serve "all XX users." Instead, find **one specific group** with sharper and more concrete needs. -Steps: -1. List all possible subsegments -2. Evaluate each segment -3. Pick one and go deep +**Step 1: List all possible segmented user groups** -| Evaluation Dimension | Explanation | +For your product concept, list all possible user groups. + +**Step 2: Evaluate the business value of each group** + +| Evaluation Dimension | Description | |---------|------| -| Pain intensity | Pain point or itch point? | -| Willingness to pay | How much would they pay? | -| Market size | How many users are there? | -| Competition | Are existing solutions already good enough? | -| Your understanding | Do you understand this group and have access channels? | +| Pain intensity | Is this group's need a pain point or itch point? | +| Willingness to pay | How much are they willing to pay for a solution? | +| Market size | How many people are in this group? | +| Competition level | Are current solutions satisfactory? | +| Your understanding of this group | Do you understand this group? Do you have access channels? 
| -Selection priority: -- Highest pain intensity -- Strongest payment intention -- Best domain understanding from you -- Relatively lower competitive pressure +**Step 3: Choose one group for deep analysis** + +Choose the one that is: +- most painful +- highest willingness to pay +- best understood by you +- relatively less competitive + +::: tip Segmentation Example + +**Product concept:** bookkeeping app + +| Segmented Group | Pain Point | Willingness to Pay | Market Size | Competition | +|---------|------|---------|---------|---------| +| Ordinary office workers | Recording is troublesome | Low | Large | High | +| Small business owners | Personal/company spending is mixed up | High | Medium | Medium | +| Freelancers | Unstable income, need cash-flow forecast | High | Medium | Medium | +| Parents of overseas students | Want to know child's spending but child does not say | High | Small | Low | + +**Chosen segment:** parents of overseas students (strongest pain point, high willingness to pay, relatively low competition) + +::: --- -### Appendix C: Vertical Scenario Deep-Dive Method +### Appendix C: Vertical scenario deep-dive method -After choosing a segment, do not stop at features. Understand the full context and emotional journey. +After finding the user group, do not stop at a single feature. You need to understand the user's **complete scenario**. -Steps: -1. Describe the user's day end-to-end -2. Find pain points in each scenario -3. Identify emotional triggers (fear, anxiety, helplessness, loneliness, frustration) -4. Rebuild product value around scenario + emotion +**Step 1: Describe one full day of the user** -The goal is to evolve from "tool feature" to "complete solution." +From morning to night, describe the complete scenario in which the user interacts with your product. + +**Step 2: Analyze pain points in each scenario** + +In each scenario, what problems does the user encounter? What emotions appear? 
+ +**Step 3: Find emotional touchpoints** + +Fear, anxiety, helplessness, loneliness, anger, regret... + +**Step 4: Reconstruct value** + +Based on scenarios and emotions, reconstruct product value. + +::: tip Deep-Dive Example + +**User group:** postpartum moms + +| Time | Scenario | Pain Point | Emotion | +|------|------|------|------| +| 6 AM | Baby just fell asleep, 30 minutes free | Do not know what movement is safe | Fear | +| 10 AM | Holding baby to help sleep, lower back soreness | Hands occupied, wants recovery exercise | Anxiety | +| 3 PM | Baby sleeping, wants to exercise | Body is tired, unsure if can continue | Helplessness | +| 8 PM | Finally has time | Sees body in mirror and feels life is over | Depression | +| Long term | No one understands | Feels like only self suffers this much | Loneliness | + +**Reconstructed value:** upgrade from "fitness tool" to "rehab coach + psychological supporter" + +::: --- -### Appendix D: More Examples of Idea Reframing +### Appendix D: More examples from ordinary ideas to great ideas -#### Example 1: From "Bookkeeping App" to "Overseas Spending Steward" +#### Example 1: From "bookkeeping app" to "Overseas Spending Steward" -- Generic idea: automatic spending categorization -- Core problem shift: not bookkeeping, but parents' loss of control over overseas spending visibility -- Reframed value: real-time child spending sync + overspending alerts + monthly analysis + peer benchmark insights +**Ordinary idea:** automatic bookkeeping app, connecting bank cards to auto-categorize spending -#### Example 2: From "Pomodoro Tool" to "Remote Work Proof" +**Problem:** there are already SuiShouJi, WaCai, Alipay bills... 
-- Generic idea: focus timer -- Core problem shift: not focus itself, but trust gap between remote worker and manager -- Reframed value: time tracking + productivity reports + privacy-safe activity summaries + auto daily report +**Horizontal segmentation:** +- Parents of overseas students: want to know how much their child spends abroad and whether they overspend -#### Example 3: From "Used Book Marketplace" to "Picture Book Subscription Library" +**Vertical deep dive:** +- Pain point is not bookkeeping but **"loss of control"** - do not know how much the child spends or where money goes +- Scenario: every month parents see credit-card bills, but the child never proactively explains spending -- Generic idea: used book trading -- Core problem shift: short lifecycle of kids' picture books -- Reframed value: monthly age-fit picture book rotation + progress tracking + hygiene guarantee +**Reconstructed concept:** "Overseas Spending Steward" - not only bookkeeping, but letting parents "have clear visibility" on overseas spending + +**Core features:** +- Real-time child spending sync +- Overspending alerts +- Monthly spending analysis reports +- Peer comparison among similar students ("your child spends 20% above average") --- -### Appendix E: 5-Step AI Dialogue Method for Concept Refinement +#### Example 2: From "Pomodoro tool" to "Remote Work Proof" -Use multi-round AI dialogue to turn rough ideas into executable plans. +**Ordinary idea:** Pomodoro app to help users focus -**Step 1: Submit raw idea and concerns** +**Problem:** phones already have screen-time stats, plus Forest and Pomodoro Todo... +**Horizontal segmentation:** +- Remote workers: need to prove to managers that they are truly working + +**Vertical deep dive:** +- Pain point is not "cannot focus," but **"trust crisis"** - if manager cannot see me, how do I prove I am working? +- Scenario: every day after work, manager asks "how was your progress today?" 
and there is no proof + +**Reconstructed concept:** "Remote Work Proof" - helping remote workers build trust with employers + +**Core features:** +- Automatic work-time tracking +- Productivity reports +- Screen activity summaries (privacy-protected) +- Auto-generated daily work report sent to supervisor + +--- + +#### Example 3: From "second-hand book trading" to "Picture Book Library" + +**Ordinary idea:** second-hand book trading platform + +**Problem:** there are already Duozhuayu, Xianshu, and Kongfuzi used-book marketplaces... + +**Horizontal segmentation:** +- Mom users: children's picture books become idle after reading, but buying new books is expensive + +**Vertical deep dive:** +- Pain point is not "books are expensive," but **"short lifecycle of picture books"** - books for age 3 are not read at age 4 +- Scenario: home is full of picture books that children no longer read, but throwing them away feels wasteful + +**Reconstructed concept:** "Picture Book Library delivered to your home" - not selling used books, but providing "rental of usage rights" + +**Core features:** +- Picture book subscription (mail 5 age-appropriate books each month, return after reading, then rotate new ones) +- Reading progress tracking +- Age-appropriate recommendations +- Sterilization guarantee + +--- + +### Appendix E: 5-step method to refine product concepts via AI dialogue + +Use multi-round AI dialogue to gradually refine ordinary ideas into precise, executable product concepts. + +**Operation:** +- Describe your initial idea (even if rough) +- Tell AI your concerns (heavy competition, unclear differentiation, etc.) + +**Prompt:** ```text -I want to build [product concept], but I found [problem/concern]. +I want to build [product concept], +but I found [problem/concern]. 
``` -**Step 2: Ask AI for MVP plan** +**Operation:** +- Ask AI to create a minimum viable product plan +- Discuss implementation difficulty and costs +- Define validation metrics +**Prompt:** ```text Please help me: 1. Plan an MVP -2. Provide concrete technical suggestions +2. Provide concrete technical implementation advice 3. Estimate cost 4. Define validation metrics ``` -**Step 3: Raise concrete concerns** +**Operation:** +- Technical difficulty? +- Content production cost? +- Promotion cost? +- User acquisition difficulty? +**Prompt:** ```text -I worry about: +I am worried about: 1. [Concern 1] 2. [Concern 2] 3. [Concern 3] ``` -**Step 4: Request option comparison and solutions** +**Operation:** +- Provide concrete solutions for your concerns +- Compare multiple options and choose the best +- Estimate costs +**Prompt:** ```text -Please provide specific solutions for my concerns. +Please provide concrete solutions for my concerns. ``` -**Step 5: Confirm final action plan** +**Operation:** +- Organize a clear action plan +- Set validation metrics +- If metrics are not met, adjust direction quickly +**Prompt:** ```text Please help me organize a clear action plan. ``` -::: tip Key Techniques -- Multi-round iteration: don't expect one perfect answer -- Add context: your observations, lived experience, user feedback -- Challenge weak AI suggestions when needed -- Always end with an actionable execution plan +::: tip Key techniques + +- **Multi-round dialogue:** do not expect a perfect answer in one round; iterate +- **Provide information:** tell AI your observations, experiences, and people-around-you feedback +- **Challenge AI:** if AI suggestions are unreasonable, call that out in time +- **Focus on execution:** always end with a concrete action plan + ::: --- -### Appendix F: Demand Validation Checklist +### Appendix F: Need validation checklist -Before investing development time, verify this checklist. 
The core question is: **will users pay for this?** +Before deciding to invest development time, validate your idea with the checklist below - **the core question is always: will users pay for this?** + +::: tip Need Validation Checklist -::: tip Validation Checklist **1. User profile clarity** -- ☐ Can you describe the target user in one sentence? -- ☐ Do you know their current alternative? -- ☐ Can you describe specific usage scenarios? -- ☐ Does this group have purchasing power? +- ☐ Can you describe your target user in one sentence? +- ☐ Can you state what alternative they currently use? +- ☐ Can you describe specific details of their usage scenario? +- ☐ Does this user group have payment capability? -**2. Pain intensity** -- ☐ What cost do users pay now (time/money/effort)? -- ☐ What happens if this problem stays unsolved? -- ☐ Are users actively searching for solutions? -- ☐ How much are they willing to pay? +**2. Pain intensity evaluation** +- ☐ What cost do users pay now to solve this problem? (time/money/effort) +- ☐ If they do not solve it, what consequence follows? +- ☐ Are users actively seeking solutions? +- ☐ How much are users willing to pay for this? -**3. Differentiation** -- ☐ What is your advantage over existing options? -- ☐ Is that advantage strong enough to trigger switching? -- ☐ Is your capability hard to copy quickly? -- ☐ Is differentiation strong enough to support payment? +**3. Solution differentiation** +- ☐ Compared with existing solutions, what is your advantage? +- ☐ Is that advantage strong enough to make users switch? +- ☐ How hard is it for big platforms to copy your feature? +- ☐ Is your differentiation enough to support paid conversion? -**4. Business viability** -- ☐ Will users really pay? How much? (must be tested) -- ☐ What is approximate CAC? -- ☐ Can LTV cover CAC? -- ☐ Do you have additional monetization paths? +**4. Business model feasibility** +- ☐ Are users willing to pay? How much? 
(must be tested in reality) +- ☐ What is rough customer acquisition cost? +- ☐ Can user lifetime value (LTV) cover customer acquisition cost (CAC)? +- ☐ Are there additional monetization paths? (ads, value-added services, B2B, etc.) + +**5. Rapid validation plan** +- ☐ Can you build a testable prototype with minimum cost in 1-2 weeks? +- ☐ Can you find 10 target users for interviews? +- ☐ Can you design an experiment to validate the core hypothesis? +- ☐ Can you ask users to prepay deposits to validate willingness to pay? -**5. Rapid validation** -- ☐ Can you build a testable prototype in 1-2 weeks with minimum cost? -- ☐ Can you interview 10 target users? -- ☐ Can you design an experiment to test the core hypothesis? -- ☐ Can you test willingness to pay with real deposits? ::: -Do not ask "Would you use this?" It usually gives false positives. +**Do not ask "Will you use this product?"** +This question mostly gives false positives. -Ask behavior-first questions: -- "How do you solve this now?" -- "How many times did this bother you in the last week?" -- "If the solution requires behavior change, will you change?" -- "Would you buy at price X?" +**Ask instead:** +- "How do you currently solve this problem?" (understand real behavior) +- "How many times did this problem bother you in the last week?" (understand frequency) +- "If there is a solution, but it requires changing your current habit, are you willing?" (understand change cost) +- "If it costs XX RMB, will you buy?" (understand willingness to pay) -Best validation is still real prepaid commitment. +**Best validation:** ask users to prepay deposits. Many people say they are willing to pay, but those who actually pay are your real target users. 
+ +**Key metrics:** +- Deposit-paying user ratio > 10%: demand is real and worth investment +- Deposit-paying ratio 5%-10%: demand exists but needs refinement +- Deposit-paying ratio < 5%: demand is invalid, or product concept has issues --- ## Chapter Summary -In this chapter, through Xiao Ming's story, we learned to evaluate ideas with a product-manager mindset. The core question remained unchanged: **will users pay for this?** +In this chapter, through Xiao Ming's story, we learned how to evaluate product ideas from a product-manager perspective - **the core is always: will users pay for this?** -::: info Core Points -**1. Three standards of real demand** -- Users are willing to pay -- Users are willing to change behavior -- Without a solution, users experience clear loss +::: info Core points -**2. Path from ordinary idea to paid product** -- Horizontal segmentation: find specific users with stronger payment intent -- Vertical deep dive: understand complete scenario and emotional context -- Value reframing: evolve from feature/tool to complete solution +**1. Three standards of real demand:** +- Users are willing to pay for it (the most important standard) +- Users are willing to change behavior for it +- If no solution exists, users suffer clear loss -**3. Avoid fake-demand traps** -- Solving itch points instead of pain points -- Market too small to support the business model -- Solution complexity higher than problem value +**2. Path from ordinary idea to product people will pay for:** +- Horizontal segmentation: find a specific user group, and the more segmented, the stronger willingness to pay +- Vertical deep dive: understand complete scenarios, solving emotions rather than only functions +- Value reconstruction: evolve from tools into solutions and build reasons to pay -**4. Validate payment intention early** +**3. 
Avoid fake-demand traps:** +- Solving pseudo pain points (itch points instead of pain points) +- Market size is too small to support a business model +- Solution is more complex than the problem itself + +**4. How to validate willingness to pay:** - Interview 10 target users in depth -- Use prepaid deposits to test real willingness -- >10% deposit conversion is a practical signal to continue +- Ask users to prepay deposits to verify true willingness +- Only when deposit-paying ratio > 10% is it worth investing -**5. Use AI dialogue for refinement** +**5. Refine product concepts with AI dialogue:** - Iterate through multiple rounds -- Stay execution-focused -- Set measurable validation metrics and adjust quickly +- Focus on execution and action plans +- Set validation metrics and adjust direction promptly + ::: -**Remember:** strong product managers do not invent demand from thin air. They discover real needs that are ignored, underestimated, or poorly served, then build products users are willing to pay for. +**Remember:** good product managers do not create demand from thin air. They discover real needs that are ignored, underestimated, or poorly satisfied, then find ways to make users willing to pay. + +In the next chapter, we will take our validated ideas and start learning how to use an AI IDE to turn them into interactive product prototypes. diff --git a/docs/en/stage-1/1.1-introduction-to-ai-ide/index.md b/docs/en/stage-1/1.1-introduction-to-ai-ide/index.md index fda16f3..2989438 100644 --- a/docs/en/stage-1/1.1-introduction-to-ai-ide/index.md +++ b/docs/en/stage-1/1.1-introduction-to-ai-ide/index.md @@ -1231,139 +1231,3 @@ If you encounter new words in the learning process later, you can also have AI a - Then write a sentence about "what you can imagine it as"; - Finally give a particularly simple small example. This way your "personal glossary" will grow longer and more practical, gradually enabling better communication with computers.
- ---- - -# Appendix 2: Visual Studio Code Menu Bar Analysis - -To help everyone understand the meaning of each option, here we provide an in-depth analysis of the menu bar: - -![](../../../zh-cn/appendix/2-development-tools/editors-and-ai/images/index-2026-01-09-11-35-55.png) - -![](../../../zh-cn/appendix/2-development-tools/editors-and-ai/images/index-2026-01-09-11-36-23.png) - -
- File: Project and File Open/Save/Workspace Management - -This menu is mainly responsible for: **Creating/Opening Files**, **Opening Project Folders**, **Managing Workspaces**, **Saving and Closing**. - -> The most commonly used are: Open Folder to open a project; Open… to open a single file; then use Save / Save All to save changes, and finally use Close Editor / Close Folder to end the current work. Workspace-related content can be slowly learned as you get more projects, no need to understand everything at once. - -- **New Text File**: Create a new unnamed text buffer for temporary notes or quick pasting. -- **New File…**: Create a new file in the project (usually asks you to choose path/name). -- **New Window**: Open a new VS Code window instance. -- **New Window with Profile**: Open a new window with a specified Profile (extension/settings combination), suitable for isolating environments for different courses/projects. -- **Open…**: Open a single file for editing. -- **Open Folder…**: Open a folder as the project root directory (the most commonly used "open project" method). -- **Open Workspace from File…**: Open a `.code-workspace` file to load a workspace with multiple folders/specific settings. -- **Open Recent**: Quickly access recently opened files/folders/workspaces. -- **Add Folder to Workspace…**: Add another folder to the current workspace (forming a multi-root workspace). -- **Save Workspace As…**: Save the current workspace structure as a `.code-workspace` file for easy sharing/reuse. -- **Duplicate Workspace**: Duplicate the current workspace configuration (commonly used to create similar project environments). -- **Save**: Save changes to the current file. -- **Save As…**: Save the current file with a new name/path. -- **Save All**: Save all opened files that have modifications. -- **Share**: Entry related to sharing/collaboration (specific content depends on version and extensions). 
-- **Auto Save**: Toggle auto-save strategy (e.g., delayed save/focus change save). -- **Revert File**: Discard unsaved changes to the current file and revert to the disk version. -- **Close Editor**: Close the current tab. -- **Close Folder**: Close the current project folder (workspace becomes empty). -- **Close Window**: Close the current VS Code window. - -
- -
- Edit: Basic Editing, Find/Replace, Comments and Quick Edit Actions - -This menu is mainly responsible for: **Undo/Redo**, **Cut/Copy/Paste**, **Find/Replace**, **Comments and Editor Actions** (improving editing efficiency). - -- **Undo / Redo**: The most basic operations for when you write code wrong. -- **Cut / Copy / Paste**: Text transportation. -- **Find / Replace**: Search or batch modify in the current file. -- **Find in Files / Replace in Files**: Global (whole project) search and replace, very powerful but use with caution. -- **Toggle Line Comment**: `Ctrl + /`, quickly comment/uncomment the current line. -- **Toggle Block Comment**: `Shift + Alt + A`, quickly comment/uncomment the selected area. -- **Emmet: Expand Abbreviation**: A powerful tool for HTML/CSS development, type shorthand and press Tab to expand code. - -
- -
- Selection: Multi-cursor and Smart Selection - -This menu is mainly responsible for: **Cursor Control**, **Multi-line Editing**, **Expand/Shrink Selection**. This is VS Code's killer feature for improving efficiency. - -- **Select All**: Select all content in the current file. -- **Expand Selection / Shrink Selection**: Intelligently perceive syntax structure, gradually expand or shrink the selection range (e.g., word -> string -> inside parentheses -> whole line -> function body). -- **Copy Line Up / Down**: Quickly clone the current line. -- **Move Line Up / Down**: `Alt + ↑ / ↓`, adjust code line order directly without cut and paste. -- **Add Cursor Above / Below**: `Ctrl + Alt + ↑ / ↓`, enable multi-cursor mode to edit multiple lines simultaneously. -- **Add Cursor to Line Ends**: After selecting multiple lines of text, add a cursor at the end of each line. - -
- -
- View: Interface Layout and Panel Control - -This menu is mainly responsible for: **Toggle Sidebar/Panel**, **Adjust Layout**, **Command Palette**, **Output and Debug Console**. - -- **Command Palette…**: `Ctrl + Shift + P` / `F1`, VS Code's central command center, can search and execute all commands. -- **Open View…**: Quickly open specific sidebar views (such as Explorer, Source Control). -- **Appearance**: Control fullscreen, menu bar visibility, sidebar position, zoom level (Zoom In/Out). -- **Editor Layout**: Split editor (Split Up/Down/Left/Right) for side-by-side code comparison. -- **Explorer / Search / Source Control / Run / Extensions**: Directly switch views in the Activity Bar. -- **Problems / Output / Debug Console / Terminal**: Directly control the display content of the bottom panel. -- **Word Wrap**: `Alt + Z`, control whether long lines of code automatically wrap (does not affect actual file content). - -
- -
- Go: Code Navigation and Jumping - -This menu is mainly responsible for: **Jumping Between Files**, **Jumping Between Symbols (Functions/Variables)**. - -- **Back / Forward**: Like a browser, jump between your cursor history positions. -- **Switch Editor…**: Quickly switch between opened tabs. -- **Go to File…**: `Ctrl + P`, type filename to quickly open files. -- **Go to Symbol in Editor…**: `Ctrl + Shift + O`, list functions/classes/variables in the current file for quick jumping. -- **Go to Definition**: `F12`, jump to the definition of the variable or function at the cursor. -- **Go to References**: `Shift + F12`, see where this variable or function is used. -- **Go to Line/Column…**: `Ctrl + G`, jump to a specified line number. - -
- -
- Run: Debugging and Execution - -This menu is mainly responsible for: **Start Debugging**, **Breakpoint Management**. - -- **Start Debugging**: `F5`, run the program in debug mode (supports breakpoints, variable watching). -- **Run Without Debugging**: `Ctrl + F5`, run the program directly without attaching a debugger (slightly faster). -- **Stop Debugging**: Forcefully end the current debugging session. -- **Restart Debugging**: Run again. -- **Toggle Breakpoint**: `F9`, add or remove a red dot (breakpoint) on the current line. -- **New Breakpoint**: Supports conditional breakpoints, log breakpoints, and other advanced features. - -
- -
- Terminal: Integrated Command Line - -This menu is mainly responsible for: **New Terminal**, **Manage Terminal Windows**. - -- **New Terminal**: Open a new Shell (PowerShell/Bash/Zsh) in the bottom panel. -- **Split Terminal**: Split left/right/up/down in the same terminal panel to run multiple commands simultaneously. -- **Run Task…**: Run build/test tasks defined in `tasks.json`. - -
- -
- Help: Documentation and Feedback - -- **Welcome**: Open the welcome page (contains getting started guide, recent projects). -- **Show All Commands**: Same as Command Palette. -- **Documentation**: Jump to official documentation. -- **Editor Playground**: Interactive tutorial for learning editing techniques. -- **Check for Updates…**: Manually check for updates. -- **About**: View version number, build time, Electron/Node version information. - -
diff --git a/docs/en/stage-1/1.2-building-prototype/index.md b/docs/en/stage-1/1.2-building-prototype/index.md index d137bfe..20aa1af 100644 --- a/docs/en/stage-1/1.2-building-prototype/index.md +++ b/docs/en/stage-1/1.2-building-prototype/index.md @@ -1,537 +1,621 @@ --- -title: 'Building Prototypes - From Business Analysis to Multi-page Product Prototype Implementation' -description: 'Experience the complete loop from business analysis to multi-page product prototype implementation. Learn how to ask business questions, break down requirements, use AI IDE to generate single-page and multi-page applications, and beautify and test prototypes.' +title: 'Build a Prototype Hands-On - From Business Analysis to Multi-Page Product Prototype Implementation' +description: 'Experience the complete loop from business analysis to multi-page product prototype implementation. Learn how to ask business questions, break down requirements, use an AI IDE to generate single-page and multi-page apps, and polish and test prototypes.' --- -# Beginner Level 3: Building Prototypes +# Beginner 3: Build a Prototype Hands-On -## Chapter Overview +## Chapter Introduction - + -In the previous chapter, we learned how to find great ideas — starting from user needs to find directions people are willing to pay for. But finding direction is just the first step. What really tests a product manager is: how to turn vague needs into usable products. +In the previous chapter, we learned how to find a great idea - starting from user needs and finding directions people are willing to pay for. But finding direction is only step one. What really tests a product manager is: how to turn vague requirements into a usable product. -This chapter solves a real problem: Your boss gives you a vague but high-pressure task: "Use AI to improve the efficiency of publishing products to e-commerce platforms" — how do you turn this into a usable product prototype? 
+In this chapter, we solve one real-world problem: your boss throws one sentence at you, "Use AI to improve the efficiency of publishing products to e-commerce platforms." How do you turn that into a usable product prototype? -Unlike building Snake or calculators before, real business can't just imagine features: +Unlike building Snake or a calculator, real business work cannot rely on imagined features: -1. Clarify pain points: Talk to operations, dig out the real pain points from the vague "improve efficiency" -2. Prioritize: Among many problems, solve the most painful one first, don't try to do everything at once -3. Quick validation: Use AI IDE to build a single-page prototype first, then expand to multi-page after it works -4. Make something usable: Finally deliver an e-commerce material workbench that can be demonstrated and operated +1. Clarify pain points: talk to operations and dig out the real pain points hidden behind the vague phrase "improve efficiency" +2. Prioritize: among many problems, solve the most painful one first, instead of trying to do everything at once +3. Validate quickly: use an AI IDE to build a single-page prototype first; once it works, expand to multiple pages +4. Deliver something usable: finally deliver an e-commerce asset workbench that can be demonstrated and operated -We'll learn the transition from making toys to making applications, and learn to empathize and think about customers' real needs. +We will learn the shift from building toys to building applications, and learn how to empathize and think from real customer needs. ::: info Note -This chapter may contain some business terminology. If you don't understand something, you can ask AI for an explanation. +This chapter contains some business terms. If you do not understand one, ask AI for an explanation. :::
-## 1. Define Requirements Before Coding +## 1. Define Requirements Before Writing Code -In previous tutorials, we used AI IDE to easily generate Snake and various mini-games, but these are just toy projects that can't be applied in work and life. If we want AI capabilities to truly serve everyone, we should combine vibe coding with real life and work scenarios. +In earlier tutorials, we used AI IDE tools to quickly generate Snake and mini-games. But those are toy projects and are not directly useful in daily work and life. If we want AI capability to truly create value, we should combine vibe coding with real work and life scenarios. -In the last chapter, we learned how to find great ideas that people are willing to pay for, but finding direction is just the beginning. When actually building products, you'll discover: there's a huge gap between knowing "what to do" and knowing "how to do it." +In the previous chapter, we learned how to find ideas people are willing to pay for, but finding direction is only the beginning. In real product work, you will realize: there is a huge gap between knowing "what to build" and knowing "how to build it." -This gap is the concretization of requirements. +Closing that gap means making requirements concrete. -For example, in classes or personal projects, we often start with the simplest executable features: +For example, in class or personal projects, we often start from the simplest executable function: -- "Make a kanban board, list the tasks." -- "Help me make a drawing tool." -- "Help me make a software that can collect questionnaires." +- "Build a board that lists tasks." +- "Help me build a drawing tool." +- "Help me build software to collect questionnaires." -These are often just a tool, a feature module, not even a clear business problem. More critically, these ideas are often just "you think it's useful," not "users really need it."
+These are often just tools or isolated feature modules, and sometimes not even a clearly defined business problem. More importantly, these ideas are often "I think this is useful," not "users truly need this." -In enterprise projects or startup projects, product managers and engineers often start from larger business propositions. For example, let's assume such a scenario: +In enterprise projects or startup projects, product managers and engineers usually start from larger business goals. For example, assume this scenario: -
Business Scenario:
+
🛍️ Business Scenario:
-

You are an e-commerce operations product manager at a store. Your boss gave you a vague but high-pressure proposition:

-

"Now everyone on WeChat is using AI to make images and copy, it looks pretty simple. Help me set this up so we can be more efficient when listing new products on Douyin e-commerce."

+

You are an e-commerce operations product manager at a store. Your boss gives you a vague but high-pressure assignment:

+

"Everyone on public channels is using AI to create images and write copy, and it looks easy. Set this up for us so we can launch new products on Douyin e-commerce more efficiently."

-At this point you might think: "Boss, you're dreaming again!" However, such vague one-sentence decisions are very common in actual work, even more frequent than your weekly bubble tea orders. Therefore, to be a qualified workplace worker (I'd rather you be the CEO of an emerging startup), we must learn how to transition from making tools for personal use to making real product prototypes. +You might think, "Boss, you are dreaming again." In real work, though, this kind of one-sentence, vague directive is very common. To become a capable professional (or better, an early-stage startup CEO), we must learn how to move from building personal tools to building real product prototypes. -Since we've learned AI IDE, you think about it and this requirement is actually quite simple — just let AI give a prompt based on this, throw it to the Agent and we're done, right? +Since we already learned AI IDE usage, you may think this requirement is easy: give AI a prompt and let the agent do everything: -``` -Please refer to my requirements below, -Help me design an e-commerce material workbench, -Including generation and management functions for product descriptions, images, videos, and other materials. +```text +Please refer to my requirement xxxx, +help me design an e-commerce asset workbench, +including generation and management of product descriptions, images, videos, and other assets. ``` -If you excitedly convert this requirement directly into a prototype and send it to your boss — congratulations, this quarter's bonus is cancelled! +If you excitedly convert this straight into a prototype and send it to your boss - congratulations, your quarterly bonus may disappear. -**Why is this? This is the core pain point we need to solve:** +**Why? This is exactly the core pain point we need to solve:** -Previously when learning AI IDE, we made toy projects for personal use like Snake and calculators — simple features, you know what you want, make it for yourself. 
But **real business scenarios are completely different**: +Previously, when learning AI IDE tools, we mostly built **toy projects for ourselves** like Snake and calculators: simple features, clear personal goals, and "works for me" is enough. But **real business scenarios are completely different**: -- **You're not the user**: The boss wants "improved efficiency," but you don't know how operations works daily or where they're stuck; -- **AI doesn't understand business either**: You throw a vague requirement to AI, it can only guess based on general knowledge. What it makes looks right but actually doesn't work; -- **Good ideas ≠ good products**: You think "adding an AI generation feature" is cool, but users might not need it at all, or it's more troublesome than before. +- **You are not the user**: the boss says "improve efficiency," but you do not know how operations actually works daily or where the bottleneck is. +- **AI does not understand your business either**: if you give AI a vague requirement, it can only guess from generic knowledge. The result may look plausible but be unusable. +- **A good idea is not the same as a good product**: you may think "add AI generation" is cool, but users may not need it, or it might create more friction. -**That's why we must learn "from thinking of ideas to understanding users"** Only when your creativity truly solves others' problems, ask questions and deeply understand the business, can you make something truly valuable. (Good ideas are even more important than good technology) +**That is why we must learn "from having an idea to understanding users."** Only when your idea truly solves someone else's problem, and you ask questions and deeply understand business context, can you produce real value. (A good idea can be even more important than good technology.) 
-### 1.1 From Imagination to Reality: Learn to Ask Business Questions +### 1.1 From Imagination to Reality: Learn to Ask the Business -::: info First, let's clarify: What are requirements? What is business? +::: info 💡 Clarify first: what is a requirement? what is business? -**Requirements** are what users really want, the troubles they encounter, the problems they want to solve. For example, "The boss wants me to list products faster" — this is a requirement. +**A requirement** is what users truly want: the problem they encounter and want solved. +For example, "my boss wants me to launch products faster" is a requirement. -**Business** is what users actually do every day, their way of working. For example, what e-commerce operations does daily: listing products, changing prices, making images, looking at data... these are all business. +**Business** is what users actually do every day: their operational workflow. +For example, daily e-commerce operations tasks include launching products, changing prices, making images, reviewing data, and more. -**Why care about business?** -Because if you don't understand the business, the tools you make might be "look good but nobody uses them." Only by truly understanding how users work daily and where they're stuck can you make something that really helps them. +**Why focus on business?** +If you do not understand the business, you may build something that "looks good but nobody uses." Only when you understand users' daily workflow and bottlenecks can you build something truly helpful. ::: -From the simplest perspective, you can first ask yourself a few questions: +From the simplest angle, ask yourself: -- The boss says "**improve efficiency a bit**" — what does that specifically mean? **Do it faster**? **Spend less money**? **Sell more goods**? -- How are products currently listed? **Where is it not smooth**? -- How many **new products** need to be done daily? How many **images** and how much **text** per product? 
-- In current work, **which task is most troublesome**, **most unwanted**? +- When the boss says "**improve efficiency**," what does that mean exactly? **Faster delivery**? **Lower cost**? **Higher sales**? +- How are products launched now? **Where does the current process break down**? +- How many **new products** are launched each day? How many **images** and how much **text** are needed per product? +- Which tasks in the current workflow are the **most painful** and **most disliked**? -But these are all guessed questions. We need to ask the frontline Douyin e-commerce business people directly, "Where are your difficulties and concerns?" Get more accurate answers through communication: +These are still assumptions. We need to ask frontline Douyin e-commerce practitioners directly: "Where are your actual difficulties, and what do you care about most?" This gives more accurate answers. -::: info Real Business Interview Results +::: info 📋 Real business interview findings -We asked people doing e-commerce operations, and they mentioned these troubles: +We asked e-commerce operators and heard: -**1. Too many things, too scattered** -- One person manages several stores, each store has many products to handle; -- Busy all day: **listing new products**, **changing prices**, **making images**, **looking at data** — one thing not finished before another starts. +**1. Too much, too fragmented** +- One person handles multiple stores, each with many products +- Daily work keeps switching between **launching products**, **changing prices**, **creating images**, and **checking data** -**2. Content creation isn't done once, but iteratively** -- First use **manufacturer-provided images**, **previously used materials** or **reference images found online**, quickly **list** products to test; -- Spend a little money on promotion, **see if anyone buys**; -- Only for **products that sell well** will they seriously make images, write details, shoot videos. 
- -::: -After interviewing the business side, we feel passionate because now we can truly make a product prototype that perfectly fits the business! — Wrong again. If we try to "satisfy all demands at once," the product will be very large and hard to implement within the course timeframe. Therefore, we need to further organize and converge, finding the real core pain points. - -### 1.2 From Divergence to Convergence: Lock in Core Business Pain Points and Features - -::: info Why "converge"? What is a "pain point"? - -**Many problems, but which one to do first?** - -Users might tell you a bunch of problems: A is troublesome, B is troublesome, C is troublesome... But if you try to solve all problems at once, you might end up doing nothing well. So you need to **converge** — from a pile of problems, pick the **most painful, most urgent, most solvable** one to start with. - -**What is a pain point?** -It's the specific problem users **find most annoying, most time-consuming, most want to solve**. Not "I think it's useful," but what users **complain about every day, find painful every time they do it**. +**2. Content is iterative, not one-shot** +- First use **vendor-provided images**, **historical assets**, or **reference screenshots** to quickly launch +- Spend a small budget to test and **see if sales happen** +- Only for **products that perform well** do they invest deeply in image design, detail pages, and video ::: -Through the interview above, we found operations has many problems: interrupted rhythm by activities, managing multiple stores, busy going back and forth between listing/pricing/images/data... +After interviewing the business side, we might feel, "Now we can build the perfect prototype." Still wrong. If we try to satisfy everything at once, the product becomes huge and impossible to land within course time. We still need to narrow and prioritize core pain points. 
-If we try to "solve all these problems," we'll end up with a **comprehensive but hard-to-use tool**. +### 1.2 From Divergence to Convergence: Lock the Core Pain Point and Features -Let's categorize these problems (you can have AI help), roughly three types: +::: info 💡 Why "convergence"? What is a "pain point"? -1. **Rhythm problems**: When to list, when to adjust prices; -2. **Efficiency problems**: How to manage multiple stores and products simultaneously; -3. **Content problems**: How to quickly create product images and copy. +**There are many problems. Which one do we solve first?** -For our course, the most suitable to solve first is **the 3rd type: content creation problems**. But "quickly create content" is still a bit abstract. Let's ask the business side specifically where they're stuck: +Users can list many issues: A hurts, B hurts, C hurts. If we try to solve all of them at once, we may solve none well. So we must **converge**: pick the **most painful, most urgent, and most solvable** problem first. -::: info Business Side Says: Two Most Painful Parts of Content Creation - -**Pain 1: Batch creating images and copy is too much effort** -- Materials scattered everywhere: cloud drives, WeChat records, platform backends... **finding them is a hassle**; -- Need to list many products at once, **no time to carefully craft each one**, can only throw something together; -- Requirements aren't high, **presentable and listable is fine**, doesn't need to be fancy. - -**Pain 2: Good solutions can't be saved for reuse** -- Previously made good titles and layouts, **can't find them next time**; -- Solutions scattered in chat history, old product links; -- When needed, have to **dig through everything, copy-paste and edit for ages**; -- Lacking a tool that can **collect, manage, and directly apply**. +**What is a pain point?** +It is the concrete problem users find **most frustrating, most time-consuming, and most urgent to fix**. 
Not "I think this is useful," but what users complain about repeatedly in real work. ::: -Based on these two pain points, we want to make a simple little tool: **Help operations batch create images and copy, and save good solutions for direct reuse next time**. +From interviews, we found many issues: activity-driven interruptions, multi-store management pressure, and frequent context-switching between launch/pricing/creative/data tasks. -It only does two things (you can have AI help refine, remember to keep deleting features based on business feedback): +If we attempt to "solve all of it," we will end up with a **big but unusable** tool. -::: info Feature 1: Batch Generate E-commerce Product Images and Copy +With AI help, we can classify the issues into three groups: -**What does this do?** -Give the system some product information, and it automatically generates product images and text that can be used for listing on e-commerce platforms (like Douyin, Taobao). +1. **Rhythm problems**: when to launch, when to adjust price +2. **Efficiency problems**: how to manage many stores/products in parallel +3. **Content problems**: how to quickly produce product images and copy + +For this course, the best first target is **Group 3: content creation**.
But "make content quickly" is still broad, so we ask where exactly they get stuck: + +::: info 📋 The business side says content has two biggest pain points + +**Pain Point 1: Batch image/copy production is exhausting** +- Assets are scattered (cloud drives, chat history, backend), and **hard to find** +- Many products need launching at once, so there is **no time for per-item perfection** +- The standard is practical: **good enough to launch**, not perfect design + +**Pain Point 2: Good approaches are not reusable** +- Previously successful titles/layouts are **hard to find next time** +- Useful approaches are scattered in chat records and old product links +- Reuse requires **manual searching + copy/paste + heavy editing** +- Missing a tool to **save, manage, and apply templates directly** + +::: + +Based on these two pain points, we define a simple tool: **help operations batch-generate image and copy drafts, and save good patterns for direct reuse next time**. + +The tool only focuses on two capabilities (and you can keep cutting features with AI support as business feedback arrives): + +::: info Feature 1: Batch generate e-commerce product images and copy + +**What does it do?** +Given product information, the system auto-generates product images and text that can be used on platforms like Douyin and Taobao. **Input** | Type | Content | |------|------| -| Product Information | Name, category, brand, material, size, color, etc. | -| Product Images | White background or simple scene images | -| Reference Images | Screenshots of previously best-selling products or reference links | -| Import Method | Batch import via Excel, or fill in directly on the page | +| Product data | Name, category, brand, material, size, color, target users, etc. 
| +| Product images | White background image or simple scene image | +| Reference assets | Screenshots/links of previously successful products | +| Import method | Excel batch import or direct form input/upload | -**Output (Generated E-commerce Materials)** -- **Product Main Image**: Product display image with text selling points (first image users see when scrolling) -- **Product Title**: Keyword combination that can be searched -- **Selling Point Copy**: 1-2 sentences to attract buyers -- All are **finished products that can be listed with minor edits** +**Output (generated listing assets)** +- **Main product image**: a presentable image draft with core selling points +- **Product title**: keyword-structured title fit for search +- **Selling-point copy**: 1-2 sentences that attract buyers +- All outputs should be **launch-ready or editable with light changes** -**Effect** -- Before: Every product had to start from scratch making images and writing copy -- After: Throw a batch of products into the system, generate drafts, then pick and edit +**Workflow impact** +- Before: start each product's creative work from scratch +- After: submit a batch, get drafts, then filter and fine-tune ::: -::: info Feature 2: Save Good Solutions as Templates +::: info Feature 2: Save effective output as reusable templates **Input** | Type | Content | |------|------| -| Complete Set | Main image + Title + Copy | +| A complete set | Main image + title + selling-point copy | **Output** | Function | Description | |------|------| -| Apply | Use template to auto-generate for new products | -| Edit | Directly modify title, modify copy | -| Manage | Name, tag (like "men's bag template", "promotion title"), easy to find | +| Apply | Reuse a saved template for new product generation | +| Edit | Directly edit title or copy | +| Manage | Name and tag templates (for example "men's bag template", "campaign title"), searchable later | -**Effect** -1. Import new product -2. 
Choose: Let system generate by default, or **use my saved template** -3. System automatically applies template style, outputs new images and copy +**Workflow impact** +1. Import a new product +2. Choose default generation or **apply a saved template** +3. System applies template style and outputs a new image + copy draft ::: --- -**Review what we just did:** +**What did we just do?** -1. **Asked questions first**: Didn't start building directly, but first asked operations "what annoys you most"; -2. **Found pain points**: Discovered their most painful parts are "making images and copy is too much effort" and "good solutions can't be saved"; -3. **Converged scope**: Not making a comprehensive platform, just these two features: "batch generate images and copy + save templates". +1. **Asked first**: not coding immediately, but asking operators what hurts most +2. **Found core pain**: "image/copy creation is too labor-intensive" and "good patterns cannot be reused" +3. **Converged scope**: not building a huge platform; only two core features first -**Why is this important?** +**Why this matters** -Many beginners' misconception about product building is: more features is better. But what users really need is **to solve the most painful problem**. Making a bunch of features that don't work well is worse than making one or two features that really help users. +A beginner trap is "more features = better." In reality, users need you to solve the **single most painful problem** first. Many weak features are less valuable than a few features that truly work. -**Core of Product and Business Thinking:** -- Don't think for yourself "I think users need what" -- Ask users "What do you do every day? Where is it most painful?" 
-- From a pile of problems **converge** to the most painful, most solvable one -- First make a **minimum viable** version, then slowly iterate +**Core product/business thinking** +- Do not decide from your assumptions +- Ask users what they do daily and where it hurts most +- Converge toward the most painful and solvable point +- Build a **minimum usable version** first, then iterate -This is what we need to figure out before writing code. Code is just a tool; **understanding users and finding the right problem** is the first step. +This is what must be clear before coding. Code is just a tool; **understanding users and locking the right problem** is step one.
-## 2. Generate Prototype in 10 Minutes: Let AI IDE Implement "Core Gameplay" +## 2. Build a Prototype in 10 Minutes: Let AI IDE Implement the Core Gameplay -::: info Programming Plan Suggestion -If you feel the current IDE isn't smart enough, or you run out of quota quickly, you can buy a **programming Plan**. Preview in advance by referring to [this article](../../stage-2/backend/2.6-modern-cli/) for programming with Claude. +::: info 💡 Coding plan suggestion +If your current IDE does not feel smart enough, or you run out of quota quickly, consider a dedicated **coding plan**. You can preview [this article](../../stage-2/backend/2.6-modern-cli/) on coding with Claude. ::: -Thinking is good, but don't overthink. Let's control excessive reflection and try making a prototype starting from a single page. +Thinking is good, but avoid overthinking. Let's start from one page and build a prototype first. -### 2.1 First Step: Tell AI What You Want in Plain Language +### 2.1 Step 1: Tell AI What You Want in Plain Language -When starting out, don't pursue perfect prompts. Begin with your most natural expression. Just like describing requirements to a colleague, tell AI in plain language what you want to do, then let AI help you optimize it into a more professional expression. +At the beginning, do not chase a perfect prompt. Start with your natural description. Explain your goal to AI as if talking to a teammate, then let AI help refine it into clearer language. -#### 2.1.1 Start from Verbal Description (Recommended for Beginners) +#### 2.1.1 Start with a spoken-style description (recommended for beginners) -First describe your idea in your own words, even if it's rough, that's fine: +Describe your idea in your own words. Rough is fine: -``` -I want to make a tool that helps e-commerce operations automatically generate product main images and copy. -Operations usually have to manually make images and write copy one by one, which is very troublesome.
-My idea is: they upload product information, the system automatically generates a batch of drafts, -operations pick the good ones and make minor edits before using. +```text +I want to build a tool that helps e-commerce operators automatically generate product main images and copy. +Operators currently make images and copy one by one manually, which is painful. +My idea: they upload product info, and the system generates a batch of drafts. +Operators pick useful ones and make light edits. -First make the simplest version: one page, fill in product info on the left, -display generated results on the right. Can upload images, can fill in text, -after generation show main image preview and copy. +Start with the simplest version: one page. Input area on the left, +generated results on the right. Support image upload and text fields. +After generation, show main image preview and copy. ``` -Next, send this text to AI (like ChatGPT, Claude, etc.) and let it help you expand. AI usually helps you add details you didn't consider, organizes your ideas more clearly, and finally generates a prompt suitable for sending to AI IDE. +Then send this to AI (ChatGPT, Claude, etc.) and ask it to expand and structure it. AI often adds details you might miss and produces a better prompt for your AI IDE. -You can say this to AI: -``` -Help me expand the above idea, organize it into a clear business logic document, -then generate a prompt suitable for sending to AI IDE (like Cursor, Trae), -for generating single-page application prototype code. +You can ask like this: + +```text +Please expand the idea above into a clear business-logic document, +then generate a prompt suitable for an AI IDE (for example Cursor or Trae) +to generate a single-page prototype application. ``` -AI will return a structured requirement and corresponding prompt. You check it yourself, delete unnecessary features, and after confirming it's correct, use it to generate code. 
+AI will return a structured requirement and prompt. Review it, remove unnecessary features, confirm it, then use it for code generation. -The benefit of doing this: verbal descriptions are the most authentic ideas, but might miss some important details. When AI helps you expand, it might ask "do you want to support batch upload?" — questions you didn't think of, helping you further validate. You can choose to keep or delete impractical features based on feedback, and through repeated modifications determine the first version prompt to give AI. +Why this works: your spoken description captures your true intent, but may miss key details. AI expansion can surface questions like "do you need batch upload?" which helps validation. Keep refining by adding/removing features until your first working prompt is solid. -#### 2.1.2 Skip the Expansion Step: Directly Throw Your Organized Business Document to AI +#### 2.1.2 Skip expansion: directly give AI your organized business doc -If you've already organized the business logic document in previous chapters (like a requirements description written in plain language), you can directly use the format below to send to AI IDE, skipping the intermediate step of having AI expand. Suitable when requirements are already clear and you want to start coding directly: +If your business logic document is already prepared (for example from earlier chapters), you can directly feed it to the AI IDE using a structured format. This is suitable when requirements are already clear and you want to move fast. -``` -Help me implement a single-page application based on business logic, for validating core gameplay functionality. +```text +Please implement a single-page app based on the business logic below +to validate the core gameplay. -Business logic reference: -1. 
Help operations batch generate first version of image and text drafts: -- **Input (supports direct upload and batch import of materials):** - - Product basic info: name, category, brand, material, size, color, target audience, etc.; - - Product images: white background / simple scene images; - - Each generation supports uploading additional historical bestseller screenshots or reference links, allowing for reference materials; - - Supports batch import via Excel, or online entry/upload on the page. - - Supports specifying on the page whether to save product materials to the material library for next time use -- **Output (content that can be directly listed or listed with minor edits):** - - Each product gets one "presentable, containing basic selling points" main image draft; - - One "reasonably structured, containing core keywords" title + 1-2 sentences of selling point copy. -- **Expected usage change:** - From starting from scratch for each batch of products to throwing a batch of products into the system, taking the system-generated drafts for filtering and minor adjustments. +Business logic: +1. Help operations batch-generate first-round image+copy drafts: +- **Input (support direct upload and batch import):** + - Product fields: name, category, brand, material, size, color, target users, etc. + - Product image: white background image / simple scene image + - Per generation, support additional uploads of historical bestseller screenshots or reference links + - Support Excel batch import or direct online input/upload + - Support an option to save product assets to an asset library for later use +- **Output (usable for listing with no or light edits):** + - For each product, one "acceptable, basic-selling-point" main-image draft + - One "well-structured, keyword-containing" title + 1-2 selling-point lines +- **Expected workflow change:** + Move from writing every product from scratch to dropping batches into the system and selecting/fine-tuning generated drafts. 
-First make the first feature, the second feature (template library) will be added later. +First implement feature 1. Feature 2 (template library) can be added later. ``` -#### 2.1.3 Programmer's Approach (Advanced): Let AI Help You Write "Prompts for Prompts" +#### 2.1.3 Advanced approach: let AI write a "prompt for your coding agent" -If you want more fine-grained control over the code generation process, you can first have AI (like ChatGPT) generate a prompt specifically for AI IDE based on your requirements: +If you want finer control over code generation, ask AI to produce a coding-agent prompt first: -``` -Based on the idea below, help me write a prompt for a coding Agent, -I need to use this prompt to generate code. +```text +Based on the idea below, write a coding-agent prompt for me. +I will use it to generate code. -[Paste your business logic description here] +[paste your business logic here] Requirements: -1. The prompt should include clear page layout descriptions -2. Clarify data structures and interaction logic -3. Specify tech stack (like React + Tailwind) -4. List core functionality points to implement +1. Include a clear page layout description +2. Define data structures and interaction logic +3. Specify the tech stack (for example React + Tailwind) +4. List core features to implement ``` -Usually AI will generate a structured prompt like below: -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-25-56.png) +AI will usually output a structured prompt similar to this: +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-25-56.png) -You can slightly modify this prompt, then send it to AI IDE to generate code. +You can then make small edits and pass it into your AI IDE. 
-### 2.2 Second Step: Let AI IDE Directly Generate Code +### 2.2 Step 2: Let AI IDE Generate the Code Directly -#### 2.2.1 Preparation: Understand AI IDE Basic Operations +#### 2.2.1 Preparation: understand basic AI IDE operations -If you're not yet familiar with the basic usage of AI IDE (like Cursor, Trae, Windsurf, etc.), it's recommended to first check the [IDE Basics Tutorial](/zh-cn/appendix/2-development-tools/ide-basics/) in the appendix to understand how to: -- Create new projects -- Dialogue with AI Agent -- Understand AI's code generation process +If you are not yet familiar with AI IDEs (Cursor, Trae, Windsurf, etc.), read the appendix first: [IDE Basics](/en/appendix/2-development-tools/ide-basics/). Learn: -#### 2.2.2 Start Generating Code +- how to create a new project +- how to chat with an AI agent +- how to understand AI-generated code flow -At this point you've obtained the initial prompt. Let's use the first prompt style as an example, letting AI help us generate code. First create a window and corresponding folder, open the folder (initialize a new project in your favorite folder location): -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-28-44.png) -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-30-00.png) +#### 2.2.2 Start generating code -In the sidebar, select a model you like (recommend gemini, gpt, glm, kimi, minimax, etc.), enter the prompt obtained in the first step: -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-31-41.png) +Now you already have the initial prompt. Using the first prompt style as an example, let AI help generate the project. Create/open a folder and initialize a new project: -After clicking generate, we'll see a familiar process. AI will plan the project's directory structure, necessary files, and give initial content for each file based on the prompt. 
+![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-28-44.png) +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-30-00.png) -::: warning Special Note: AI Might Stop and Wait for Your Confirmation -During generation, AI Agent often **stops to wait for your input or confirmation**, for example: -- Asking if you want to continue to the next step -- Having you press Enter to confirm an operation -- Asking about your choice for a technical detail +In the sidebar, choose a model you like (for example Gemini, GPT, GLM, Kimi, MiniMax), then paste the prompt from step one: -**If you see AI isn't moving, first check the dialogue interface to see if it's waiting for your reply.** Many beginners think AI is thinking, but it actually stopped waiting for you long ago. Reply actively or press Enter, and AI will continue working. +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-31-41.png) + +After generation starts, AI will plan the folder structure, create needed files, and fill initial code. + +::: warning ⚠️ Important: AI may pause and wait for your confirmation +During generation, the AI agent often **stops and waits for your input**, for example: +- asking whether to continue +- asking you to press Enter to confirm +- asking for a technical choice + +**If AI appears idle, first check the chat panel to see whether it is waiting for you.** +Many beginners think AI is "thinking," but it is actually paused for input. ::: -At this point, don't forget to press Enter to confirm information (otherwise it will be stuck waiting; some AI IDEs don't have this issue): -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-33-03.png) +Do not forget to press Enter for confirmation where needed (some IDEs behave differently): -If you encounter the following scenario, this means a service has already started locally. 
You need to click skip, otherwise it will stay on this interface (if nothing appears after code generation finishes, you need to actively say "help me start this project"): -![](../../../zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-38-11.png) +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-33-03.png) -::: info Scenario Explanation -**Scenario Explanation**: You used `npm create vite@latest` to create a React + TypeScript project (easy-vibe-web). After creation, the computer will automatically "run" this webpage, making it convenient for you to see the effect immediately. +If you encounter the screen below, it usually means the local service has already started. Click skip if needed, otherwise you may stay stuck there. (If generation is done but no preview appears, ask AI directly: "Please start this project.") + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-38-11.png) + +::: info 💡 Scenario explanation +**Scenario**: you used `npm create vite@latest` to initialize a React + TypeScript project (`easy-vibe-web`). After creation, your computer starts a local web service so you can preview immediately. + +**Local service**: a temporary web service running only on your own machine. + +**localhost**: means "this machine itself." + +**Port**: an ID for distinguishing multiple services on the same machine (this project uses port 5174). + +**Link `http://localhost:5174/`**: open this in browser to view the running project. + +**Why 5174?** 5173 may already be occupied, so Vite auto-switched to 5174. This is normal. -**Local Service**: Can be understood as your computer temporarily opening a webpage display window, running only on your own computer, others can't access it. ::: ---- +After confirmation, wait briefly, and you should see the initial result: -**🎉 Congratulations! 
You've completed the first version of your prototype!** +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-14-50-34.png) -Now you can see the running effect in the browser. Next, we'll expand based on this foundation. +The base function appears, but UI is rough. Now talk to AI directly to improve visual quality: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-01-16.png) + +After refinement, you can get a cleaner interface: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-05-16.png) + +Then keep iterating by need, for example: + +- "I do not need batch import now. Remove it." +- "The left-side form has too many fields. Keep only xxxx." + +You can even ask AI to reference established websites by attaching screenshots: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-13-12.png) + +Result example: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-15-18.png) + +### 2.3 What to Do When Errors Happen + +In real practice, errors are inevitable. This is normal and does not mean you failed. You do not need to fully understand every error at once; you only need to give AI the complete observed context. + +Common handling patterns: + +- **Case 1: page or terminal errors** + If the page turns red, goes blank, or the terminal shows many red logs, take a screenshot or copy all error text and send it to AI. + +- **Case 2: function is wrong but no error appears** + For example button does nothing, data does not show, styles break. Describe in plain language: "what happened" + "what I expected." Add screenshot if needed. + +- **Case 3: unsure whether it is a problem** + Ask AI directly: "Please check this feature for obvious issues and suggest whether adjustments are needed." + +#### 2.3.1 Common beginner questions + +- **Q: I do not know where the error is** + - A: find all red text in terminal/console/page, copy all of it, and send to AI. 
+ +- **Q: AI fixed it, but the same error persists** + - A: very common. Send the latest error output again and ask AI to continue fixing on top of previous changes. + +- **Q: Do I need to fully understand the fix immediately** + - A: no. Focus on one or two points each time. Understanding grows gradually like vocabulary learning. + +- **Q: after many attempts, still broken** + - A: try these: + - use IDE version rollback in chat/history to return to a known working state + - switch model or improve prompt specificity + - package "current code + error logs + expected behavior" and ask AI to refactor that part as a whole + +## 3. Expand from Single-Page to Multi-Page Application
-## 3. Multi-page Expansion: From Single Function to Complete Application +Once the core gameplay logic is roughly generated, we can continue building remaining pages. For example, many settings buttons may still do nothing. -The single-page prototype has validated the core gameplay. Now we need to expand it into a complete application. +You can ask AI to inspect against your business requirements and generate missing parts, or directly ask AI to implement unfinished pages one by one until all page interactions work: -### 3.1 Analyze Current Prototype's Shortcomings +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-17-55.png) -Reviewing our single-page prototype, we'll find some obvious issues: +After a short wait, you can see multiple pages and interactive features added on top of the previous base: -1. **No navigation**: Users can only see one page, can't switch between different functions -2. **No data persistence**: Refresh the page and all data is gone -3. **No error handling**: If something goes wrong, users don't know what happened -4. **No user feedback**: No prompts after operations, users don't know if they succeeded +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-23-40.png) +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-23-53.png) -### 3.2 Design Multi-page Structure +At this stage, manually click through the key flows you care about and confirm interactions. If something is not interactive, ask AI to fix it. -Based on the business requirements we analyzed earlier, we need the following pages: - -1. **Homepage/Dashboard**: Display task list, quick actions -2. **Product Management**: Add, edit, delete products -3. **Generation Page**: Core functionality - generate images and copy -4. **Template Library**: Save and manage templates -5. 
**Settings**: User preferences, API configuration - -### 3.3 Let AI IDE Help You Expand - -You can tell AI IDE: - -``` -Now I need to expand this single-page application into a multi-page application. -Please help me: -1. Add routing to support switching between pages -2. Create the following pages: - - Homepage: Display task list - - Product Management: CRUD operations for products - - Generation Page: The existing single-page functionality - - Template Library: Save and manage templates -3. Add navigation bar for page switching -``` - -AI IDE will help you complete these expansions. You just need to confirm and adjust. +## 4. Make the Prototype Feel Real
-## 4. Beautification and Optimization: Make the Prototype More Professional +After multi-page structure is in place, the final step is moving from "runs" to "feels smooth and professional." That means walking the entire user flow end to end and asking AI to fix any broken parts until you can refresh and run full flows from zero as a new user. -A working prototype is just the first step. To make it impressive, we need to beautify and optimize. +Let's revisit the initial requirement: -### 4.1 UI Beautification +```text +1. Help operations batch-generate first-round image+copy drafts: +- **Input (supports direct upload and batch import):** + - Product basic data: name, category, brand, material, size, color, target audience, etc. + - Product image: white background / simple scene image + - Per generation, support extra upload of historical bestseller screenshots or reference links + - Support Excel batch import or online entry/upload + - Support a page option for saving product assets to asset library for future use +- **Output (directly listable or listable with light edits):** + - For each product, one "presentable image draft with basic selling points" + - One "well-structured, keyword-rich title" + 1-2 selling-point lines +- **Expected workflow change:** + Move from creating every batch from scratch to dropping batches into the system, then filtering and fine-tuning generated drafts. -Tell AI IDE: - -``` -Please help me beautify this application: -1. Use a consistent color scheme -2. Add appropriate spacing and alignment -3. Improve button and form styles -4. Add hover effects and transitions -5. Ensure responsive design for different screen sizes +2. 
Turn useful output into a reusable template library: +- **What can be saved?** + - Any output judged "useful" by operations can be saved in one click: + - full combo: main image + title + selling points + - partial save: for example title pattern only or copy snippet only +- **What can you do after saving?** + - **Reuse:** + - apply saved template to a new product batch + - or generate multiple variants on same product for A/B testing + - **Edit:** + - edit title/copy directly + - if image editing is supported, adjust text/stickers on main image + - **Manage:** + - name and tag collections (for example "men bag main image template", "campaign title structure"), and optionally categorize by store +- **How to use on next launch?** + - after importing new products, operations can choose: + - default system generation, or + - "generate using my saved template" + - system applies template structure/style to new product data and outputs new main image + title + selling-point drafts ``` -### 4.2 UX Optimization +If each test requires manual setup from scratch, testing becomes expensive. In practice we often create **test data entry points** to accelerate full-flow testing. You can ask AI: -``` -Please help me improve the user experience: -1. Add loading states for all async operations -2. Add success/error notifications -3. Add confirmation dialogs for destructive actions -4. Improve form validation and error messages -5. Add keyboard shortcuts for common actions +```text +I need to test the full user journey and ensure everything works end to end. +Please generate test-data shortcuts based on the requirement below so I can quickly validate the entire flow: +1. Help operations batch-generate first-round image+copy drafts: +- **Input (supports direct upload and batch import):** + - Product basic data: name, category, brand, material, size, color, target audience, etc. 
+ - Product image: white background / simple scene image + - Per generation, support extra upload of historical bestseller screenshots or reference links + - Support Excel batch import or online entry/upload + - Support a page option for saving product assets to asset library for future use +- **Output (directly listable or listable with light edits):** + - For each product, one "presentable image draft with basic selling points" + - One "well-structured, keyword-rich title" + 1-2 selling-point lines +- **Expected workflow change:** + Move from creating every batch from scratch to dropping batches into the system, then filtering and fine-tuning generated drafts. ``` -### 4.3 Performance Optimization +You can quickly get a usable result (and if one case is not enough, ask AI to generate multiple test cases): -``` -Please help me optimize performance: -1. Lazy load images -2. Implement pagination for long lists -3. Add debouncing for search inputs -4. Optimize bundle size -``` +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-30-30.png) ---- +Click to test: -## Summary +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-31-23.png) -In this chapter, we learned: +At this point, the result may appear immediately without a simulated generation process. If you want realistic delay/feedback, ask AI: -1. **Requirements Analysis**: How to extract real pain points from vague business requirements -2. **Single-page Validation**: Quickly validate core functionality with AI IDE -3. **Multi-page Expansion**: Expand from single function to complete application -4. **Beautification and Optimization**: Make prototypes more professional and user-friendly +"Please simulate a real generation process so after clicking, results appear after a short delay." 
-**Key Takeaways:** +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-50-05.png) -- Don't start coding immediately — understand requirements first -- Start with the simplest version — validate before expanding -- Let AI help you — but you make the final decisions -- Iterate based on feedback — keep improving +After generation flow works, verify template-library behavior. If the "save template" interaction is missing, ask AI: -In the next chapter, we'll learn how to integrate real AI capabilities into our prototype. +"Please ensure requirement 2 works correctly: I can save a generated result as a template, open it, and view generation parameters." -## 5. 📚 Assignment: Recreate Your Own Douyin Ecommerce Workbench +Generation is usually iterative, and screenshots are often needed for correction: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-15-57-14.png) + +Expected final result: + +![](/zh-cn/stage-1/1.2-building-prototype/images/index-2026-01-14-16-12-56.png) + +Besides manual user-flow testing, you can also ask AI to do requirement coverage checks: + +- "Compare this app against my original requirement. Are all core features covered?" +- "Give me a checklist: completed, missing, and weak-experience parts." + +AI will usually return a checklist. Use it to decide whether to continue iterating. After several rounds, you can get a much stronger prototype. + +## 5. 📚 Assignment: Recreate Your Own Douyin E-commerce Workbench

- Use this chapter's prompts and workflow to complete a full end-to-end loop: + Follow this chapter's approach and complete one full loop:

@@ -539,8 +623,14 @@ In the next chapter, we'll learn how to integrate real AI capabilities into our ## Next Step -In the next chapter, we will integrate concrete AI capabilities into this content production workbench (text-to-text, image-to-text, and text-to-image), for example: +In the next chapter, on top of this content-production workbench, we will integrate concrete AI capabilities (text-to-text, image-to-text, text-to-image), for example: -- Automatically generate first-draft copy and multiple title options for a content task +- Auto-generate first-draft copy and multiple title candidates for a given content task - Auto-generate visual drafts from task descriptions (text-to-image) -- Auto-classify and summarize historical content tasks to help plan your next campaign +- Auto-classify and summarize historical tasks to help plan the next campaign theme + + diff --git a/docs/en/stage-1/1.3-integrating-ai-capabilities/index.md b/docs/en/stage-1/1.3-integrating-ai-capabilities/index.md index 3813e62..63b48d4 100644 --- a/docs/en/stage-1/1.3-integrating-ai-capabilities/index.md +++ b/docs/en/stage-1/1.3-integrating-ai-capabilities/index.md @@ -4,7 +4,11 @@ description: 'Integrate real AI capabilities into your existing web prototype: u --- # Beginner Level 4: Injecting AI Capabilities into Your Prototype @@ -54,7 +58,7 @@ Specifically, you need to master the following core elements: 6. **Error Handling**: Knowing how to troubleshoot when problems occur (such as incorrect API Key, too many requests, etc.) ::: info ℹ️ What is an API -For a more in-depth explanation of APIs, see the appendix: [Introduction to APIs](/zh-cn/appendix/4-server-and-backend/api-intro). +For a more in-depth explanation of APIs, see the appendix: [Introduction to APIs](/en/appendix/4-server-and-backend/api-intro). ::: warning 🔐 **API Security Notes** The API Key is your "pass" for requesting AI services — it's a secret string used for authentication and billing. 
@@ -89,7 +93,7 @@ Once you've grasped these concepts, you'll find that whether you're integrating In `1.2 Building Your Prototype`, you already created an interactive prototype. What we need to do next is turn the "AI-like features" in the prototype into real, working capabilities: **when the user clicks a button, the prototype sends a request to an external AI service and displays the returned text.** ::: info ℹ️ Further Reading on Principles -If you want to learn more about the underlying principles, check out the appendix: [Introduction to Large Language Models (LLM)](/zh-cn/appendix/8-artificial-intelligence/llm-principles). +If you want to learn more about the underlying principles, check out the appendix: [Introduction to Large Language Models (LLM)](/en/appendix/8-artificial-intelligence/llm-principles). ::: details Learn More: What is DeepSeek? **Hangzhou DeepSeek Artificial Intelligence Basic Technology Research Co., Ltd.**, operating under the brand name DeepSeek, is a **Chinese artificial intelligence (AI) company that develops large language models (LLMs)**. DeepSeek is headquartered in Hangzhou, Zhejiang, and is owned and funded by the Chinese hedge fund High-Flyer. DeepSeek was founded in July 2023 by Liang Wenfeng, co-founder of High-Flyer, who also serves as CEO of both companies. The company launched its eponymous chatbot and its DeepSeek-R1 model in January 2025. @@ -215,7 +219,7 @@ For more details, refer to the [MiniMax OpenAI Compatible API Documentation](htt # 3. Integrating the Image-to-Text API: Qwen3 VL ::: info ℹ️ Further Reading on Principles -If you want to learn more about the underlying principles, check out the appendix: [Introduction to Vision Language Models (VLM)](/zh-cn/appendix/8-artificial-intelligence/multimodal-models). +If you want to learn more about the underlying principles, check out the appendix: [Introduction to Vision Language Models (VLM)](/en/appendix/8-artificial-intelligence/multimodal-models). 
::: details Learn More: What is Qwen3 VL? @@ -337,25 +341,158 @@ caption = caption_image(image_path) ::: -For image-generation integration, failures can be noisy and hard to debug. Ask AI IDE to display full error details every time (instead of only "generation failed"), for example: +In this scenario, we directly try asking the AI IDE to implement a feature that automatically generates ecommerce selling-point text and keywords from uploaded images, as shown below: + +```text +Based on the image-to-text API below, help us implement a feature that automatically generates ecommerce selling points and keywords from uploaded images. + + +``` + +Final generated result: +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-15-34-36.png) + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-15-35-41.png) + +
+ + + +
+ +# 4. Integrating the Image Generation API: Seedream + +In the previous section, we mainly handled text-related tasks. Next, we will try integrating image generation capabilities to support generating images from text descriptions, or editing images. + +::: info ℹ️ Further Reading on Principles +If you want to learn more about the underlying principles, check out the appendix: [Introduction to Image Generation](/en/appendix/8-artificial-intelligence/image-generation). + +::: details Learn More: What is [Seedream](https://seed.bytedance.com/en/seedream4_5)? + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-15-17.png) + +> You may already know Nano Banana (developed by Google), but you should not miss Seedream. Seedream 4.5 is a next-generation image creation model built by ByteDance. It integrates image generation and image editing capabilities into one unified architecture. This enables it to handle complex multimodal tasks such as knowledge-based generation, complex reasoning, and reference consistency. In addition, its inference speed is much faster than the previous generation and it can generate stunning high-definition images up to 4K resolution. +> +> ![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-15-38.png) +> ![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-15-50.png) + +**Main capabilities:** + +- **Text-to-image**: Generate images from text prompts, supporting many styles (realistic, cartoon, ink, cyberpunk, etc.) 
+- **Style transfer**: Convert an image into a specified artistic style +- **Image variants**: Generate new images in similar styles from reference images +- **Resolution enhancement**: Improve image clarity and detail +- **Image editing**: Edit existing images through natural-language instructions + +**Why choose Seedream?** + +- **Stable domestic network access**: Fast access and low latency in China +- **Excellent output quality**: Reliable performance in ecommerce and asset-generation scenarios +- **Chinese-optimized understanding**: Better understanding of Chinese prompts for domestic users +- **Fast speed**: High generation efficiency and short response times +- **Stable quality**: Can generate high-definition images up to 4K + +**Typical use cases:** + +- Ecommerce: Generate main images, detail-page assets, and promotional posters +- Social media: Generate avatars, stickers, and supporting visuals +- Design: Quickly produce concept images, assets, and backgrounds +- Marketing: Create ad images, campaign banners, and holiday posters + +**How it works with Qwen3 VL:** + +These two APIs can be chained together: first use Qwen3 VL to analyze a reference image and understand scene content, then use Seedream to generate new images based on prompts derived from that analysis. +::: + +Many "AI posters / AI product main images / AI character images" you see on Douyin, Bilibili, or YouTube are fundamentally built with this kind of technology. What you need to do is simple: organize user input into one sentence, request the image API, and display the returned image. The model used here is an image generation / image editing model. + +We will demonstrate step by step how to integrate the Seedream API into your project (with AI IDE assistance). + +After visiting the [homepage](https://www.volcengine.com/experience/ark?launch=seedream), click login. 
+ +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-12-07.png) + +After logging in, find the top-right recharge option. + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-12-22.png) + +Real-name verification is required before recharge. + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-12-30.png) + +After verification succeeds, you can [recharge 1 RMB for testing](https://console.volcengine.com/finance/fund/recharge). + +Return to the [initial page](https://www.volcengine.com/experience/ark?launch=seedream) and click API Access. + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-12-43.png) + +First, create an API key, then click the model selection option. + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-13-01.png) + +This takes you to step 2. Here, confirm the service model is Seedream 4.5 and copy the provided call example. (The screenshot was taken earlier, so the model version shown there is still 4.0.) + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-13-11.png) + +Once the API Key and call example are ready, you can paste them directly into the AI IDE and ask it to generate a frontend interactive demo or integrate the capability into your current prototype. Notice that in the screenshot you can choose text-to-image or multi-image-to-single-image mode. Select the reference code according to your specific requirement. + +::: warning ⚠️ Important note +The default example here is relatively complex. Remember to disable **"Add watermark"** and **"Streaming response"** to ensure no watermark is generated and requests do not fail. +::: + +Since we later use reference-image generation mode, we first use the multi-image-to-single-image feature. 
The reference code is copied as follows: + +```text +curl -X POST https://ark.cn-beijing.volces.com/api/v3/images/generations \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer xxxxxxx" \ + -d '{ + "model": "doubao-seedream-4-5-251128", + "prompt": "将图1的服装换为图2的服装", + "image": ["https://ark-project.tos-cn-beijing.volces.com/doc_image/seedream4_imagesToimage_1.png", "https://ark-project.tos-cn-beijing.volces.com/doc_image/seedream4_imagesToimage_2.png"], + "sequential_image_generation": "disabled", + "response_format": "url", + "size": "2K", + "stream": false, + "watermark": false +}' +``` + +With the image reference code prepared, we ask the AI IDE to support common image-task features in ecommerce: + +```text +Please help me implement common ecommerce features in this project based on the API below (for example, poster generation, Douyin ecommerce hero-image generation, etc.) + + +``` + +Implementation result: + +![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-21-13.png) + +It is worth noting that image generation often encounters odd failures. It is recommended that AI IDE always shows full error details so you can copy and debug effectively. For example, you can say: ```text Don't only show "image generation failed." Please always display the full failure reason, such as model mismatch, request errors, or timeout details. ``` -If changes are not reflected in the page after multiple attempts, ask AI IDE to restart the project. +Sometimes updates after edits may still not be reflected on the page. If you keep seeing errors after multiple rounds, you can also try telling the AI IDE directly: please restart this project. -In ecommerce scenarios, you may want uploaded clothes to be automatically "worn" by virtual models, or automatically generate attractive product posters and promotional visuals.
Here is an example prompt for generating an ecommerce poster: +In ecommerce scenarios, we may want clothes uploaded by users to be automatically worn by a model, or automatically generate attractive product sales images and posters. Here we try a prompt that asks for an ecommerce poster: ![](../../../zh-cn/stage-1/1.3-integrating-ai-capabilities/images/index-2026-01-20-23-14-10.png) You can combine text-to-image and image-to-image APIs based on your own business scenario ideas. -## 4. More Image Service Options +## More Image Service Options Below are additional choices. It's recommended to first run through a working Qwen image generation result, then replace with another service based on quality and cost. -### 4.1 Recraft Integration +### Recraft Integration If your prototype is more design-production oriented (for example brand-style illustrations, marketing posters, vector-style assets), Recraft is often a better fit. The integration method is exactly the same: **get a Key + find official examples + let AI IDE wire them into your page/button**. @@ -380,7 +517,7 @@ Then follow the same process and use official documentation examples: ::: -### 4.2 Qwen Image / Qwen Image Edit Integration +### Qwen Image / Qwen Image Edit Integration If you want a relatively simple way to integrate image generation, Qwen Image is also a good choice. The approach is unchanged: treat it as an image API and connect it to your prototype button.
@@ -653,3 +790,9 @@ In the next chapter, we will connect these separate AI capabilities into one com - Connect content planning, product listing, and data analysis into one end-to-end workflow - Embed this chapter's AI capabilities (LLM copywriting, text-to-image, image editing) into concrete business nodes - Build a truly usable "Ecommerce AI Workbench" instead of isolated demos + + diff --git a/docs/en/stage-1/appendix-consumer-scenarios/index.md b/docs/en/stage-1/appendix-consumer-scenarios/index.md index c14b2cd..24080bf 100644 --- a/docs/en/stage-1/appendix-consumer-scenarios/index.md +++ b/docs/en/stage-1/appendix-consumer-scenarios/index.md @@ -1,14 +1,1334 @@ --- title: 'C-End Scenario Inspiration Direction Reference' -description: 'This page provides an extended entry for C-end consumer scenario inspiration and links to the maintained English chapter for detailed industry-ready scenario references.' +description: 'This document summarizes creative application directions of large language models (LLMs) in C-End consumer scenarios, covering inspiration across lifestyle, emotional companionship, entertainment, personal growth, social interaction, and more, providing creative references for AI application developers targeting everyday users.' --- + + # C-End Scenario Inspiration Direction Reference -This chapter name exists in the Chinese track as an extended version of consumer-scenario inspiration content. +## Chapter Overview -For the maintained English version with complete interactive selection and 16 scenario directions, use: + -- [C-End Consumer Scenario Inspiration Reference](../appendix-c-consumer-scenarios/) +This document summarizes creative application directions of large language models (LLMs) in C-End consumer scenarios. Unlike B-End products that focus on efficiency and pain points, C-End products put stronger emphasis on building feelings, psychological cues, and atmosphere, so users can gain emotional resonance and delightful experiences during use.
-If you prefer to keep this page in your bookmarks, treat it as the English entry point for the same topic. + + +## Quick Atmosphere Selection + + +
Find scenario inspiration that resonates with you
+
+ Choose your desired atmosphere and current feeling. The system will recommend related scenario directions. Click a tag to jump to the corresponding section.
+ + + + +
{{ item.label }}
+
{{ item.desc }}
+
+
+
+ + + +
{{ item.label }}
+
{{ item.desc }}
+
+
+
+
+ +
+
+ Recommended {{ currentSelection.vibe }} × {{ currentSelection.feeling }} scenarios for you: +
+
+ + {{ topic.title }} + +
+ + Choose Again + +
+
+ +## Scenario Direction Quick Overview + + + + +
{{ scenario.name }}
+
{{ topicPool[scenario.key]?.length || 0 }} inspiration directions
+
+
+
+ + + +
{{ scenario.name }}
+
{{ topicPool[scenario.key]?.length || 0 }} inspiration directions
+
+
+
+ + + +
{{ scenario.name }}
+
{{ topicPool[scenario.key]?.length || 0 }} inspiration directions
+
+
+
+ +--- + +## 1. Lifestyle + +> 💡 **Core Concept**: Turn ordinary daily life into meaningful rituals, and create beauty in details + +### 1.1 Morning Ritual Awakening Assistant + +**Scenario Description**: +Every morning, generate a personalized ritual based on weather, schedule, and mood. It might be a gentle song, a cup of tea that matches today’s mood, a 5-minute stretch, or a perfectly timed encouraging sentence. + +**Key Atmosphere-Building Points**: +- Gradual awakening instead of abrupt urging +- Multi-sensory visual and auditory experience +- Make the start of every day feel worth looking forward to + +**Psychological Cue**: +> "Today will be a beautiful day, because you deserve to be treated gently." + +### 1.2 Solo Living Atmosphere Creator + +**Scenario Description**: +Design home atmosphere plans for people living alone by intelligently combining lighting, music, scent, and more, so even a one-person home feels warm and grounding. + +**Key Atmosphere-Building Points**: +- Auto-adjust atmosphere by time and mood +- Seasonal theme changes +- Create a feeling of "being accompanied" + +### 1.3 Weekend Stay-Home Healing Plan Generator + +**Scenario Description**: +On Friday night, generate a perfect weekend-at-home plan based on current mood and weather. Include movie picks, snack pairings, home setup suggestions, and even corners ideal for zoning out. + +**Key Atmosphere-Building Points**: +- Healing-oriented visual presentation +- Low-pressure choice experience +- Make staying home feel like a treat + +### 1.4 Bedtime Soul-Soothing Radio + +**Scenario Description**: +Before sleep every night, generate personalized soothing content: gentle stories, meditation guidance, white noise, or simple good-night greetings to accompany users into sleep. 
+ +**Key Atmosphere-Building Points**: +- Soft vocal tone and rhythm +- Gradual volume fade design +- Build safety and relaxation + +### 1.5 Life Aesthetics Inspiration Hunter + +**Scenario Description**: +Help users discover beauty from daily details and provide life-aesthetics suggestions and ritual guides, such as making coffee more elegant or turning a desk into a flow-state space. + +**Key Atmosphere-Building Points**: +- Find the extraordinary in ordinary moments +- Cultivate aesthetic perception +- Let life itself become art + +--- + +## 2. Emotional Companionship + +> 💡 **Core Concept**: Unconditional acceptance and companionship as a gentle emotional container + +### 2.1 Late-Night Tree-Hole Listener + +**Scenario Description**: +A 24/7 emotional outlet that receives all worries without judgment. Whether joy, sadness, anger, or confusion, there is always a place where emotions can land. + +**Key Atmosphere-Building Points**: +- Absolute sense of safety and privacy protection +- No interruption, no preaching, just listening +- Gentle responses and empathy + +**Psychological Cue**: +> "All your emotions are valid. I am here with you." + +### 2.2 Heartbreak Healing Companion + +**Scenario Description**: +Provide gentle companionship, healing suggestions, and emotional outlets during heartbreak lows. It does not rush users to "move on," but allows them to heal at their own pace. + +**Key Atmosphere-Building Points**: +- Allow sadness to exist +- Gradual emotional guidance +- Rebuild self-worth + +### 2.3 Anxiety Relief Breathing Coach + +**Scenario Description**: +Sense user anxiety and guide breathing exercises and mindfulness meditation. In tense moments, provide a reliable anchor. 
+ +**Key Atmosphere-Building Points**: +- Real-time emotional awareness +- Simple and effective relief methods +- Create calm and a sense of control + +### 2.4 Self-Confidence Rebuilding Mentor + +**Scenario Description**: +Help users rebuild self-identity and self-worth through positive dialogue and psychological cues. Record each small step and witness transformation. + +**Key Atmosphere-Building Points**: +- Discover overlooked strengths +- Celebrate every small win +- Build positive self-talk + +### 2.5 Intelligent Emotional Journal Interpreter + +**Scenario Description**: +Analyze users' emotional journals, discover patterns, and provide warm insights and suggestions so users understand themselves better and coexist with emotions peacefully. + +**Key Atmosphere-Building Points**: +- Visualized emotional trajectory +- Warm insights instead of cold analysis +- Actionable suggestions + +--- + +## 3. Entertainment & Leisure + +> 💡 **Core Concept**: Create immersive experiences so entertainment becomes a place where the mind can rest + +### 3.1 Immersive Script-Murder DM + +**Scenario Description**: +Play the role of script-murder host, build suspense, and drive the story. Adjust rhythm in real time based on player responses to create unforgettable gameplay. + +**Key Atmosphere-Building Points**: +- A gripping opening +- Well-paced suspense setting +- Immersive role-play + +### 3.2 Open-World Soul NPC + +**Scenario Description**: +Create lifelike NPCs that remember player stories and form genuine emotional bonds. They are not just quest givers but friends in the game world. + +**Key Atmosphere-Building Points**: +- Persistent memory and continuity +- Personalized interaction +- Authentic emotional connection + +### 3.3 Personalized Podcast Content Generator + +**Scenario Description**: +Generate personalized podcasts based on user interests, sounding as natural as chatting with friends. Content can be knowledge sharing, storytelling, or simple companionship. 
+ +**Key Atmosphere-Building Points**: +- Relaxed and natural conversational feel +- Content aligned with personal taste +- Companionship available anytime + +### 3.4 Virtual Concert Atmosphere Crew + +**Scenario Description**: +Build live-concert atmosphere for online concerts with real-time interaction, cheering, and atmosphere rendering. Even alone at home, users can feel the excitement of a concert. + +**Key Atmosphere-Building Points**: +- Visual and auditory immersion +- Real-time interaction and resonance +- Create collective participation + +### 3.5 Interactive Novel Co-Creation Partner + +**Scenario Description**: +Co-create stories with readers where each choice affects world direction. Readers are no longer passive consumers but co-creators. + +**Key Atmosphere-Building Points**: +- Unlimited possibilities +- Real choice ownership +- Build stories that truly belong to the user + +--- + +## 4. Personal Growth + +> 💡 **Core Concept**: Growth is not ascetic suffering, but an interesting journey of self-discovery + +### 4.1 Personal Growth Witness + +**Scenario Description**: +Record user growth trajectories and provide encouragement and reflection at key milestones. Make growth visible and effort remembered. + +**Key Atmosphere-Building Points**: +- Visualized growth path +- Milestone commemoration +- Warm reflection and forward-looking encouragement + +**Psychological Cue**: +> "You have already come this far, even if you did not notice." + +### 4.2 Gamified Habit-Building Coach + +**Scenario Description**: +Turn boring habit formation into fun adventure gameplay. Every small habit kept becomes an achievement in the game. + +**Key Atmosphere-Building Points**: +- Gamified motivation mechanics +- Instant positive feedback +- Make consistency feel fun + +### 4.3 Skill-Learning Buddy Matcher + +**Scenario Description**: +Match users with like-minded learning partners for mutual accountability and progress sharing. 
Learning no longer feels like a lonely solo trip. + +**Key Atmosphere-Building Points**: +- Find peers on the same wavelength +- Build a mutually motivating atmosphere +- Share the joy of growing together + +### 4.4 Daily Little Happiness Discoverer + +**Scenario Description**: +Help users discover small beautiful moments in life and cultivate gratitude and positivity. Encourage recording one gratitude-worthy moment every day. + +**Key Atmosphere-Building Points**: +- Notice overlooked goodness +- Build gratitude habits +- Accumulate positive energy + +### 4.5 Life Simulation Explorer + +**Scenario Description**: +Simulate different life choices and experience alternative possibilities in parallel worlds. Help users explore possibilities and make more authentic decisions. + +**Key Atmosphere-Building Points**: +- Safe choice exploration +- Discover unknown sides of self +- No right or wrong, only experience + +--- + +## 5. Social Interaction + +> 💡 **Core Concept**: Make socializing feel natural and easy, and help users find their comfortable way of connecting + +### 5.1 Icebreaker Topic Generator + +**Scenario Description**: +Provide interesting topics for social settings to dissolve awkwardness and bring people closer. Whether it is a stranger meetup or old friends reconnecting, there is always a suitable opening. + +**Key Atmosphere-Building Points**: +- Light and interesting topics +- Suitable across different settings +- Natural conversation openings + +### 5.2 Moments Caption Atmosphere Stylist + +**Scenario Description**: +Generate tasteful social captions based on photos and mood. Make sharing a form of expression and records warmer. + +**Key Atmosphere-Building Points**: +- Align with personal style +- Tasteful but not forced +- Authentic emotional expression + +### 5.3 Date Atmosphere Planner + +**Scenario Description**: +Design complete date atmosphere plans from location to topics to surprises. Make every date a memorable experience. 
+ +**Key Atmosphere-Building Points**: +- End-to-end experience design +- Surprises at the right level +- Build romantic atmosphere + +### 5.4 Remote Party Atmosphere Lead + +**Scenario Description**: +Liven up online gatherings by organizing games and guiding interaction. Make remote parties feel as lively as face-to-face gatherings. + +**Key Atmosphere-Building Points**: +- Fun games and activities +- Guided natural interaction +- Create collective participation + +### 5.5 Social Energy Management Assistant + +**Scenario Description**: +Help introverts manage social energy and find a comfortable social rhythm. Users can enjoy social experiences without forcing themselves. + +**Key Atmosphere-Building Points**: +- Respect personal boundaries +- Find what works for each individual +- No personality change required + +--- + +## 6. Creative Expression + +> 💡 **Core Concept**: Everyone has creativity; it just needs to be awakened + +### 6.1 Creative Block First-Aid Kit + +**Scenario Description**: +Offer unexpected sparks during creative bottlenecks. Not standard answers, but keys that open new ways of thinking. + +**Key Atmosphere-Building Points**: +- Break fixed thinking patterns +- Unexpected idea connections +- Activate internal creativity + +### 6.2 Personal Style Exploration Guide + +**Scenario Description**: +Help users discover unique personal style from outfit choices to self-expression. Let everyone find their own voice. + +**Key Atmosphere-Building Points**: +- Discover what is uniquely yours +- Encourage experimentation +- Build a personal brand + +### 6.3 Journal & Diary Aesthetics Advisor + +**Scenario Description**: +Provide aesthetic suggestions for journal layout, color, and content ideas. Turn recording into art and give memories better texture.
+ +**Key Atmosphere-Building Points**: +- Visual aesthetic guidance +- Content creativity inspiration +- Personalized style + +### 6.4 Photography Composition Atmosphere Guide + +**Scenario Description**: +Provide photography and editing suggestions based on scene and desired feeling. Make each photo deliver intended emotions. + +**Key Atmosphere-Building Points**: +- Atmosphere over pure technique +- Visual expression of emotion +- Train an eye for beauty + +### 6.5 Music Mood Matcher + +**Scenario Description**: +Recommend perfect music combinations based on current mood and context. Music is emotional resonance and an atmosphere builder. + +**Key Atmosphere-Building Points**: +- Precise emotion matching +- Scenario-based recommendation +- Healing power of music + +--- + +## 7. Travel Exploration + +> 💡 **Core Concept**: Travel is not only seeing scenery, but feeling different ways of life + +### 7.1 City Walk Exploration Guide + +**Scenario Description**: +Explore cities like a local and discover hidden gems. It is not only about check-in spots, but about sensing the city’s true pulse. + +**Key Atmosphere-Building Points**: +- Local perspective +- Unexpected discoveries and surprises +- Dive into the city's soul + +### 7.2 Travel Mood Journal Generator + +**Scenario Description**: +Transform travel photos and moods into elegant travel journals and memories. Let every trip leave a unique mark. + +**Key Atmosphere-Building Points**: +- Emotional recording +- Beautiful writing +- Lasting memories + +### 7.3 Solo Travel Companion Assistant + +**Scenario Description**: +Provide companionship, suggestions, and safety support for solo travelers. Solo trips can still feel cared for and accompanied. 
+ +**Key Atmosphere-Building Points**: +- Build a sense of safety +- Offer enjoyable companionship +- Solo but not lonely + +### 7.4 Destination Atmosphere Preview + +**Scenario Description**: +Immersively preview destination atmosphere before departure to get in the mood early. Let anticipation become part of the journey. + +**Key Atmosphere-Building Points**: +- Immersive preview +- Spark anticipation and imagination +- Enter travel mode in advance + +### 7.5 Travel Photography Atmosphere Coach + +**Scenario Description**: +Guide users to capture story-rich travel photos based on scene and light. It is not just recording, but storytelling. + +**Key Atmosphere-Building Points**: +- Story-first composition +- Emotion capture +- Unique perspective + +--- + +## 8. Physical & Mental Health + +> 💡 **Core Concept**: Health is not an endpoint, but a gentle practice of self-care + +### 8.1 Exercise Motivation Awakener + +**Scenario Description**: +When users do not feel like moving, provide exactly the right encouragement. It is not forcing action, but awakening internal motivation. + +**Key Atmosphere-Building Points**: +- Understand resistance to movement +- Step-by-step guidance +- Celebrate every small action + +### 8.2 Healthy Diet Inspiration Kitchen + +**Scenario Description**: +Generate healing healthy recipes based on mood and available ingredients. Healthy eating can also be delicious enjoyment. + +**Key Atmosphere-Building Points**: +- Appealing food experiences +- Simple cooking methods +- Healthy balance + +### 8.3 Sleep Quality Atmosphere Optimizer + +**Scenario Description**: +Build high-quality sleep atmosphere from environment to mindset. Make sleep the most anticipated part of the day. + +**Key Atmosphere-Building Points**: +- Environmental optimization +- Psychological relaxation +- Ritualized design + +### 8.4 Body Awareness Guide + +**Scenario Description**: +Guide users to notice body signals and build mind-body connection. 
Pause in busy life and listen to the body. + +**Key Atmosphere-Building Points**: +- Gentle guidance +- Body awareness +- Mind-body integration + +### 8.5 Self-Care Reminder Assistant + +**Scenario Description**: +Remind users to pause and care for themselves in the middle of busy days. A small reminder can change the state of an entire day. + +**Key Atmosphere-Building Points**: +- Timely reminders +- Simple actions +- Gentle care + +--- + +## 9. Knowledge Exploration + +> 💡 **Core Concept**: Learning is an endless adventure, and curiosity is the best teacher + +### 9.1 Gamified Knowledge Exploration Guide + +**Scenario Description**: +Turn boring learning into an engaging exploration adventure. Every knowledge point becomes a treasure waiting to be discovered. + +**Key Atmosphere-Building Points**: +- Gamified experience +- Joy of exploration +- Sense of achievement + +### 9.2 Language Learning Scenario Partner + +**Scenario Description**: +Play different roles so users naturally acquire language through contextual dialogue. Not rote memorization, but learning through use. + +**Key Atmosphere-Building Points**: +- Realistic contexts +- Interesting role-play +- Natural acquisition + +### 9.3 Curiosity Satisfaction Assistant + +**Scenario Description**: +Answer all kinds of imaginative questions and satisfy curiosity about the world. There are no foolish questions, only answers waiting to be found. + +**Key Atmosphere-Building Points**: +- Encourage asking +- Interesting explanations +- Spark even more curiosity + +### 9.4 Reading Notes Inspiration Booster + +**Scenario Description**: +Help users organize reading insights and discover new thinking angles. Turn reading into dialogue with the author and with oneself. + +**Key Atmosphere-Building Points**: +- Deep thinking +- Personal perspective +- Knowledge connection + +### 9.5 Knowledge-Sharing Atmosphere Builder + +**Scenario Description**: +Transform what users learned into interesting content for sharing. 
Sharing is not only output, but also a process of deepening understanding. + +**Key Atmosphere-Building Points**: +- Engaging expression +- Joy of sharing +- Knowledge diffusion + +--- + +## 10. Relationship Management + +> 💡 **Core Concept**: Good relationships require care, and care does not need to be complicated + +### 10.1 Intimate Communication Coach + +**Scenario Description**: +Help users express difficult emotions and improve intimate relationships. Sometimes what is needed is simply the right way to say what is in the heart. + +**Key Atmosphere-Building Points**: +- Safe space for expression +- Gentle suggestions +- Improved mutual understanding + +### 10.2 Family Care Reminder Assistant + +**Scenario Description**: +Remind users to care for family and provide warm interaction suggestions. In busy life, do not forget what matters most. + +**Key Atmosphere-Building Points**: +- Timely reminders +- Simple care actions +- Warm connection + +### 10.3 Friendship Maintenance Atmosphere Coach + +**Scenario Description**: +Help users maintain long-distance friendships and create shared topics. Distance is not the problem; intention is the key. + +**Key Atmosphere-Building Points**: +- Create opportunities to connect +- Shared conversation themes +- Sustained friendship + +### 10.4 Confession & Surprise Planner + +**Scenario Description**: +Plan unforgettable surprises and romantic moments for important people. Make special days even more special. + +**Key Atmosphere-Building Points**: +- Personalized design +- Romantic surprise moments +- Memorable experiences + +### 10.5 Conflict-Deescalation Atmosphere Guide + +**Scenario Description**: +Provide atmosphere-softening suggestions and wording when relationships become tense. Help users find a bridge toward reconciliation. + +**Key Atmosphere-Building Points**: +- Understand both sides +- Gentle guidance +- Relationship repair + +--- + +## 11. 
Pet Companionship + +> 💡 **Core Concept**: Pets are family, and their companionship deserves to be recorded and cherished + +### 11.1 Anthropomorphic Pet Diary + +**Scenario Description**: +Generate diary entries from a pet perspective to record warm daily moments with owners. Imagine how pets would describe their time with you. + +**Key Atmosphere-Building Points**: +- Adorable perspective +- Warm daily moments +- Emotional connection + +### 11.2 Pet Behavior Interpreter + +**Scenario Description**: +Interpret pet behavior language to deepen pet-owner connection and better understand needs and emotions. + +**Key Atmosphere-Building Points**: +- Professional interpretation +- Better understanding +- Better care + +### 11.3 Pet Bonding-Time Planner + +**Scenario Description**: +Design creative activities for interacting with pets and strengthening bonds. Make companionship time more meaningful and fun. + +**Key Atmosphere-Building Points**: +- Creative activities +- Fun interaction +- Beautiful memories + +### 11.4 Pet Memory Story Generator + +**Scenario Description**: +Turn pet photos and memories into warm stories. Record precious moments with furry family members. + +**Key Atmosphere-Building Points**: +- Warm narrative +- Precious memory preservation +- Enduring love + +### 11.5 New Pet Parent Comfort Guide + +**Scenario Description**: +Provide warm companionship and practical guidance for new pet owners, making the pet-raising journey confident and joyful. + +**Key Atmosphere-Building Points**: +- Comprehensive guidance +- Warm encouragement +- Reassuring companionship + +--- + +## 12. Financial Health + +> 💡 **Core Concept**: Financial freedom is not the only goal; financial health is + +### 12.1 Spending Emotion Awareness Assistant + +**Scenario Description**: +Help users notice emotions behind impulse spending and build healthy spending views. Understanding why you want to buy can be more important than whether you buy. 
+ +**Key Atmosphere-Building Points**: +- Gentle awareness +- Understanding without judgment +- Healthier habits + +### 12.2 Savings Goal Visualization Motivator + +**Scenario Description**: +Turn savings goals into visible dream-progress journeys. Make saving part of realizing dreams. + +**Key Atmosphere-Building Points**: +- Visualized progress +- Dream-linked motivation +- Sense of achievement + +### 12.3 Easy & Fun Finance Learning + +**Scenario Description**: +Learn financial knowledge in a light and enjoyable way. Finance should not be dry; it can be an engaging exploration. + +**Key Atmosphere-Building Points**: +- Relaxed communication style +- Interesting real examples +- Practical knowledge + +### 12.4 Financial Anxiety Soothing Coach + +**Scenario Description**: +Provide emotional support and practical suggestions under financial stress. Anxiety does not solve problems, but calm often does. + +**Key Atmosphere-Building Points**: +- Emotional soothing +- Practical guidance +- A sense of hope + +### 12.5 Small-Amount Investment Experience Game + +**Scenario Description**: +Use gamification to experience investing and lower the beginner barrier. Learn investing inside a safer environment. + +**Key Atmosphere-Building Points**: +- Game-like experience +- Safe trial-and-error +- Joyful learning + +--- + +## 13. Career Development + +> 💡 **Core Concept**: A career is not a fixed track, but an open field for exploration + +### 13.1 Career-Confusion Companion + +**Scenario Description**: +Offer listening, exploration, and direction suggestions during career confusion. Feeling lost is normal; facing it alone is not required. + +**Key Atmosphere-Building Points**: +- Non-judgmental listening +- Possibility exploration +- Warm companionship + +### 13.2 Work Achievement Awakener + +**Scenario Description**: +Help users rediscover value and meaning in work and reignite passion. Sometimes it is simply about seeing from a new angle. 
+ +**Key Atmosphere-Building Points**: +- Reveal hidden value +- Reignite passion +- Restore sense of achievement + +### 13.3 Workplace Social Atmosphere Assistant + +**Scenario Description**: +Provide relaxed workplace social topics and interaction suggestions so professional socializing feels less awkward and more natural. + +**Key Atmosphere-Building Points**: +- Easy conversation starters +- Natural interaction +- Comfortable relationships + +### 13.4 Side-Hustle Inspiration Generator + +**Scenario Description**: +Generate side-hustle ideas based on personal interests and skills. Explore possibilities beyond regular work. + +**Key Atmosphere-Building Points**: +- Interest discovery +- Possibility expansion +- Action encouragement + +### 13.5 Pre-Interview Confidence Station + +**Scenario Description**: +Provide confidence-building and mental preparation support before interviews so users can meet opportunities in their best state. + +**Key Atmosphere-Building Points**: +- Confidence building +- Solid preparation +- Best-state readiness + +--- + +## 14. Home Space + +> 💡 **Core Concept**: Home is not only where we live, but where the mind can rest + +### 14.1 Home Atmosphere Designer + +**Scenario Description**: +Design home atmosphere plans by mood and season so home can change with emotional and seasonal rhythms. + +**Key Atmosphere-Building Points**: +- Atmosphere-focused design +- Seasonal variation +- Mood matching + +### 14.2 Four-Season Home Refresh Guide + +**Scenario Description**: +Update home layout and decor with the seasons to keep freshness. Let home stay full of vitality and surprise. + +**Key Atmosphere-Building Points**: +- Seasonal themes +- Fresh feeling +- Everyday ritual quality + +### 14.3 Small-Space Magic + +**Scenario Description**: +Help small spaces still feel comfortable and warm. Space size is not the key; feeling is. 
+ +**Key Atmosphere-Building Points**: +- Space optimization +- Cozy atmosphere +- Comfortable living + +### 14.4 At-Home Ritual Creator + +**Scenario Description**: +Create rituals for daily home activities. Turn ordinary chores into meaningful moments. + +**Key Atmosphere-Building Points**: +- Ritual design +- Meaning assignment +- Better life quality + +### 14.5 Decluttering Psychological Companion + +**Scenario Description**: +Provide emotional support and decision suggestions while organizing belongings. Decluttering is not only removing objects, but also organizing the mind. + +**Key Atmosphere-Building Points**: +- Emotional support +- Decision assistance +- Inner clarity + +--- + +## 15. Food & Cooking + +> 💡 **Core Concept**: Food is a language of love, and cooking is a way to express it + +### 15.1 One-Person Healing Cuisine + +**Scenario Description**: +Design simple healing meal plans for solo living. Even alone, users deserve to eat well and care for themselves. + +**Key Atmosphere-Building Points**: +- Simple cooking process +- Comforting taste +- Self-love expression + +### 15.2 Festive Table Atmosphere Designer + +**Scenario Description**: +Design ritual-rich table setups for special days so every meal can become a memorable moment. + +**Key Atmosphere-Building Points**: +- Ritual-oriented design +- Visual enjoyment +- Beautiful memories + +### 15.3 Cooking Mood Matcher + +**Scenario Description**: +Recommend suitable food and cooking methods by current mood. Sometimes what users need is exactly that one right flavor. + +**Key Atmosphere-Building Points**: +- Mood matching +- Food as healing +- Emotional connection + +### 15.4 Kitchen Beginner Confidence Builder + +**Scenario Description**: +Provide warm encouragement and simple recipes for beginner cooks. Everyone can become their own chef. 
+ +**Key Atmosphere-Building Points**: +- Easy starting path +- Warm encouragement +- Confidence building + +### 15.5 Food Photography Atmosphere Guide + +**Scenario Description**: +Help everyday dishes look atmosphere-rich and tempting in photos. Recording food is also recording life’s beauty. + +**Key Atmosphere-Building Points**: +- Atmosphere creation +- Visual enjoyment +- Beautiful life documentation + +--- + +## 16. Style & Outfit + +> 💡 **Core Concept**: Outfit is self-expression, and style is the external form of what is inside + +### 16.1 Today's Outfit Mood Board + +**Scenario Description**: +Generate outfit inspiration based on weather, occasion, and mood so each day’s look expresses current emotions. + +**Key Atmosphere-Building Points**: +- Mood expression +- Occasion alignment +- Confidence building + +### 16.2 Capsule Wardrobe Stylist + +**Scenario Description**: +Create limitless outfit combinations from a limited set of items. Less can be more, and simplicity can still look highly styled. + +**Key Atmosphere-Building Points**: +- Minimalist concept +- Creative combinations +- Sustainable fashion + +### 16.3 Personal Style Exploration Journey + +**Scenario Description**: +Help users discover and build unique personal style. Dressing is not only wearing clothes, but showing one’s attitude. + +**Key Atmosphere-Building Points**: +- Self exploration +- Style formation +- Confident expression + +### 16.4 Old-Clothes New-Wear Creator + +**Scenario Description**: +Provide new styling inspiration for old clothing. Revitalize old pieces and make fashion more sustainable. + +**Key Atmosphere-Building Points**: +- Creative restyling +- Eco-conscious mindset +- Fresh feeling + +### 16.5 Special-Occasion Styling Advisor + +**Scenario Description**: +Design confidence-boosting looks for important occasions so every key moment can be presented at its best. 
+ +**Key Atmosphere-Building Points**: +- Occasion matching +- Confidence enhancement +- Polished presentation + +--- + +## Core Principles for Designing C-End Products + +### 1. From "Function" to "Feeling" + +B-End products care about "what problem this feature solves." C-End products care about "what feeling this feature creates." + +| B-End Thinking | C-End Thinking | +|---------|---------| +| Improve efficiency | Save time for things users love | +| Reduce costs | Make every dollar feel worthwhile | +| Solve pain points | Create delightful experiences | +| Full feature set | Feeling done right | + +### 2. Three Layers of Atmosphere Building + +**Sensory Layer**: design for sight, sound, and touch-like interaction feel +- Warm colors +- Soothing sounds +- Smooth motion + +**Emotional Layer**: emotional resonance and guidance +- Understand user moods +- Provide emotional support +- Create positive emotions + +**Meaning Layer**: value identity and belonging +- Make users feel understood +- Build a sense of belonging +- Give action a sense of meaning + +### 3. The Power of Psychological Cues + +Copy and design in C-End products always carry psychological cues: + +- **Positive cues**: "You are already doing great", "Take your time, it is okay" +- **Belonging cues**: "Many people feel the same", "You are not alone" +- **Growth cues**: "Every attempt is progress", "You are getting better" + +### 4. Help Users Become Better Versions of Themselves + +The best C-End products do not change users by force; they help users become who they want to be. + +- Not "you should...", but "you can..." +- Not "you must...", but "if you want..." +- Not "you are not enough yet...", but "you are already..." + +--- + +> 🌟 **Remember**: C-End users do not buy functions, they buy feelings; not tools, but companionship; not service, but understanding. 
diff --git a/docs/en/stage-1/appendix-industry-scenarios/index.md b/docs/en/stage-1/appendix-industry-scenarios/index.md index 6a464f2..024e664 100644 --- a/docs/en/stage-1/appendix-industry-scenarios/index.md +++ b/docs/en/stage-1/appendix-industry-scenarios/index.md @@ -316,7 +316,7 @@ This document summarizes **LLM large model applications in B-End enterprise scen
Find the application scenario that suits you
- Select your interest direction and purpose, the system will recommend related industry directions. Click on tags to jump to corresponding chapters. + Select your interest direction and target purpose. The system recommends related industry scenarios. Click a row to jump to the corresponding chapter.
@@ -348,24 +348,43 @@ This document summarizes **LLM large model applications in B-End enterprise scen
-
- Recommended {{ currentSelection.interest }} × {{ currentSelection.purpose }} industries: +
+ {{ recommendationTopics.length }} recommended scenarios for you + + ({{ currentSelection.interest }} + {{ currentSelection.purpose }}) +
-
- - {{ topic.title }} - + + + + + + + + +
+ 💡 Click any row in the table to jump to the corresponding industry section
- - Reset Selection - +
+ +
+ 💡 Please select both an interest direction and a purpose + 💡 Please select an interest direction + 💡 Please select a purpose +
+ +
+ Reset Selection
@@ -411,6 +430,11 @@ Learners can choose directions based on these dimensions: | 3 | Technical Documentation Auto-Generation System | LLM auto-generates product specifications, operation manuals; supports multi-format export | | 4 | Production Equipment Inspection Report Auto-Generation | Voice input describes equipment status; structured inspection report auto-generated | | 5 | Industrial Equipment Fault Diagnosis Q&A | Builds vector knowledge base from historical fault cases; provides intelligent diagnosis suggestions | +| 6 | LLM Information-Retrieval Data Warehouse | Uses Text-to-SQL to convert natural-language queries into database queries; Superset visualizes results; Doris or ClickHouse as OLAP engine | +| 7 | Industrial Equipment Fault-Diagnosis Knowledge Q&A Assistant | Builds a vector knowledge base from historical fault cases; LLM provides diagnosis suggestions and solution plans based on fault descriptions | +| 8 | Production Quality Inspection Report Generation and Defect Classification | OCR identifies defects in inspection photos; LLM generates structured quality reports and classifies defect type and severity | +| 9 | Inventory Counting Assistant and Inventory Report Generation | Inputs stocktaking data; LLM compares with system inventory and generates discrepancy reports with abnormal-inventory alerts | +| 10 | Process Optimization Suggestion Intelligent Q&A System | Builds a RAG knowledge base from process documents; LLM provides optimization suggestions based on production issues | --- @@ -425,6 +449,11 @@ Learners can choose directions based on these dimensions: | 3 | Enterprise Internal Knowledge Intelligent Q&A | Builds vector knowledge base from internal documents; provides precise Q&A service for employees | | 4 | Customer Service Conversation Smart Summary | Automatically generates conversation summaries; extracts key information and creates follow-up tickets | | 5 | Golden Script Recommendation Knowledge Base | Analyzes excellent service 
cases; extracts golden scripts for team sharing and training | +| 6 | Customer Service Script Compliance Auto-Check Assistant | Customer-service staff input reply drafts; LLM checks script compliance and sensitive words in real time and provides revision suggestions | +| 7 | Customer Service Ticket Auto-Summary and Classification Tool | LLM summarizes long conversations and auto-classifies tags; Elasticsearch supports full-text ticket search | +| 8 | Customer Emotion Monitoring and Abnormality Alert Tool | Real-time analysis of voice tone and text sentiment; LLM identifies abnormal emotions and triggers alerts with WebSocket push | +| 9 | Golden Script Recommendation Knowledge-Base System for Customer Service | LLM analyzes excellent customer-service conversations, refines high-performing templates, and recommends scripts based on context | +| 10 | Intelligent Outbound-Call Conversation Analysis and QA Assistant | After outbound-call recording transcription, LLM extracts key information; automatically generates QA reports and improvement suggestions | --- @@ -439,6 +468,11 @@ Learners can choose directions based on these dimensions: | 3 | Homework Auto-Grading & Learning Diagnosis | OCR recognizes handwritten answers; AI provides grading and improvement suggestions | | 4 | Job Competency Model & Learning Map | Analyzes job requirements; generates competency models and corresponding learning paths | | 5 | Foreign Language Oral Practice with AI | LLM plays role-play partners; simulates various real-life scenarios for speaking practice | +| 6 | School-Based Curriculum Construction and Courseware Production Tool | LLM analyzes school characteristics and student needs to generate curriculum frameworks; integrates PPT generation APIs for automatic courseware creation | +| 7 | College-Application Recommendation and Career Planning Platform | LLM analyzes candidate scores, ranking, interests, and other factors, then combines admissions data to recommend schools and majors 
| +| 8 | Youth Programming Code Assistant | LLM explains code logic and provides coding guidance; supports switching between block languages and Python | +| 9 | Knowledge-Point Mind Map Auto-Generation and Learning-Path Recommendation Tool | Input course topics; LLM automatically generates knowledge maps and recommends next-step learning content based on progress | +| 10 | Chinese/English Essay Auto-Scoring and Correction Engine | LLM scores from dimensions such as idea, structure, language, and diversity, and generates annotations with high-quality sample comparison | --- @@ -453,6 +487,11 @@ Learners can choose directions based on these dimensions: | 3 | Unit Test Auto-Generation | Analyzes source code structure; generates boundary condition test cases automatically | | 4 | Code Quality Analysis Tool | Analyzes code complexity, security vulnerabilities; provides optimization recommendations | | 5 | UI Code Auto-Generation from Design | Uploads design draft images; AI generates responsive HTML/CSS code | +| 6 | Natural Language to SQL Auto-Generation Tool | LLM converts natural-language data requests to SQL and supports complex multi-table joins and aggregation queries | +| 7 | API Automated Testing and Documentation Generation Platform | LLM analyzes code comments and API definitions, auto-generates test cases and API docs, and integrates Postman for test execution | +| 8 | System Log Analysis and Fault Localization | ELK Stack collects log data; LLM extracts key anomaly information and locates root causes, then recommends fixes | +| 9 | Frontend UI Code Auto-Generation Tool | OCR recognizes layout structures from design images; LLM generates responsive CSS and component code with TailwindCSS integration | +| 10 | Intelligent Database Schema Design and Modeling Assistant | Input business requirement docs to LLM to auto-generate ER diagrams and schema definitions; supports exporting MySQL/PostgreSQL DDL scripts | --- @@ -467,6 +506,11 @@ Learners can choose 
directions based on these dimensions: | 3 | Clinical Research Data Analysis Platform | Integrates EMR data; assists in generating statistical analysis code for research | | 4 | Medical Imaging Report Auto-Generation | Describes imaging features; generates structured medical imaging reports | | 5 | Chronic Disease Medication Reminder | Generates personalized medication plans; supports drug interaction and contraindication checks | +| 6 | Drug Package-Insert Intelligent Q&A Assistant | Upload package-insert images or input drug names; LLM answers dosage, side effects, and precautions | +| 7 | Disease Knowledge Popular-Science Article Generator | Input disease name and audience type; LLM generates easy-to-understand educational content and supports multiple versions | +| 8 | Medical Imaging Report Auto-Generation Tool | Radiologists describe imaging features; LLM auto-generates structured report content and supports common exam templates | +| 9 | Surgical Record Intelligent Generation and Archiving Assistant | Voice input records key surgical steps; LLM generates structured surgical records and auto-links surgery codes | +| 10 | Chronic Disease Medication Reminder Intelligent Assistant | Patients input medication lists; LLM generates personalized reminders and supports contraindication checking and interactive Q&A | --- @@ -481,6 +525,11 @@ Learners can choose directions based on these dimensions: | 3 | Security Operations Daily Report | Aggregates security logs; automatically extracts and generates daily reports | | 4 | Penetration Test Report Generation | Inputs vulnerability descriptions; AI generates complete penetration test reports | | 5 | Threat Intelligence Analysis Assistant | Connects to threat intelligence sources; interprets and analyzes potential threats | +| 6 | Malicious Code Protection and Privacy Compliance Monitoring | Sandboxes suspicious-file behavior; LLM identifies malicious features and generates signatures; scans sensitive data exposure | +| 7 
| Security Configuration Compliance Checklist Generation Tool | Input target system type; LLM generates configuration checklists supporting standards such as MLPS 2.0 and CIS | +| 8 | Threat Intelligence Intelligent Query and Analysis Assistant | Connects multi-source threat intelligence (open-source/commercial); LLM interprets intelligence and links it with enterprise assets | +| 9 | Security Incident Postmortem Report Generation Assistant | After incidents, LLM auto-generates timeline-based postmortem reports with root-cause analysis and remediation suggestions | +| 10 | Global Threat Intelligence Monitoring and Alert Center | Crawlers collect global security news and vulnerability disclosures; LLM extracts key information, assesses impact, and sends alerts | --- @@ -495,6 +544,11 @@ Learners can choose directions based on these dimensions: | 3 | IPO Prospectus Generation & Compliance Check | Uses modular templates; auto-fills business descriptions with compliance verification | | 4 | Financial Report & Anomaly Warning | Auto-generates financial analysis reports; monitors business anomalies in real-time | | 5 | Insurance Agent Practice Coach | Simulates customer scenarios; evaluates script compliance and persuasion skills | +| 6 | Compliance Case Intelligent Retrieval and Q&A Assistant | Builds knowledge bases from regulatory penalty cases; LLM answers compliance questions and provides relevant case references | +| 7 | Insurance Agent Intelligent Script Practice | LLM plays different customer personas for simulation and evaluates script compliance and persuasion with transcription analysis | +| 8 | Insurance Product Clause Analysis and Competitor Comparison Platform | Parses clauses structurally; LLM generates feature summaries and key cautions | +| 9 | Customer Script Emotion Recognition Service | Combines voice-emotion recognition with script-compliance checks and gives real-time coaching suggestions | +| 10 | Insurance Claim Progress Intelligent Query and 
Dialogue Assistant | Users input policy or case numbers; LLM queries claim status and answers claim-related questions | --- @@ -509,6 +563,11 @@ Learners can choose directions based on these dimensions: | 3 | Marketing Content Auto-Generation | Generates marketing copy, social media posts, and advertising scripts | | 4 | Competitor Ad Analysis Platform | Collects and analyzes competitor advertising strategies | | 5 | Hot Topic Analysis & Content Recommendation | Analyzes trending topics; recommends content creation angles | +| 6 | Resume Intelligent Parsing and Job Matching System | Parses resume PDFs to extract key information; LLM matches suitable roles and generates interview suggestions; integrates with ATS systems | +| 7 | Employee Onboarding Guidance and Q&A Assistant | Uses RAG retrieval over onboarding docs; LLM answers common new-hire questions | +| 8 | Employee Performance Feedback and OKR Management Platform | Collects OKR data; LLM analyzes goal completion and generates feedback suggestions with 360-feedback integration | +| 9 | Intelligent Meeting Minutes and To-Do Management | Transcribes meeting recordings; LLM extracts key points and action items; auto-creates tasks in task systems | +| 10 | Invoice Recognition and Expense Reimbursement Auto-Processing | OCR recognizes invoice fields and automatically checks authenticity and reimbursement compliance; integrates with finance systems | --- @@ -523,6 +582,11 @@ Learners can choose directions based on these dimensions: | 3 | Digital Human Live Streaming System | Creates digital human anchors; generates real-time dialogue for live streaming | | 4 | Short Video Script & Editing | Generates short video scripts; provides intelligent editing suggestions | | 5 | Marketing Content Design System | Generates advertising copy and designs marketing materials | +| 6 | Intelligent Marketing Content Generation and Design System | Input product information; LLM generates marketing copy and selling-point extraction; 
integrates with template-design tools | +| 7 | Multi-Platform Ad ROI Real-Time Monitoring and Strategy Optimization System | Connect ad-platform APIs for data collection; LLM analyzes performance and generates optimization suggestions with anomaly alerts | +| 8 | Search-Engine Keyword and Traffic Analysis | Collect keyword-tool data; LLM analyzes trend and competition and recommends topic direction | +| 9 | Competitor Ad Placement Analysis Platform | Uses third-party data APIs to collect competitor ads; LLM analyzes placement strategy and creative patterns | +| 10 | Full-Network Hot Topic Analysis and Content Recommendation System | Collects trending data; LLM analyzes trend shifts and recommends content angles with calendar scheduling | --- @@ -537,6 +601,11 @@ Learners can choose directions based on these dimensions: | 3 | Enterprise Policy Matching Platform | Analyzes enterprise profiles; intelligently matches applicable support policies | | 4 | Approval Materials Pre-Review | OCR recognizes application materials; automatically checks completeness | | 5 | City Grid Event Management | Identifies event types from reports; intelligently dispatches to responsible departments | +| 6 | Social Sentiment Big-Data Analysis and Risk Early Warning System | Fuses multiple sources such as hotlines, online sentiment, and field visits; LLM identifies risk hotspots | +| 7 | Government Archive Digitization Recognition and Intelligent Filing Platform | OCR recognizes archive text; LLM extracts key information and auto-classifies; supports full-text retrieval | +| 8 | Emergency Command and Rescue Resource Intelligent Dispatch Platform | Collects emergency-event data; LLM generates emergency response plans with resource-dispatch optimization | +| 9 | Grid-Based Atmospheric Pollution Monitoring and Precision Traceability System | Collects air-quality sensor data; CV identifies pollution sources; LLM analyzes trends and traces causes | +| 10 | Public-Safety Incident Intelligent Risk 
Warning Assistant | Integrates historical events and real-time reports; LLM estimates risk levels and outputs warning recommendations | --- @@ -551,6 +620,11 @@ Learners can choose directions based on these dimensions: | 3 | Legal Regulation Change Monitoring | Monitors regulatory updates; analyzes impact on business operations | | 4 | Legal Letter Auto-Drafting | Inputs case facts; AI generates standard legal letters | | 5 | Legal Terms Plain Language Explanation | Translates complex legal terms into easy-to-understand language | +| 6 | Courtroom Recording Real-Time Transcription and Dispute-Focus Extraction Recorder | ASR transcribes hearing audio; LLM extracts dispute focuses and key arguments with timestamps | +| 7 | Full-Network IP Infringement Clue Monitoring and Blockchain Evidence Preservation System | Monitors e-commerce and social media infringement; automatically collects and preserves evidence | +| 8 | LLM-Based IPO Prospectus Key-Data Consistency Check and Risk Alert Agent | Compares data across prospectus sections; LLM identifies inconsistencies and abnormal values with risk tags | +| 9 | Complex Legal Clause "Translation" Plugin in Plain Language | Users select legal clauses and LLM outputs understandable explanations | +| 10 | Case Evidence-Chain Intelligent Structuring and Visualization System | Upload evidence materials; LLM analyzes evidence relationships and timelines | --- @@ -565,6 +639,11 @@ Learners can choose directions based on these dimensions: | 3 | Visa Materials Pre-Review | OCR recognizes visa materials; automatically checks for completeness | | 4 | Real-Time Translation for Travel | Offline voice translation; recognizes and translates menu images abroad | | 5 | Travel Notes Auto-Generation | Extracts information from travel photos; generates shareable travel journals | +| 6 | Data-Driven Hotel "Pitfall Avoidance" Analyzer Based on Real Reviews | Collects hotel review data; LLM extracts positive and negative keyword patterns | +| 7 | 
Immersive Destination VR Preview and Virtual Room Selection Platform | Collects 360-degree panoramas; VR enables immersive previews and virtual room tours | +| 8 | Travel Footprint Auto-Generated Travel Notes and Social Copy Assistant | Extracts time/location metadata from photos; LLM generates travel notes with template-based layout | +| 9 | Enterprise Travel Invoice Aggregation and Compliance Reimbursement Management Platform | Connects travel-platform APIs for automatic invoice collection and compliance checks | +| 10 | Scenic-Area Crowd Congestion Prediction and Off-Peak Route Navigation | Collects scenic-area crowd data; ML predicts congestion windows and recommends off-peak routes | --- @@ -579,6 +658,11 @@ Learners can choose directions based on these dimensions: | 3 | Cognitive Training for Elderly | Provides cognitive games; uses old photos to trigger memory for dementia patients | | 4 | Social Anxiety Practice Coach | Creates virtual social scenarios; helps practice social interactions | | 5 | Mood Monitoring & Incentive Assistant | Analyzes mood patterns; generates positive encouragement content | +| 6 | Generative AI Customized Bedtime Story Machine for Children | Parents input themes/preferences; LLM generates customized stories with background music support | +| 7 | Deceased Digital-Life Reconstruction and LLM Cross-Time Dialogue System | Trains personalized models from pre-death voice/text data and generates memory-based conversations | +| 8 | MBTI-Based AI Personality Mirror and Empathetic Chatbot | Inputs MBTI results; LLM outputs personality analysis and empathetic responses with match suggestions | +| 9 | Privacy-Protected AI Confession Tree-Hole for Teenagers | Anonymous channel for emotional expression; LLM provides listening/suggestions with sensitive-word alerts | +| 10 | Self-Evolving AI Virtual Pet Growth System | Trains pet personality models and supports interaction-driven growth and virtual customization | --- @@ -593,6 +677,11 @@ 
Learners can choose directions based on these dimensions: | 3 | Interactive Novel Story Generator | Reader choices affect story development | | 4 | Esports Game Analysis & Commentary | Real-time game analysis with AI-powered commentary | | 5 | Audiobook Auto-Generation | Converts text to audio with character-specific voices | +| 6 | Personalized Humor Content Recommendation Algorithm Engine | Builds user-interest profiles and recommends matching humor content | +| 7 | AI Smart Vocal Tuning and KTV Voice Enhancement Software | Performs denoising and vocal enhancement with AI tuning algorithms | +| 8 | Film/TV Character-Centric Plot Extraction and Editing Tool | Analyzes video content, extracts character-related clips, and auto-generates edited cuts | +| 9 | Multi-Role TTS Audiobook Auto-Generation System | Assigns text roles and generates personalized voices with background music/effects | +| 10 | Board-Game Reinforcement-Learning Review Coach | Analyzes game records, simulates AI opponents, and generates review suggestions | --- @@ -607,6 +696,11 @@ Learners can choose directions based on these dimensions: | 3 | Multi-Language Translation | Localizes product descriptions for international markets | | 4 | Digital Human Live Streaming | AI-powered virtual streamers for 24/7 live commerce | | 5 | Trend Analysis & Product Selection | Analyzes market trends; suggests trending products to sell | +| 6 | Full-Network Same-Product AI Price Comparison and Trend Prediction Plugin | Crawls e-commerce prices, displays comparison charts, and predicts price trends | +| 7 | Buyer-Show Image AI Selection and Short-Video Synthesis Platform | Scores buyer-show images, auto-recommends high-quality content, and synthesizes short videos from templates | +| 8 | LLM-Based Real-Time Sales Dialogue Voice Analysis and Golden-Script Recommendation | ASR transcribes calls and performs real-time script compliance checks with recommendation output | +| 9 | Market Trend AI Insight and Best-Seller 
Prediction Engine | Collects and analyzes social media and e-commerce data; LLM identifies trend hotspots and recommends product choices | +| 10 | Private-Domain User Profiling AI Clustering and Precision Operations System | Clusters user behavior data, generates profile tags, and triggers automated marketing flows | --- @@ -621,6 +715,11 @@ Learners can choose directions based on these dimensions: | 3 | Electricity Price Prediction | ML predicts spot prices; generates trading strategies | | 4 | Carbon Emission Calculation | Auto-calculates enterprise carbon footprint; generates ESG reports | | 5 | Grid Load Prediction | Predicts grid load under extreme weather; generates dispatch plans | +| 6 | Gas-Station Violation AI Video Recognition and Alert Guard | Analyzes surveillance video and detects violations (calling/smoking, etc.) with alert pushes | +| 7 | Long-Distance Oil/Gas Pipeline Leak Acoustic AI Monitoring and Precision Positioning System | Collects acoustic-sensor data for leak detection and localization algorithms | +| 8 | Virtual Power Plant Resource Aggregation and AI Power-Trading Decision System | Connects distributed resources for aggregated optimization dispatch and strategy execution | +| 9 | Mine Personnel AI Position Tracking and Dangerous-Area Intrusion Alarm | Uses UWB/Bluetooth positioning for trajectory tracking and geofenced danger-zone alerts | +| 10 | Energy-Storage Battery Health AI Assessment and Thermal-Runaway Warning | Monitors battery runtime data, evaluates health status, and triggers thermal-risk alerts | --- @@ -635,6 +734,11 @@ Learners can choose directions based on these dimensions: | 3 | Video Restoration & Colorization | 4K super-resolution; AI adds color to black and white footage | | 4 | Text-to-Speech with Emotion | Generates natural-sounding speech with emotional expression | | 5 | Meeting Transcription | Multi-speaker voice separation; generates meeting transcripts with action items | +| 6 | Video Object Removal AI Engine 
| Uses object tracking and inpainting to remove unwanted objects with frame-level consistency | +| 7 | Copyright-Safe Background Music AIGC Auto-Composer | Uses music-generation models with controllable emotional style and copyright checks | +| 8 | Specific-Person Voice Clone and Voice Conversion Software | Trains timbre models from small voice samples and supports voice conversion | +| 9 | One-Click Script-to-Storyboard and AI Dynamic Preview Video Platform | Parses scripts into storyboards and auto-generates previsualization videos | +| 10 | Meeting Recording AI Smart Transcription and Core To-Do Extraction Assistant | Performs multi-speaker transcription and LLM-based to-do extraction with timestamps | --- @@ -649,6 +753,11 @@ Learners can choose directions based on these dimensions: | 3 | Logo & Brand Design | Generates brand logos; creates complete VI systems | | 4 | Trend Analysis & Content Ideas | Tracks trending topics; suggests marketing angles | | 5 | Video Script Generator | Generates short video scripts with shooting suggestions | +| 6 | Competitor Marketing Strategy Deep Analysis and AI Weekly Report Generator | Collects/analyzes competitor content, extracts strategy insights, and auto-generates weekly reports | +| 7 | Search-Engine Keyword AI Layout and Traffic Article Batch Writing | Analyzes keywords, generates articles at scale, and gives SEO optimization recommendations | +| 8 | Personalized Marketing Email AI Writing Expert | Uses user-profile data for personalized content generation with A/B testing | +| 9 | Brand Reputation Full-Network Monitoring and Crisis AI Alert Radar | Collects network sentiment data, runs sentiment analysis, and pushes crisis alerts | +| 10 | Short-Video Script Creative AIGC Generation and Storyboard Guidance Assistant | Inputs themes and outputs scripts, storyboards, and practical shooting guidance | --- @@ -663,3 +772,8 @@ Learners can choose directions based on these dimensions: | 3 | Data Quality Monitoring | Detects 
data anomalies; suggests fixes | | 4 | Report Generator | Creates reports and dashboards through conversation | | 5 | Metric Q&A Assistant | Answers questions about data metric definitions and calculations | +| 6 | Intelligent Data-Report Interpretation and Trend Analysis Assistant | Upload report images or input data; VLM interprets chart content and analyzes trends | +| 7 | Intelligent DB-Schema Interpretation and Query-Example Generation Assistant | Input table names or field descriptions; LLM generates schema explanations and sample SQL | +| 8 | Enterprise Master-Data Intelligent Alignment and AI Dedup Governance | Matches master data across sources, identifies duplicates, and supports merge-rule configuration | +| 9 | Data Requirement Doc to Test-Case Intelligent Conversion Tool | Input data requirement descriptions; LLM generates test scenarios and validation test cases | +| 10 | Data Metric-Definition Intelligent Q&A Assistant | Builds a knowledge base from metric-definition docs; LLM answers definition and calculation logic questions | diff --git a/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md b/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md new file mode 100644 index 0000000..acffff5 --- /dev/null +++ b/docs/en/stage-2/ai-capabilities/2.1-dify-knowledge-base/index.md @@ -0,0 +1,1069 @@ +# Dify Basics and Knowledge Base Integration + +# Review of the Previous Lesson + +In the previous lessons, we learned in groups the basics of AI coding, prompt engineering, and AI image generation. These topics helped us build an initial understanding of the boundaries and capabilities of different large language models (LLMs) and generative models. + +To help you review the previous lesson, think through these quick questions: + +1. What is AI programming? How can you use an AI coding tool (for example, [z.ai](http://z.ai)) to create a webpage? +2. What is a large language model? What are prompt engineering and context engineering? 
How should you write a complex prompt? +3. Across text, AI coding, and image generation, where do you think model strengths and weaknesses show up most clearly? +4. What is an API? How do you use [z.ai](http://z.ai) to connect to third-party APIs? + +If any question still feels unclear, you can revisit the previous lesson docs or ask directly in the WeChat group. + +In this lesson, we move from simple AI text/image tools to workflow-building platforms closer to real business deployment. We go from chatbots to AI agents and AI workflows, and then use APIs to turn them into interactive "intelligent" chatbot pages. + +During hands-on operation, if any step is hard to understand, do not worry. A recommended approach is to take a screenshot of the page you are on and ask a model directly. Current models can already resolve most common issues. + +If you still cannot solve it after asking, keep trying. Do not be afraid of mistakes. Every attempt is part of learning and progress. With more practice, you will become increasingly fluent and confident. + +# What You Will Learn in This Lesson + +1. Why we need to move from chatbots to agents and workflow orchestration. +2. What an agent/workflow development platform is, and how to turn AI capability into SOP-style, orchestratable processes. +3. What Dify is, and how to quickly build applications on this open-source LLM platform, especially a knowledge-base QA chatbot. +4. How RAG works and why retrieval-augmented generation is needed. +5. How to learn Dify and AI IDE Trae (`Extra Knowledge 4 - What is AI IDE and Trae`) from 0 to 1, including building agents, workflows, and a frontend chatbot webpage using Dify API. + +- Basic Dify principles, agent/workflow building methods, and API invocation. +- AI IDE usage and AI-assisted coding workflow. +- A frontend agent program that can chat. + +# 1. 
From Conversation to Agent + +In the previous stage, we learned how to use prompts to make models play roles, generate text, or write simple code. But if you think carefully, there is a key issue: a chatbot itself cannot actually do work. + +It can answer "how to check an order," but it cannot truly query your database for the order number. It can describe what a weekly report should include, but it cannot automatically collect project data and send the email. This "can say but cannot do" limitation makes pure conversational AI hard to truly embed into business processes. + +To upgrade AI from chat companion to digital employee, we need to give it three core capabilities: + +1. Proprietary knowledge: let it read and understand your product docs, customer materials, and internal policies. +2. Tool calling (or plugins): let it operate databases and call APIs. +3. Structured execution: let it complete tasks step by step with predefined logic, not free improvisation. + +This is the prototype of an AI Agent: an automation unit with goals, knowledge, tools, and an execution path. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image1.png) + +> Note: In current industry usage, "simple agents" usually mean enhanced applications built from LLM + tools + knowledge base, not fully autonomous planning agents. Even though these simple agents do not have true long-horizon reasoning and planning, they are already enough for many enterprise automation scenarios. We will introduce truly autonomous agents in later chapters. + +## 1.1 The Simplest Agent: Knowledge-Base QA Chatbot + +After clarifying the core capabilities of an agent, a natural question follows: can we build a practical basic agent by implementing only one of these capabilities? The answer is yes. + +In many real business scenarios, users do not need AI to execute complex operations (such as API orchestration across multiple systems). 
Their core need is accurate, reliable QA grounded in company-specific materials. This maps exactly to the first core capability: proprietary knowledge service. + +That leads to the simplest and most widely used agent form: a knowledge-base QA chatbot. + +Although it does not yet include tool calling or autonomous planning, the key breakthrough is this: model answers are no longer generated "from thin air." They become evidence-grounded. How is that achieved? We need to solve one core challenge: when there are thousands of pages of internal docs, how can the model quickly find the most relevant parts for each user question? + +One solution is Retrieval-Augmented Generation (RAG). + +The core RAG idea is: when a user asks a question, the system first retrieves the most semantically relevant text chunks from enterprise knowledge (for example, one paragraph from a product manual, one policy clause from HR docs), then injects these chunks into model context so the answer is generated based on real source material. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image2.png) + +Image source: [https://www.datacamp.com/blog/what-is-retrieval-augmented-generation-rag](https://www.datacamp.com/blog/what-is-retrieval-augmented-generation-rag) + +This means responses no longer rely only on generalized training knowledge. They are anchored to enterprise-authoritative information. The goal of RAG is exactly this dynamic external-knowledge injection, which significantly improves answer truthfulness, accuracy, and consistency. It can even enforce response persona/style, such as customer-support tone or technical-document style. + +In real business, this is especially important because models can hallucinate. For example, if you ask for concrete metrics as a CFO or consultant, a model may fabricate dates and events. With RAG, controllability and reliability improve significantly. 
+ +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image3.png) + +Image source: [https://www.databricks.com/glossary/retrieval-augmented-generation-rag](https://www.databricks.com/glossary/retrieval-augmented-generation-rag) + +In this lesson's hands-on section, we will use Dify, a popular AI workflow platform, to build a knowledge-base QA chatbot. You can easily turn many kinds of proprietary materials into a knowledge base, such as product manuals, company policy docs, project docs, research papers, knowledge-base articles, and even personal notes. + +After setup, you can test with questions such as: + +- "What are the major upgrades in the latest version of Product A?" +- "According to the employee handbook, how is annual leave policy defined this year?" +- "In project XX, how did we solve technical challenge 'XXX'?" +- "What is the core research method described in this paper?" + +You will directly feel how RAG transforms static, scattered documents into a precise intelligent knowledge base that supports high-accuracy QA across scenarios. + +## 1.2 From Conversational Agent to Workflow + +However, even "enhanced agents" with knowledge base and tool calling are still insufficient for more complex business processes. + +Imagine this request: +"What new features were released in our newly launched SaaS product recently? Can you organize them into a client-facing brief?" + +This looks simple, but behind the scenes it requires coordinated steps: first retrieve the last month's release notes from internal docs or Notion knowledge base; then filter customer-facing key features; then call an LLM to rewrite technical descriptions into customer-friendly language; and finally send the generated content to the marketing team's email or save it into a Google Docs template. + +If we rely only on a single LLM to reason freely, it is hard to execute the entire process in one dialogue. 
Even if it does, it can miss key details, confuse internal terms with customer language, or fail to output in structured form. More importantly, enterprises need an auditable, reusable, monitorable standardized execution path, not one-off improvisation in each run. Monitoring and reproducibility are crucial for enterprise risk control. + +This leads to a higher-level AI application pattern: AI Workflow. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image4.png) + +Workflow means decomposing a complex task into ordered, configurable, automatically executable sub-steps, then orchestrating logic between steps (conditionals, loops, parallelism) visually or via code. Turning AI capability into SOP means solidifying "how AI completes this task" into reusable templates. + +This brings multiple benefits: non-technical roles (such as product managers or operators) can build AI apps quickly via drag-and-drop; developers can encapsulate RAG retrieval, LLM calls, API tools as standard nodes for reuse across business scenarios; and the full process can be tracked, debugged, and optimized continuously to satisfy enterprise requirements for stability and compliance. + +AI workflow users are broad. Product managers can design full interaction flows without writing code; operations can quickly build customer-service bots, content generators, or notification systems; developers and ML engineers can modularize capabilities for frontend integration; founders and indie developers can validate AI MVPs at low cost and launch prototypes with query + generation + actions in days. + +Also note that AI workflows are usually described by an intermediate representation. Platform specifics differ, but most use structured files (JSON, YAML, etc.) 
to define node types, inputs/outputs, and execution logic, as shown below: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image5.png) + +In short, if agents let AI move from "can chat" to "can do," workflows let AI move from "occasionally complete one task" to "stably, reliably, and at scale complete a class of tasks." In the following practice, we will build a full AI workflow on Dify and experience the full path from idea to runnable app. + +## 1.3 Common Agent / Workflow Platforms + +As generative AI develops rapidly, many low-code and no-code agent/workflow platforms have emerged to help developers and business users build intelligent processes quickly without falling into low-level coding complexity. + +First, clarify what low-code means: development tools that significantly reduce manual coding through drag-and-drop visual components, preset logic templates, and graphical rule configuration. Core idea: replace direct coding with visual node orchestration. This frees technical users from repetitive work and allows non-technical users familiar with business logic to participate in app building. It is essentially a bridge between efficiency and flexibility. + +The key value of low-code/no-code AI platforms is reducing development threshold. Work that used to take weeks of cross-functional collaboration (requirements, coding, testing, deployment) can now go from idea to launch in hours for common agent scenarios such as customer QA bots and data-processing assistants. 
+ +Mainstream low-code AI workflow platforms include: + +| Platform | Features | Typical Scenarios | +| --------------------------------------------- | -------------------------------------------------- | -------------------------------------- | +| Dify | Open source; supports knowledge-base RAG, LLM orchestration, API output; Chinese-friendly | Enterprise knowledge QA, custom agents, API services | +| Coze (ByteDance) | Available in China, integrated with Doubao/Feishu ecosystem, rich plugins | Social bots, domestic mini-program integration | +| n8n | General automation platform with AI nodes, strong in API orchestration | Cross-system sync, AI + traditional SaaS automation | +| Baidu Qianfan AppBuilder / Alibaba Bailian / Tencent HunYuan | Cloud-native vendor stacks with in-house models | Enterprise deployment, strict compliance scenarios | + +There are many choices in the market. Although AWS, Azure, Alibaba Cloud, and others all provide workflow solutions, Dify, Coze, and n8n are currently among the most widely used due to three major advantages: + +1. Extreme usability: visual drag-and-drop UIs make onboarding easy without deep low-level understanding. +2. High flexibility: custom components and extensible APIs support both lightweight demo/MVP and agile iteration for SMB teams. +3. Mature ecosystem: detailed docs, responsive support, and active communities with reusable templates. + +All three support exposing built agents as standardized APIs, enabling seamless integration with frontend web apps, enterprise ERP systems, and mobile apps, which further lowers deployment threshold. + +### 1.3.1 Dify: Enterprise LLMOps and Application Lifecycle Platform + +Dify is positioned as an LLM application development and operations platform, focused on full lifecycle management from idea to deployment to optimization. Its core is a low-code platform helping developers and non-technical innovators rapidly build production-grade AI applications. 
+ +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image6.png) + +Feature-wise, Dify includes visual workflow orchestration, agent building, knowledge-base management, and multi-model support. You can design complex processes by dragging nodes and create intent-based agents. Its knowledge-base capability can process many document formats and support efficient vector retrieval. Dify supports GPT, Claude, and many open-source models, and can publish apps as standard APIs with one click. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image7.png) + +Architecturally, Dify emphasizes open source and private deployment, with flexibility, extensibility, and enterprise compliance. Typical users include developer teams and business innovators. Typical use cases include enterprise knowledge QA/customer support, content automation, vertical AI assistants, and enterprise AI middle platforms. + +### 1.3.2 Coze (ByteDance): Popularizing Zero-Code AI Agent Building + +Coze is ByteDance's AI agent platform. Its core value is extreme usability, allowing users with no programming background to create, debug, and publish rich AI chatbots. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image8.png) + +Its core interaction is "building blocks." Users can configure bot roles and knowledge bases via UI, and use rich built-in plugin libraries for external capabilities such as news, travel, and image generation. Built bots can be published with one click to Doubao, Feishu, WeChat Official Account, and other channels. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image9.png) + +Its architecture is designed around low-threshold usage, integrating ByteDance models behind cloud services and abstracting complex flow details, with emphasis on multimodal understanding and real-time responses. Private deployment capability is relatively limited. 
Typical scenarios include personal assistant and entertainment bots, customer QA systems, online learning assistants, and rapid prototyping. + +### 1.3.3 n8n: Programmable Backend Workflow Automation Engine + +n8n is a general-purpose programmable workflow automation platform. Its core positioning is connecting applications, databases, and APIs to automate data movement and task execution. + +It supports hundreds of SaaS services, databases, and protocols through a large integration-node ecosystem, and combines visual design with code: you can drag nodes on canvas while injecting JavaScript/Python for custom logic. n8n is strong in backend, data-intensive workflows such as sync, ETL, and API orchestration. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image10.png) + +Its key technical characteristic is visible source code and self-hosting, allowing full control of data and environment. This is especially attractive for industries with strict data-security requirements. Main users are developers, technical operators, and data analysts. n8n's biggest strength is its powerful community ecosystem: rich online tutorials and shared templates lower learning cost. It also connects to global ecosystems such as YouTube and Instagram, helping users break cross-platform data/service barriers. + +### 1.3.4 Other Workflow Platforms + +Besides these well-known platforms, major Chinese tech vendors also launched integrated AI platforms. For example, Baidu Qianfan AppBuilder supports end-to-end model selection, RAG building, and agent publishing, deeply integrated with Wenxin models; Alibaba Bailian (Tongyi-based) emphasizes enterprise security and private deployment; Tencent Cloud TI focuses on finance/healthcare vertical templates. These are often deeply integrated with their cloud ecosystems and fit enterprises already in those stacks. 
+ +However, in terms of generality, openness, and community ecosystem, Dify and Coze are still among the most widely adopted choices due to usability, broad model support, and active developer communities. + +Although platform positioning and ecosystems differ, the core logic is similar: visually orchestrate and connect capability modules. Once you master the design and operation of one platform, you can transfer quickly to others. In the following practice, we use Dify as the example. + +# 2. Understanding Dify Step by Step + +## 2.1 What is Dify + +We already covered basic Dify introduction earlier. For more details, visit [https://cloud.dify.ai/apps](https://cloud.dify.ai/apps), and for official information visit https://dify.ai. + +Dify is an open-source platform for developing LLM applications. It provides an intuitive interface that combines agent workflows, RAG pipelines, tool capabilities, model management, and observability, helping you move quickly from prototype to production. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image11.png) + +In Dify, you can combine large models and many tools to build a "workflow." A workflow is a business-logic chain that automates operations you would otherwise do manually step by step, such as data retrieval, LLM calls, web search, result filtering, and format organization. Without workflows, you repeatedly copy/paste similar prompts, which is inefficient, error-prone, and hard to reuse in real business. + +Building workflows is like assembling blocks/puzzle pieces. You connect LLM nodes (understanding/generation), tool nodes (specific actions such as querying DB, sending email, translating text), and data nodes (read/store info). They then collaborate automatically under your predefined logic without manual repetition. You can also think of it as "low-code programming": by drag-and-drop and input/output configuration, you can implement fairly complex business logic. 
+ +For example, if you run an Amazon or Douyin e-commerce store and want an AI customer service system, you can design a workflow like this: + +1. Trigger node (`START`): receives user query, for example "How long is the warranty period for this product?" +2. Question classifier node (`QUESTION CLASSIFIER`): uses an LLM (for example GPT) to classify the query into after-sales (warranty), usage guidance, or other types. +3. Knowledge retrieval node (`KNOWLEDGE RETRIEVAL`): automatically queries the corresponding knowledge base based on classification. If warranty-related, retrieve precise warranty SOP content. +4. LLM node: sends user query + retrieved context to model and generates user-friendly response. +5. Condition node: checks whether response includes clear warranty period terms (for example "1 year" or "3 years"). If yes, continue; if no, return "please provide product model." +6. Output node (`ANSWER`): returns final answer and logs this consultation into a table automatically. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image12.png) + +In this process, you do not manually browse docs, repeatedly tune outputs, or separately log data. The workflow chains it all automatically. It is also flexible: if later you add a new rule like "when user asks warranty coverage, query another KB," just add one conditional node instead of rebuilding the system. + +This is a relatively simple workflow example. Fully mastering all capabilities may still feel hard at this stage. So in this lesson, we start from a more basic knowledge-base agent and gradually move to advanced workflow techniques later. + +### 2.1.1 Deploy Your Own Dify (Optional) + +This part was originally scheduled for later lessons. Because some learners currently cannot access Dify official cloud due to network constraints, we provide this optional path earlier so you can continue smoothly. 
+ +You need to reference this tutorial for basic web deployment platform usage: +[How to Deploy a Web Application](/en/stage-2/backend/2.5-zeabur-deployment/) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image13.png) + +Learn how to deploy your own Dify on Zeabur. After deployment, register and log in via your deployment URL, then continue with the steps below. + +Note: different Dify versions may have small UI/operation differences, but overall logic is similar. If something looks different, do not panic; find equivalent entry points and continue. + +## 2.2 Create Your First Dify Chatbot App + +Visit Dify home page [https://cloud.dify.ai/apps](https://cloud.dify.ai/apps), register and log in, then choose Studio. You will see an interface similar to: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image14.png) + +Find `CREATE APP` on the left and click `Create from Blank`. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image15.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image16.png) + +In APP Type, choose Chatbot (if not visible at first, click "see more types" and find it in full list). Then fill app name and description and click create. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image17.png) + +After creation, you will see an interface like this: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image18.png) + +The middle "INSTRUCTIONS" area means built-in instructions (default/system prompt). + +Below that is the "Knowledge" area where we upload knowledge base later. + +The right panel is the debug window where you can test interactions in real time after editing prompts. + +You can type your own role prompt in INSTRUCTIONS, or click Generate to let the model draft one. 
+ +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image19.png) + +Note the top-right model choices: you can switch different models and compare differences in tone, reasoning, and long-context handling to pick what best fits your needs. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image20.png) + +## 2.3 Support Custom Model Providers + +To fully leverage Dify flexibility, and because model availability differs by region and business constraints (cost/privacy), we often need custom models. Dify supports three core model types: LLM, Embedding, and Rerank. This section walks through custom configuration. + +Dify can connect mainstream providers (OpenAI, Azure, Anthropic) and also supports any self-hosted or third-party model that follows OpenAI API compatibility. You can do this by installing the built-in OpenAI Compatible plugin and vendor-specific plugins. + +Detailed steps: + +1. Install `OpenAI-API-compatible` and `SiliconFlow` plugins to support most LLM and Embedding models. The first supports OpenAI-compatible APIs; the second is a service hub containing many common high-quality open-source models. + 1. https://marketplace.dify.ai/plugins/langgenius/openai_api_compatible + 2. https://marketplace.dify.ai/plugins/langgenius/siliconflow +2. If you self-hosted Dify, go to plugin marketplace in system settings and install there. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image21.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image22.png) + +After entering plugin marketplace, search plugin names directly. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image23.png) + +3. After installation, configure model providers. In settings -> model providers, you can see all currently supported providers: + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image24.png) +4. Before use, complete model config first. 
For OpenAI-API-compatible plugin, click "Add Model" and configure any model. In "Model Type," select whether it is LLM or Embedding, and ensure type is correct. + You need model name, endpoint URL, and API key to enable it. If this feels cumbersome initially, you can skip to SiliconFlow key setup or install OpenRouter plugin for easier provider support (ensure your provider account has remaining quota). + + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image25.png) + + For `SiliconFlow`, just click Setup and configure key to use Embedding/Rerank for testing. You can click "Get your API Key from SiliconFlow" to obtain credentials. + + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image26.png) + +5. After configuration, open model list to inspect supported models. Basic model setup is now complete. + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image27.png) + + It supports most common Embedding and Rerank models: + + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image28.png) + + If you want to modify Dify's default model set, click `System Model Settings` and update defaults. + + ![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image29.png) + +## 2.4 Create Your First Dify Knowledge Base + +At this point, we created a basic agent, but it still lacks a knowledge base. Click `Knowledge` in the top menu to enter knowledge-base creation. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image30.png) + +Then click `Create Knowledge` on the left to create your first knowledge base. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image31.png) + +On this page, you can upload many file types (PDF, TXT, etc.) to build knowledge. You can upload long text or copy Wikipedia content into TXT and upload. In this example we upload an Elon Musk Wikipedia TXT file. + +After clicking Next, you enter Knowledge Base Settings. 
There are many options, so let us walk through step by step. + +First in **General** settings, this is the "text chunking rules" area. Because long text must be split into smaller chunks, we define chunk strategy first. For entry level, only focus on **maximum chunk length**. Try 512, 2048, or 4096, and click **Preview Chunk** to compare effects. + +You can also adjust **Chunk overlap**. It controls whether adjacent chunks preserve overlapping content. Proper overlap helps avoid splitting critical information across chunks in a way that harms comprehension. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image32.png) + +There is also **Chunk using Q&A format in English**. When enabled, the system uses LLM to convert part of knowledge into Q&A format before storage, which can significantly improve retrieval in some scenarios. + +In real business, selecting chunk strategy according to scenario greatly affects retrieval quality and whether returned content matches expectations. + +Scroll down for Embedding model settings. + +Simple explanation: Embedding models convert unstructured data (text, images, etc.) into machine-understandable numeric vectors. This enables rapid similarity computation and semantic matching, such as retrieving documents/images/products closest in meaning to user input. + +Embedding choice significantly affects retrieval quality (accuracy, latency, etc.). Here we recommend starting with Qwen 0.6B Embedding. You can switch to 4B or 8B and compare parameter-scale impact. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image33.png) + +You will also see **Rerank model**, default **Jina-rerank-m0**. (If you are outside campus environment, you may see missing Rerank model errors. In that case configure rerank model in model provider settings first.) 
+ +Rerank's purpose is second-stage fine sorting over initial candidates, moving results most aligned with user intent to top positions, improving relevance and UX. + +Simple intuition: rerank solves "first-stage retrieval not refined enough." Search engines may retrieve 1000 potential pages by simple rules, then rerank top 10 for page one. Recommenders work similarly: from 500 possible items, rerank promotes most likely conversions. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image34.png) + +After settings are complete, click **Save & Process** to start vectorization. Embedding models transform chunked text into vectors at this stage. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image35.png) + +After processing finishes, click **Go to document** to inspect processed/stored KB content. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image36.png) + +Click KB name directly to view each chunk detail. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image37.png) + +You can precisely edit or delete unsuitable chunks here. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image38.png) + +In left sidebar, choose **Retrieval Testing** to test recall and verify retrieval quality. Each test returns several highest-similarity chunks. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image39.png) + +If you want more retrieved chunks, click `VECTOR SEARCH` settings: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image40.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image41.png) + +Top K means number of most similar text chunks returned from vector search. Current value 3 means top 3 chunks are returned. + +Score Threshold is a minimum score filter: only chunks with similarity score >= threshold (for example 0.5) are returned, filtering low-relevance content for higher precision. 
+ +Now KB setup is complete. Next, click "Studio" in the top menu, find the agent we created earlier, and connect this KB. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image42.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image43.png) + +In each chat round, you can now see cited knowledge sources in the response. Click entries to inspect retrieved text chunks. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image44.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image45.png) + +## 2.5 More Common Dify Operations + +After mastering basic chatbot + KB setup, we can go deeper into common Dify operations. + +### 2.5.1 Workflow Import and Export + +Remember the intermediate representation mentioned earlier? Dify supports importing/exporting workflows in DSL (Domain Specific Language) format. DSL is a YAML-based standardized representation preserving node structure, links, and config parameters. You can easily export/import DSL files to share workflows or study others' designs. + +In practice, you can find import entry on workflow workspace: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image46.png) + +For export, click the lower-right corner of a workflow block to find export action: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image47.png) + +Using DSL makes migration/sharing of complex workflows across Dify instances straightforward. + +### 2.5.2 Explore More Dify Projects + +If your own workflow feels too simple, Dify provides rich sample projects for learning more advanced application construction. These examples cover many business scenarios. Click Explore to view workflows built by others. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image48.png) + +## 2.6 Create Your First Dify Workflow App + +After starting with chatbot-style agents, we now build more complex business workflows. 
Workflow is Dify's core method for visualizing complex business logic. You can directly observe data flow between nodes, where decision logic is placed, where human intervention points are set, and how final business outcomes are produced. + +You can create from blank or from templates. Here we demonstrate creating from blank: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image49.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image50.png) + +Here you will see Chatflow and Workflow. How do you choose? Decide based on whether your core need is continuous conversation or task pipeline execution. + +Chatflow is designed for dialogue. It simulates a conversational entity with memory and context continuity, ideal for multi-turn interactions and stateful sessions. For customer support, it can handle follow-up questions coherently. Streaming output also feels more natural. If you need an agent that "converses," choose Chatflow. + +Workflow focuses on automated process execution. It acts like a predefined pipeline for one-off inputs, multi-step processing, and deterministic outputs. Examples include daily report generation, batch file processing, and chained API calls. These tasks are usually event-triggered and not real-time conversational. If your need is "automation," choose Workflow. + +To avoid mismatched architecture, evaluate with four questions: + +1. Does the process require repeated user input/adjustment? +2. Does output need stepwise/streaming presentation? +3. Does logic strongly depend on previous interaction history? +4. Is the task event-triggered and mostly one-shot input/output? + +If the answer to the first three is yes, Chatflow is ideal (customer support, tutoring, creative collaboration). If the fourth dominates, Workflow is a better fit (data cleaning, report generation, batch processing). 
+ +Here we choose Chatflow for demonstration and enter workspace: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image51.png) + +Quick interface tour: the center canvas is where you visually build app logic. A basic workflow usually starts at `START` (input), passes data through links into `LLM`, and outputs through `ANSWER`. Each node is a function module; links determine execution order. + +Around the canvas are management controls. Top area includes global actions like `Preview` (test) and `Publish` (release). Canvas corners include zoom/undo and other view controls. + +Left panel contains app-management areas. `Orchestrate` is for flow design. After building, use `API Access` for integration credentials. `Logs & Annotations` records execution traces for debugging. `Monitoring` provides runtime status/performance visibility. + +You can type simple prompt instructions in Chatflow LLM node SYSTEM, run Preview, and verify behavior changes as expected. + +### 2.6.1 Common Node Types + +Dify provides many node types. First understand each node's role. For practical usage, test directly, learn from templates, or ask a model with screenshots about parameters and usage. A good beginner tactic: replace nodes in existing templates and infer best practices from known working patterns. + +Right-click canvas and choose `Add Node`, or inspect all available nodes from side panel: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image52.png) + +You can also open tool selection panel to view callable tool categories: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image53.png) + +Below is a brief intro to common nodes/tools. You do not need to master all at once. Keep a basic mental map and learn progressively in practice. + +1. 
LLM and reasoning nodes + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image54.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image55.png) + +These nodes are core processing components: + +- LLM node: core compute unit that calls an LLM. Key focus is prompt engineering and parameter tuning to map business tasks into executable model instructions. +- Knowledge Retrieval node: retrieves relevant information from configured KBs or external authoritative sources to support LLM and reduce hallucination risk. +- Answer node: output unit that formats processed content into final business-ready result (response template, formatting spec, etc.). +- Agent node: advanced decision unit. Beyond model call, it can do multi-step planning and dynamic tool selection, suitable for complex task chains. +- Question Classifier node: classifies user input by intent/topic and routes to appropriate downstream paths (different prompts/toolchains per category). + +2. Logic and flow-control nodes + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image56.png) + +These nodes define execution path/rules: + +- Condition node (`IF/ELSE`): Boolean-based branching. Key is strict condition design that covers business cases comprehensively. +- Iteration node: stateless batch-parallel processing, best when sub-tasks have no interdependency (batch translation, parallel review, multi-report generation). It takes input array, slices elements, runs same chain in parallel. Use `{{item}}` for current element and `{{index}}` for index. Outputs aggregate back to array. Configure parallelism to balance speed/load; configure retry/failure handling for reliability. +- Loop node: stateful recursive iterator, best when each round depends on previous output (parameter tuning loops, iterative content polishing, chained dependent calculations). 
Core is state variable management: initialize before loop, update each round, and define strict stop conditions (max rounds, quality threshold, external stop signal) plus timeout and exception path to avoid infinite loops. + +3. Data operation and integration nodes + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image57.png) + +- Code node: executes custom logic for data transform, complex computation, etc. Focus on syntax correctness and runtime compatibility. +- Template node: fills dynamic data into templates (custom copy/report skeleton). Focus on template syntax and variable mapping. +- Variable Aggregator node: collects outputs from multiple nodes into a unified dataset. Focus on scope and merge rules. +- Doc Extractor node: extracts text/tables from PDF/Word and converts into structured processable data. +- Variable Assigner node: defines/initializes/updates workflow variables for data passing. +- Parameter Extractor node: extracts structured parameters from user/API inputs (regex/JSON path, etc.). +- HTTP Request node: sends external API requests (GET/POST, etc.) for system integration. +- List Operator node: filters/sorts/splits list data to match downstream structure. + +### 2.6.2 Common Tools + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image58.png) + +In Dify, most tools can be used directly as canvas nodes and connected like other nodes. As long as your input matches expected parameters, the tool runs and outputs results for downstream processing. + +From side panels, you can inspect available tool nodes and extend capabilities through plugin marketplace. A few common tool categories: + +- Web search tools + - Tavily Search is a common representative, providing AI-optimized real-time factual retrieval. + - It returns structured results (title/summary/link, etc.), suitable for injecting into LLM prompts for latest-info and evidence-required answers. 
+- Data processing tools + - For example JSON Process plugin supports querying/filtering/transform/merge on JSON data. + - Useful when handling complex API responses and nested data, reducing repeated manual parsing code in Code nodes. +- Format processing tools + - For example Markdown Exporter can export generated content into target formats (Markdown, custom templates, etc.) for display/reporting/system integration. + +You can view install counts and descriptions in tool list. At the beginning, prioritize "Featured/Recommended" tools because they cover common scenarios. + +Tool usage can still be complex. A practical shortcut is to search official workflow DSL examples for each tool and import directly, which is often much faster than building everything from scratch. + +### 2.6.3 Build a Simple Intent Classification Workflow + +Now that we understand Dify workflow/tool basics, we need hands-on practice. Without practice, details never become fluent. We need a realistic business scenario. + +For example, in real food-ordering chat scenarios, user input is never clean parameters. Some users place orders, some complain, some chat casually, some go off topic. If all these inputs are sent to one shared LLM path, two common issues appear: + +1. Unstable response style + Same complaint may get an apology in one run but an excuse in another. Same order may trigger missing-info follow-up in one run but hallucinated order details in another. +2. Uncontrollable business logic + You want "complaints must start with apology," but model may not always comply. You want "off-topic queries should be redirected," but model may continue chatting off-domain. + +A more engineering approach is standardized pipeline decomposition: +intent classification first (determine what user wants), then intent-based routing (different prompts/roles per scenario), then unified output packaging from routed branches (for frontend/system integration). 
+ +Goal: handle multiple dialogue types in a food-service scenario. Follow once to build familiarity. + +First define intents: + +- **buy_food**: user shows clear purchase/order intent. + - Example: "Give me one fried chicken and one cola." +- **complain**: user expresses dissatisfaction/anger/complaint. + - Example: "Why is it so slow? I've waited for an hour." +- **chitchat**: user asks open recommendations without explicit order command. + - Example: "What should I eat today? Any recommendations?" +- **other**: irrelevant to food-ordering scenario. + - Example: "Help me write a funny social post." + +For these four intents, predefine four communication personas via four dedicated LLM nodes: + +- **LLM_BuyFood**: professional and efficient. Confirm order details and proactively complete missing information. +- **LLM_Complain**: empathetic and calm. First soothe user and provide clear resolution steps. +- **LLM_Chitchat**: relaxed and friendly. Provide personalized recommendations and guide potential conversion. +- **LLM_Other**: polite and boundary-aware. Redirect off-topic conversations back to core business. + +#### Workflow Orchestration Design + +Now define node architecture. Beginners often do not know what nodes to use (and even advanced users often ask models for first-pass design because it is fast). Core structure: + +- Start: data entry node receiving raw input `user_text`. +- Question Classifier: "brain + dispatcher." It analyzes `user_text` and outputs one of four intent labels. +- Condition: "routing valve." It forwards flow based on classifier label to the corresponding handling branch. +- Four parallel LLM nodes (`LLM_BuyFood`, `LLM_Complain`, `LLM_Chitchat`, `LLM_Other`): each gets original question but responds differently based on its own SYSTEM prompt persona. +- Variable Aggregator: after branch processing, aggregate the one activated branch output into unified variable `final_reply` for stable output structure. 
+- Output: final structured output (for example JSON) including intent, original query, and reply, suitable for downstream integration/debugging. + +#### Workflow Orchestration Implementation + +In this tutorial we choose Workflow (not Chatflow). Select User Input: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image59.png) + +Then click Start -> User Input and define a string variable `user_text` as global flow input source. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image60.png) + +Save and click Test Run (top right). You will be prompted to provide test text. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image61.png) + +Next click `+` after input node and add Question Classifier. Configure four labels, each with clear description and examples: + +- `buy_food`: user clearly wants to buy/order food. +- `complain`: user is complaining/angry, usually with dissatisfaction. +- `chitchat`: user is chatting, discussing what to eat, asking recommendations. +- `other`: irrelevant to food scenario or hard to classify. + +Also set prompt in ADVANCED SETTING for classification behavior. Example prompt: + +```text +Choose the most appropriate label from buy_food / complain / chitchat / other. +If user both complains and orders, prioritize core emotion: if dissatisfaction is primary, classify as complain. +If complaint is minor and primary intent is ordering, classify as buy_food. +If truly hard to determine, use other as fallback. +``` + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image62.png) + +After setup, use top-right play icon on this node to test classification. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image63.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image64.png) + +From OUTPUT we can see classification is accurate. Test multiple input types to verify classifier stability. 
+ +Next connect classifier to downstream LLM branches. For example, when `label == "buy_food"`, route to `LLM_BuyFood`. +Create four LLM nodes and set different SYSTEM prompts: + +- LLM_BuyFood (ordering assistant): + + You are an ordering assistant. Requirements: + 1. Confirm what user wants to order. + 2. If info is incomplete, ask follow-up questions politely. + 3. Keep tone polite and concise. + +- LLM_Complain (support specialist): + + You are a food-service customer support specialist handling complaints. Requirements: + 1. Apologize sincerely. + 2. Briefly explain likely reasons (no blame shifting). + 3. Provide clear next-step resolution. + +- LLM_Chitchat (chat companion): + + You are a casual food recommendation assistant. Requirements: + 1. Use relaxed friendly tone. + 2. Give 1-3 simple recommendations. + 3. If no preference, provide options with different styles. + +- LLM_Other (polite gatekeeper): + + You are a food-ordering assistant focused only on food topics. For irrelevant user input: + 1. Politely explain scope. + 2. Guide user back to core scenario. + +Important: in each node, after setting SYSTEM prompt, enable USER prompt variable mapping. Click `{x}`, choose `user_text` as user input variable, and prepend `user input:` to indicate source semantics. During response generation, model uses both initial user input and system prompt. + +As always, click node-level play icon to test with sample input such as "I want bubble milk tea" and verify behavior. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image65.png) + +Next process parallel branch outputs. In `Variable Aggregator`, find `ASSIGN VARIABLES` and add branch outputs one by one. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image66.png) + +Now aggregate final output including user input, intent, and reply. 
Because this is Workflow (not Chatflow), there is no Answer node for this exact structure, so we can use Template node for equivalent output packaging. In variable area specify intent result, user input, and aggregator final reply. In CODE, write final JSON template: + +- `intent` <- `class_name` +- `original_text` <- `user_text` +- `final_reply` <- `variable_aggregator` + +```text +{ + "intent": "{{ intent }}", + "original_text": "{{ original_text }}", + "reply": "{{ final_reply }}" +} +``` + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image67.png) + +Finally add Output node and all setup is complete. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image68.png) + +#### Workflow Runtime Testing + +Done. Now run this workflow and observe distinct behavior for different inputs: + +- Input (ordering): "Give me one spicy chicken burger combo and a large cola." + - Path: `buy_food` -> `LLM_BuyFood` + - Output reply: "Sure, one spicy chicken burger combo and a large cola are noted. Would you like to swap fries in the combo?" +- Input (complaint): "Why are you so slow? I've waited more than an hour!" + - Path: `complain` -> `LLM_Complain` + - Output reply: "We are truly sorry for the long wait. This was our mistake and caused you a poor experience. We are urgently checking your delivery status and will arrange compensation. Thank you for your patience and feedback." +- Input (chitchat): "What is healthier to eat today?" + - Path: `chitchat` -> `LLM_Chitchat` + - Output reply: "If you prefer healthier options, you can try our light salad series or grilled chicken breast with vegetables. Do you prefer lighter taste or richer flavor? I can recommend more specifically." +- Input (irrelevant): "Help me think of a joke for tomorrow's meeting." + - Path: `other` -> `LLM_Other` + - Output reply: "That sounds fun, but I mainly specialize in food recommendations and ordering.
If you want to order something to reward your hard work, I can help anytime." + +> Hidden bug: if you encounter strange `aggregation group` issues, it is likely a built-in Dify bug. It can be triggered by certain operation sequences. If AGGREGATION GROUP was enabled and then disabled, residual group config may remain and cause errors (for example involving `any` params) even when switch appears off. Solution: delete this node and recreate it. + +After running in Test Run, you can inspect full execution path. It should follow correct branch and output expected final result. Full flow complete. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image69.png) + +## 2.7 Run Your First Template Workflow App + +After the simple classification workflow, next learn how to run workflows created by others. Usually you only need small modifications to turn them into your own. Here we use official DeepResearch workflow as example. It builds a deep-search framework using LLM + search engine and returns rich answers with citations and model-generated synthesis. + +After importing, first run directly. Then fix each error step by step based on failing node and cause. If stuck, screenshot and ask a model for debugging help. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image70.png) + +At first glance it may feel complex. That is okay. Click `Preview` on top right and run until first error appears: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image71.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image72.png) + +Troubleshoot the failing node. In this case Tavily API token was missing. Tavily Search is an AI-native search API providing real-time accurate factual results. 
Follow prompt to configure: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image73.png) + +After fixing it, search engine works normally: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image74.png) + +Then fix model-call issues as needed. You should be able to get results like this with model-understood synthesis: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image75.png) + +At the end, you can inspect referenced source links: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image76.png) + +If you want to understand each step deeply, best method is saving each node output into intermediate variables and printing all variables at final output. Another way: open `Process` view at top and inspect detailed per-step execution. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image77.png) + +## 2.8 Use Dify as an API Provider + +Next we call the knowledge-base agent via API and turn Dify into a model-hub backend. + +Recall how to call model APIs: prepare key + request/response examples from documentation, feed these to an LLM coding assistant, and ask it to generate invocation code and parse desired fields from responses. + +This time we use local code editor [Trae](https://www.trae.cn/). + +If you are not familiar with IDE concepts, read: +[Extra Knowledge 4 - What is AI IDE and Trae](https://github.com/datawhalechina/easy-vibe/blob/main/docs/extra/extra4/extra4-what-is-ai-ide-and-trae.md) + +If your local environment is not fully configured, do not worry. If you trust your coding assistant (whether [z.ai](http://z.ai) or Trae), you can directly send any issue/errors and it will provide resolution guidance. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image78.png) + +The right panel is Copilot/Agent interaction window. If not visible, click top-right sidebar icon to open. 
+ +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image79.png) + +After opening sidebar, you will see `Builder` option. This is Agent mode. You can roughly treat "Builder" as the "development mode" of [z.ai](http://z.ai): it can help with local environment operations, dependency installs, opening webpages, etc. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image80.png) + +Inside Builder, there are "Chat" mode and "Builder with MCP" mode. +Chat mode mainly interacts with current folder and natural-language model chat. +(Open a folder from Trae top-left `File`, then Builder file operations occur inside that folder.) + +Builder with MCP gives Agent more tools (for example connecting to other software, retrieving weather, etc.). You can treat MCP as a capability layer that makes external tool invocation easier for models. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image81.png) + +At the bottom, there is model selection dropdown. You can choose Kimi k2 or GLM. In international Trae, you can select ChatGPT or Claude as well. With fast progress of domestic models, Kimi/Qwen/GLM are now close to Claude 3.5/3.7 for daily dev scenarios. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image82.png) + +That is a brief Trae intro. Next we reuse operational ideas from [z.ai](http://z.ai) inside Trae. + +## 2.9 Build a Frontend Chat App Using Dify API + +To build a frontend chat app with Dify API, first obtain Dify API docs and endpoint. + +Remember the agent we created? Click top-right `Publish`, then `Publish Update`, then `Access API Reference`. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image83.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image84.png) + +In API docs, find `Send Chat Message`, open it, then copy `Request` and `Response` examples on the right. + +Why copy these two parts? Because they are core API information. 
With key + request example + response example, you can ask model to generate invocation code and parse required fields from returned structure. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image85.png) + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image86.png) + +After finding request/response examples, you also need API key. In top-right docs area, find `API key` options. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image87.png) + +Click `Create new Secret key` to create your own key. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image88.png) + +Now everything is ready. Send API key + request example + response example to Trae Builder. + +Note: replace `{DIFY_API_URL}` with your actual Dify API URL. + +```json +key: +app-zKdCHUXXXXXXXX + +Please write me a front-end based on the following reference: + +curl -X POST 'http://{DIFY_API_URL}/v1/chat-messages' \ +--header 'Authorization: Bearer {api_key}' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "inputs": {}, + "query": "What are the specs of the iPhone 13 Pro Max?", + "response_mode": "streaming", + "conversation_id": "", + "user": "abc-123", + "files": [ + { + "type": "image", + "transfer_method": "remote_url", + "url": "https://cloud.dify.ai/logo/logo-site.png" + } + ] +}' + +{ + "event": "message", + "task_id": "c3800678-a077-43df-a102-53f23ed20b88", + "id": "9da23599-e713-473b-982c-4328d4f5c78a", + "message_id": "9da23599-e713-473b-982c-4328d4f5c78a", + "conversation_id": "45701982-8118-4bc5-8e9b-64562b4555f2", + "mode": "chat", + "answer": "iPhone 13 Pro Max specs are listed here:...", + "metadata": { + "usage": { + "prompt_tokens": 1033, + "prompt_unit_price": "0.001", + "prompt_price_unit": "0.001", + "prompt_price": "0.0010330", + "completion_tokens": 128, + "completion_unit_price": "0.002", + "completion_price_unit": "0.001", + "completion_price": "0.0002560", + "total_tokens": 1161, + 
"total_price": "0.0012890", + "currency": "USD", + "latency": 0.7682376249867957 + }, + "retriever_resources": [ + { + "position": 1, + "dataset_id": "101b4c97-fc2e-463c-90b1-5261a4cdcafb", + "dataset_name": "iPhone", + "document_id": "8dd1ad74-0b5f-4175-b735-7d98bbbb4e00", + "document_name": "iPhone List", + "segment_id": "ed599c7f-2766-4294-9d1d-e5235a61270a", + "score": 0.98457545, + "content": "\"Model\",\"Release Date\",\"Display Size\",\"Resolution\",\"Processor\",\"RAM\",\"Storage\",\"Camera\",\"Battery\",\"Operating System\"\n\"iPhone 13 Pro Max\",\"September 24, 2021\",\"6.7 inch\",\"1284 x 2778\",\"Hexa-core (2x3.23 GHz Avalanche + 4x1.82 GHz Blizzard)\",\"6 GB\",\"128, 256, 512 GB, 1TB\",\"12 MP\",\"4352 mAh\",\"iOS 15\"" + } + ] + }, + "created_at": 1705407629 +} +``` + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image89.png) + +At this stage, generated code may not run perfectly in one shot. You may see strange errors or no responses. If that happens, switch model or copy full error details and ask model to iterate based on feedback. + +This working style is already close to real development. In daily collaboration with models, you often need to provide more context to solve issues. Besides error messages, you can copy more doc context (for example from "Send message" docs section) and send together for higher-quality fixes. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image90.png) + +The browser is embedded inside Trae. Click the compass icon at top to open full screen in external browser. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image91.png) + +If you are lucky, first attempt may already yield a functional interactive frontend page. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image92.png) + +Because LLMs are stochastic, a single round may work while multi-turn chat fails. So always do multi-round testing to verify stability in conversational scenarios. 
+ +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image93.png) + +At this point, you can build a simple Dify knowledge-base agent and use Trae (instead of [z.ai](http://z.ai)) to build an interactive frontend. From now on, Trae will become our primary prototyping tool, gradually replacing [z.ai](http://z.ai). You can try re-implementing the snake game in Trae and compare the experience. Keep going. + +# 3. More Business Workflow References + +You can search engines with keywords like `Dify workflow reference`, or find workflow-sharing repositories on GitHub. Quality varies, so compare multiple sources. Remember, workflow is essentially mapping business SOP into executable process. Think about repeated workflows in your daily work or learning that can be solidified. + +Below are AI-generated workflow design references (real implementations are often similar; high-quality human-crafted workflows still require skill). If any idea interests you, send it to a model for deeper refinement into concrete Dify node design and configuration details. + +## 3.1 Social Media Platform Workflows + +1. One-click cross-platform content distribution workflow (complex) + 1. Idea: treat one core draft as "raw material," automatically produce platform-adapted variants. + 2. Implementation: `Start` article input -> `LLM` polish -> parallel `LLM` nodes for platform experts (for example Xiaohongshu viral copy expert, Zhihu professional answerer) -> `Iterator` for platform format rules -> `Variable Aggregator` merge -> `Answer` output all versions. +2. Hot-topic planning and first-draft generator (medium) + 1. Idea: automatically capture trends and quickly generate topic suggestions and drafts. + 2. Implementation: `Start` keyword -> `Tool` search API for trend data -> `LLM` extract 3-5 topics -> `LLM` generate outline/draft. +3. Comment-section intelligent classification and reply assistant (complex) + 1. 
Idea: classify comment sentiment/intent and generate categorized reply suggestions. + 2. Implementation: `HTTP Request` to fetch comments -> `Question Classifier`/`LLM` multi-label classification (positive/question/complaint/spam) -> `Condition` routing -> parallel `LLM` reply drafting -> `Answer`. +4. Short-video script and storyboard auto generator (complex) + 1. Idea: given trend topic/product description, auto-generate script, storyboard, and recommended tags. + 2. Implementation: `Start` topic -> `LLM` script ideation -> second `LLM` scene decomposition (visuals/dialogue/duration) -> `Tool` TTS sample generation -> `Variable Aggregator` merge -> `Answer` structured script. +5. Live-stream interaction QA summarizer (medium) + 1. Idea: process live comments in near real time and summarize key questions/audience sentiment. + 2. Implementation: `HTTP Request` streaming comments -> `Iterator` windowed batches -> `LLM` per-window trend summary -> `Answer`/`Webhook` output to host. + +## 3.2 Workplace Workflows + +1. Intelligent meeting minutes and task auto-assignment system (complex) + 1. Idea: extract minutes from transcript and auto-create tasks. + 2. Implementation: `Start` meeting text -> `LLM` agenda/conclusion summary -> `Parameter Extractor` action items (task/owner/deadline) -> `LLM` format minutes email -> parallel `HTTP Request` Jira/Trello/Feishu task creation. +2. Batch resume screening and initial evaluation assistant (medium) + 1. Idea: parse resumes, evaluate fit, and generate interview questions. + 2. Implementation: `Start` upload resumes + JD -> `Document Extractor` parse text -> `LLM` HR-style matching evaluation -> for high matches, another `LLM` generates deep interview questions. +3. One-click multilingual email translation and draft reply (simple) + 1. Idea: auto-translate incoming email and draft response. + 2. Implementation: `Start` email -> `LLM` language detection + translation -> `LLM` reply points -> `LLM` translate back and polish. 
+4. Weekly/monthly report auto aggregation and insight generation (complex) + 1. Idea: connect multiple data sources and auto-generate structured report. + 2. Implementation: parallel `HTTP Request`/`Tool` calls to CRM/Git/PM APIs -> `Code`/`LLM` data cleaning/calculation -> `LLM` trend/highlight/risk narrative -> `Answer` rich report. +5. Contract/document intelligent review and key-point extraction (medium) + 1. Idea: quickly review legal/business documents, surface risks, and extract key clauses. + 2. Implementation: `Start` contract PDF -> `Document Extractor` text extraction -> `LLM` legal-expert clause review -> `Parameter Extractor` dates/amounts/parties extraction -> `Answer` risk summary + key table. + +## 3.3 Learning and Life Workflows + +1. Academic paper deep analysis and note generator (complex) + 1. Idea: upload paper PDF and auto-generate structured notes. + 2. Implementation: `Start` PDF -> `Document Extractor` full text -> parallel `LLM` summaries (abstract/method/findings/references) -> `Variable Aggregator` merge -> `Answer` markdown notes. +2. Personalized travel planner (medium) + 1. Idea: auto-plan detailed itinerary from user preferences. + 2. Implementation: `Start` destination/days/budget/interests -> `Tool` search/map APIs -> `LLM` daily itinerary with schedule/activities/budget estimates. +3. Interactive foreign-language speaking partner (simple) + 1. Idea: role-play dialogue bot with grammar correction. + 2. Implementation: system role setup -> `Start` user utterance -> `LLM` dual tasks (role reply + grammar correction/explanation) -> `Answer`. +4. Personal knowledge-base QA and related-link recommender (complex) + 1. Idea: build a QA system over your saved docs/notes/links with related old-knowledge recommendations. + 2. 
Implementation: offline indexing with `Document Extractor` + `Embedding`; online flow: `Start` question -> `Retrieval` from vector store -> `LLM` context-grounded answer; parallel branch uses retrieved content and `LLM` to produce related-old-knowledge list -> `Answer` merged output. +5. Fitness/diet tracking and adjustment advisor (medium) + 1. Idea: analyze daily diet/training logs and output nutrition/training suggestions. + 2. Implementation: `Start` text log (for example lunch + training record) -> `Parameter Extractor` structure parsing -> `LLM` fitness-coach analysis of nutrition/training volume -> compare with long-term goals -> micro-adjustment suggestions. + +# 6. Limitations of Workflow Platforms + +Workflow (low-code) platforms are not universal solutions. They are business-friendly and lower direct coding threshold, but from another angle, "low code" can also be "high code": users still need to understand platform concepts, rules, and operation logic. That itself is a learning cost. + +You may ask: many simple workflows are just chained function calls around model APIs. In code, a few lines may solve it. Why use heavy visual wrappers and make API calling more cumbersome? + +That point is valid. With rapid vibe-coding progress and AI code generation, directly reading or generating code can sometimes be more efficient. Ideally, we should be able to manipulate application logic directly in natural language. But current workflow platforms still have an unavoidable "middle layer" between user intent and final implementation. Learning this middle layer takes time. Ideally, future platforms should support full AI dialogue-driven operation for both workflow construction and parameter-level control. + +Even so, becoming proficient in these platforms is increasingly a foundational skill, similar to office software: widely used and practically valuable in business contexts. 
+ +In later advanced courses, we will introduce code-level workflow and RAG development platforms, where you can compare complexity/flexibility tradeoffs across implementation styles. (Also note that many simple dialogue apps and nested logics are still straightforward in workflow form.) + +# 📚 Homework + +## Master Basic Dify Operations + +To verify you understand common Dify operations, complete one basic assignment plus two mini-challenges: + +You need to import the two provided DSL files into Dify workflows and complete the corresponding challenges successfully (if confused, screenshot and ask a model, or explore each parameter yourself until target behavior is reached): + +1. Based on the intent-classification workflow approach, ask a model to suggest a completely different scenario, but you must still use intent classification workflow. Submit workflow runtime screenshot, scenario description, and result. +2. `Log in workflow` decryption challenge: + +In this challenge, make workflow support: + +- Find the correct password. +- Change password to `0925`. +- Provide a second attempt when password is wrong (no third attempt). +- When user asks to log in again, allow password re-entry. + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image94.png) + +Reference input/output: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image95.png) + +3. `Love loop workflow` decryption challenge: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image96.png) + +Fix current workflow issues so final output looks similar to: + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image97.png) + +If you cannot solve a problem, screenshot and ask a model, or check official docs: +[https://docs.dify.ai/en/use-dify/getting-started/quick-start](https://docs.dify.ai/en/use-dify/getting-started/quick-start) + +## Implement Dify API Invocation + +To verify you truly mastered Dify API usage, complete: + +1. 
Deploy Dify and create a simple knowledge base (choose any materials you like). +2. Build a chat frontend in Trae IDE and integrate Dify knowledge base via API. +3. Test multi-turn dialogue behavior and ensure program runs normally. + +Submit final runtime screenshots and KB processing screenshots. + +## Try Third-Party Workflow / Build Your Own Business Workflow + +Find a Dify workflow shared by others on GitHub, WeChat public articles, Reddit, X, etc., import and run successfully; or build your own workflow from business references above based on real needs. + +Finally submit successful runtime screenshot and explain workflow purpose. + +# [Bug] How to Fix HTTP Request Errors + +Only refer to this section if you encounter the issue shown below. Otherwise you can ignore this part. + +Sometimes you deploy Dify on your own server where public endpoint is HTTP (not HTTPS). If you request an HTTP-only service, you may see errors like this (enable browser F12 debug info to inspect): + +![](/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/images/image98.png) + +Root cause: Dify is deployed on a server that supports HTTP but not HTTPS. +HTTPS (HyperText Transfer Protocol Secure) adds SSL/TLS encryption over HTTP, basically a more secure HTTP. + +To support HTTPS, common options are: + +- Forward requests through another service (for example reverse proxy on certificate-enabled nginx), or +- Bind domain and issue TLS certificate. + +These are relatively complex, so here we use Zeabur as network forwarding gateway. + +Zeabur pages are accessed via HTTPS by default. So if you forward the original domain to Zeabur domain, the issue is fixed. + +- Original URL: `http://{DIFY_API_URL}/v1/chat-messages` +- New URL: `https://{DIFY_NEW_API_URL}.zeabur.app/v1/chat-messages` + +You only need to replace URL domain (public IP/domain) with your deployed Zeabur domain. Forwarding is preconfigured in service. + +If interested, you can deploy your own forwarding service on Zeabur. 
Create a Python service and use the following code. After deployment you get an HTTPS endpoint that works normally. + +After deployment, set service listen port to local `8080` and expose this port publicly. + +Note: replace `{DIFY_API_URL}` with your actual Dify API URL. + +```python +from flask import Flask, request, Response +import requests + +app = Flask(__name__) + +TARGET_BASE_URL = "{DIFY_API_URL}" +LISTEN_PORT = 8080 + +@app.route('/', defaults={'path': ''}, methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS', 'HEAD']) +@app.route('/<path:path>', methods=['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS', 'HEAD']) +def proxy_request(path): + target_url = f"{TARGET_BASE_URL}/{path}" + if request.query_string: + target_url += f"?{request.query_string.decode('utf-8')}" + + headers = {key: value for key, value in request.headers if key.lower() not in ['host', 'connection', 'content-length', 'accept-encoding']} + + try: + resp = requests.request( + method=request.method, + url=target_url, + headers=headers, + data=request.get_data(), + cookies=request.cookies, + allow_redirects=False, + timeout=30 + ) + + excluded_headers = ['content-encoding', 'content-length', 'transfer-encoding', 'connection'] + response_headers = [(name, value) for name, value in resp.raw.headers.items() if name.lower() not in excluded_headers] + + return Response(resp.content, resp.status_code, response_headers) + + except requests.exceptions.RequestException as e: + print(f"Error forwarding request to {target_url}: {e}") + return Response(f"Proxy Error: Could not reach target server or invalid response: {e}", status=502) + except Exception as e: + print(f"An unexpected error occurred: {e}") + return Response(f"Internal Proxy Error: {e}", status=500) + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=LISTEN_PORT, debug=False) +``` diff --git a/docs/en/stage-2/assignments/2.1-fullstack-app/index.md b/docs/en/stage-2/assignments/2.1-fullstack-app/index.md new file mode 100644 index
0000000..b93da08 --- /dev/null +++ b/docs/en/stage-2/assignments/2.1-fullstack-app/index.md @@ -0,0 +1,3 @@ +# Build Your First Modern App: Full-Stack Application + +> This chapter is currently being written. Stay tuned... diff --git a/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md b/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md new file mode 100644 index 0000000..6d6b218 --- /dev/null +++ b/docs/en/stage-2/assignments/2.2-modern-frontend-trae/index.md @@ -0,0 +1,3 @@ +# Assignment 2: Modern Frontend Component Library + Trae Practice + +> This chapter is currently being written. Stay tuned... diff --git a/docs/en/stage-2/backend/2.2-database-supabase/index.md b/docs/en/stage-2/backend/2.2-database-supabase/index.md new file mode 100644 index 0000000..7bb1b4b --- /dev/null +++ b/docs/en/stage-2/backend/2.2-database-supabase/index.md @@ -0,0 +1,1747 @@ +# From Database to Supabase + +In the previous lesson, we learned the basics of UI design tools (Mastergo and Figma), how to use GitHub for code retrieval and version control, and how to deploy websites with Zeabur so more people can access our apps. + +To make this lesson easier to connect, let's quickly review the previous core points with a few short questions: + +1. What are frontend design tools, and how do Figma and MasterGo work? +2. What are the basic methods for turning design drafts into code? +3. What is GitHub, how do you configure SSH, and how do you create your first repository? +4. What does deployment mean, how do you use Zeabur, and how do you deploy GitHub/local code to a public network? + +If any of the above still feels blurry, review the previous lesson notes first. You can always ask questions in the WeChat study group. + +In this lesson, we move from "an app that can run" to "an app that looks like a real online product." 
That means not only managing data changes with a database, but also building a complete user system (registration, login, authorization) and other core backend capabilities. We use Supabase as the main path: first implement "database + user system," then use Supabase modules to understand the core components of modern cloud backend services. + +# What you will learn + +1. What data is, what a database is, and common database usage +2. What Supabase is and how to do basic database operations with it +3. How to add basic user management with Supabase +4. Supabase advanced features: realtime, storage, and edge functions +5. How to enable Google and GitHub login for Supabase + +- A basic app that supports user sign-up/sign-in and stores data in an online database +- A reusable Supabase backend template (database + user management, etc.) for future projects + +# 1. What is Database + +## 1.1 What is Data + +In the digital world, data is everywhere. Data is simply the carrier of information: your friend's contact info, a WeChat article, a short video, a game character level. In apps, data is everything that needs to be recorded and managed: user profiles, order history, app settings, and so on. + +In programs, data has different forms. 
The simplest form is variables: + +```python +# Python variable definition examples + +# Integer variable: stores age information +age = 30 + +# Boolean variable: stores status (whether active) +is_active = True # True means active, False means inactive + +# List variable: stores a set of score data +scores = [85, 92, 78, 90] # Contains 4 integer elements representing different scores + +# Dictionary variable: stores multiple related information of a user +user_info = { + "age": 30, # Key "age" corresponds to the value of age + "height": 1.80, # Key "height" corresponds to the value of height (unit: meter) + "login_count": 156 # Key "login_count" corresponds to the value of login times +} +``` + +For more complex data such as user profiles and order history, tables are usually used: + +| user_id | name | email | +| ------- | ----- | ----------------- | +| 1001 | Alice | alice@example.com | +| 1002 | Bob | bob@example.com | + +| order_id | user_id | amount | status | +| -------- | ------- | ------ | --------- | +| 901 | 1001 | 29.99 | completed | +| 902 | 1002 | 15.50 | pending | + +For hierarchical, variable-structure data, JSON is often better. JSON is a universal internet data format that almost all systems can parse. For example, one order may contain multiple items, and each item has its own fields. + +```json +{ + "order_id": 901, + "user_id": 1001, + "amount": 29.99, + "status": "completed", + "items": [ + { "sku": "BG-001", "name": "Beef Burger", "quantity": 1, "price": 18.00 }, + { "sku": "SD-003", "name": "French Fries", "quantity": 1, "price": 6.99 }, + { "sku": "DK-002", "name": "Cola", "quantity": 1, "price": 5.00 } + ], + "shipping_address": { + "street": "123 Tech Park Road", + "city": "Shenzhen", + "zip_code": "518057" + } +} +``` + +There is also vector data. 
After unstructured data (text/images/audio) is processed by AI embedding models, the output is typically a high-dimensional float array: + +`[0.123, -0.456, 0.789, ..., -0.234]` + +In real projects, there are many data shapes and many corresponding storage systems: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image1.png) + +## 1.2 Why We Need Database + +Real-world data is complex. To store and use data efficiently, we need a dedicated system to manage it: this is the purpose of databases. + +A database is a specialized program that organizes, stores, manages, and queries data safely and efficiently. + +Without a database, app data quickly breaks down: + +- once users close the browser, in-memory data disappears +- login state and preferences cannot be persisted +- key shared data (inventory, orders) cannot be coordinated across users + +Databases can be deployed locally or in the cloud. Cloud databases support elastic scaling and can handle high concurrency and larger data volume. + +Core problems databases solve: + +- **Persistent storage**: data survives app restarts +- **Efficient query and analysis**: SQL supports filtering, aggregation, analysis +- **High performance and high concurrency**: indexing, caching, pooling, distributed architecture +- **Integrity and consistency**: constraints, uniqueness, data validity guarantees +- **Security and recovery**: authentication, authorization, encryption, backup/restore + +## 1.3 Relational Database VS Non-Relational Database (NOSQL) + +In practice, you typically choose between relational databases and NoSQL databases. + +Relational databases are like strictly structured spreadsheets. You define schema in advance (field types and rules) and connect tables by relational keys. This is highly reliable and great for scenarios such as finance and inventory where correctness is critical, but schema changes can be less flexible. + +NoSQL databases are more like flexible containers. 
They can store documents, key-value data, and changing structures without fixed schema upfront. They are easier to scale for rapidly changing and large-volume internet scenarios, but they trade off some relational query power and strict consistency. + +In typical usage: + +- relational DBs: transactions, inventory, order systems, accounting, strong consistency +- NoSQL DBs: social content, logs, IoT high-write streams, recommendation features + +In early-stage startups, you usually do not need to over-optimize database type at day one. Mature cloud providers already offer strong defaults. In real business settings, teams usually match business needs with vendor support first, then optimize later. + +You can also refer to cloud vendor database selection guides, such as: +[Aliyun database selection recommendation](https://help.aliyun.com/zh/govcloud/getting-started/select-database-services) + +| Database Type | Database | Price | Typical Scenarios | +| ------------ | ---------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Relational | RDS MySQL | Low | Basic: learning and small websites. HA: medium pressure business scenarios. Cluster: no-interruption and heavier traffic | +| | RDS SQL Server | High | Basic: testing and small commercial sites. HA: enterprise websites. Cluster: no-interruption enterprise business | +| | RDS PostgreSQL | Lowest | Basic: learning and small websites. HA: medium business pressure. 
Cluster: heavy access and often better performance than common MySQL setups | +| | RDS PPAS | High | General and dedicated enterprise Oracle-compatible scenarios | +| | DRDS | Medium | Entry to enterprise and high-concurrency online business | +| NoSQL | Redis | Medium | Hot standby persistent data and cache acceleration under read pressure | +| | MongoDB | Medium | Single node for dev/test, replica set for read-heavy scenarios, sharded clusters for high-scale online workloads | + +Let's use one concrete "blog platform" example to compare SQL and NoSQL storage models. + +Assume we need: + +- Users: id, username, email +- Posts: id, title, content, author_id +- Comments: id, content, commenter_id, post_id +- Tags: id, name +- Post-tag many-to-many relationships + +### Relational database (SQL) example + +In SQL, we normalize entities into separate tables and connect with foreign keys. + +- `users` table + +| user_id (PK) | username | email | +| -------------- | -------- | ----------------- | +| 101 | Alice | alice@example.com | +| 102 | Bob | bob@example.com | + +- `posts` table + +| post_id (PK) | title | content | author_id (FK) | +| -------------- | --------- | ------------------------------ | ---------------- | +| 1 | SQL Intro | This is an article about SQL... | 101 | +| 2 | NoSQL Intro | NoSQL provides flexible models... | 102 | + +- `comments` table + +| comment_id (PK) | body | commenter_id (FK) | post_id (FK) | +| ----------------- | ---------------- | ------------------- | -------------- | +| 1001 | Great article! | 102 | 1 | +| 1002 | Learned a lot. | 101 | 2 | +| 1003 | Any more examples? 
| 101 | 1 | + +- `tags` table + +| tag_id (PK) | tag_name | +| ------------- | -------- | +| 51 | database | +| 52 | technology | +| 53 | beginner | + +- `post_tags` table (many-to-many relation) + +| post_id (FK) | tag_id (FK) | +| -------------- | ------------- | +| 1 | 51 | +| 1 | 52 | +| 2 | 51 | +| 2 | 52 | +| 2 | 53 | + +To fetch complete post information (post + author + comments + tags), we use multi-table joins: + +```sql +SELECT + p.title, + p.content, + u.username AS author, + c.body AS comment, + t.tag_name AS tag +FROM + posts p +JOIN + users u ON p.author_id = u.user_id +LEFT JOIN + comments c ON p.post_id = c.post_id +LEFT JOIN + post_tags pt ON p.post_id = pt.post_id +LEFT JOIN + tags t ON pt.tag_id = t.tag_id +WHERE + p.post_id = 1; +``` + +This is SQL's strength: flexible complex queries with consistency and low redundancy. + +### NoSQL database (NoSQL) example + +In NoSQL document databases (for example MongoDB), related business data is often aggregated into a single document, reducing joins at read time. + +A sample document in `posts`: + +```json +{ + "_id": 1, + "title": "SQL Intro", + "content": "This is an article about SQL...", + "author": { + "user_id": 101, + "username": "Alice", + "email": "alice@example.com" + }, + "tags": [ + "database", + "technology" + ], + "comments": [ + { + "comment_id": 1001, + "body": "Great article!", + "commenter": { + "user_id": 102, + "username": "Bob" + } + }, + { + "comment_id": 1003, + "body": "Any more examples?", + "commenter": { + "user_id": 101, + "username": "Alice" + } + } + ] +} +``` + +The advantage is obvious: one lookup can return full business context. + +The trade-off is data redundancy. If `username` changes, many documents may need updates. In read-heavy scenarios (blogs, product pages), this trade-off is often acceptable for faster reads. In write-heavy scenarios, you need careful design trade-offs. 
+ +If you want to explore more databases: + +Examples of SQL databases: +[Db2](https://www.ibm.com/products/db2-database), [MySQL](https://cloud.ibm.com/catalog#highlights), [PostgreSQL](https://www.ibm.com/think/topics/postgresql), [YugabyteDB](https://www.yugabyte.com/), [CockroachDB](https://www.cockroachlabs.com/), [Oracle Database](https://www.ibm.com/products/postgres-enterprise), [Azure SQL Database](https://www.ibm.com/consulting/microsoft) + +Examples of NoSQL databases: +[Redis](https://www.ibm.com/think/topics/redis), [CouchDB](https://www.ibm.com/think/topics/couchdb), [MongoDB](https://www.ibm.com/think/topics/mongodb), [Cassandra](https://cloud.ibm.com/catalog#highlights), [Elasticsearch](https://www.ibm.com/think/topics/elasticsearch), [BigTable](https://www.techtarget.com/searchdatamanagement/news/252512583/Google-scales-up-Cloud-Bigtable-NoSQL-database), [Neo4j](https://neo4j.com/users/ibm/), [HBase](https://www.ibm.com/think/topics/hbase) + +# 2. Supabase + +Above, we discussed database categories and usage. But in real projects, a database is only one backend module. You also need sign-in/sign-up, permissions, file upload/storage, APIs, scheduled jobs, realtime notifications, and more. + +That broader context is **backend services**. A complete app is usually frontend + backend. In traditional workflows, teams had to build servers, configure databases, design APIs, implement security, and maintain operations manually. + +To reduce repeated backend groundwork, the industry created **BaaS (Backend as a Service)**: package common backend capabilities (DB/auth/storage/realtime, etc.) as cloud services that developers can call directly via SDK/API. + +[Supabase](https://supabase.com/) is a modern BaaS representative. It uses PostgreSQL as the core and integrates Auth, Storage, Realtime, Edge Functions, Vector, and more into a "Postgres-centered one-stop backend platform." 
+ +Next, we move from "choosing only a database" to "choosing a complete backend development platform." + +## 2.1 Step by Step Guide + +After understanding Supabase's positioning, let's walk along the Supabase console path and break down each capability and responsibility. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image2.png) + +After signing in at Supabase and clicking **New project**: + +- set project name +- set DB password +- choose region near your target users + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image3.png) + +After creation, the left sidebar shows key modules: Table Editor, SQL Editor, Database, Authentication, and so on. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image4.png) + +### Table Editor + +Table Editor is Supabase's visual data table editor. You can inspect and edit DB data without writing SQL, similar to spreadsheet interaction. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image5.png) + +The key concept here is **Schema**. + +Schemas are resource containers for tables, views, functions, indexes, etc. They help with: + +- avoiding naming conflicts +- permission isolation + +In daily development, most people mainly use: + +- `public`: default business tables (posts/comments/orders/etc.) +- `auth`: authentication tables (for example `auth.users`), usually do not edit built-in auth schema tables manually + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image6.png)![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image7.png) + +### SQL Editor + +SQL Editor is the SQL execution console. You can run model-generated SQL directly and inspect results quickly. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image8.png) + +After executing SQL, you can view new tables in Table Editor (`public` schema). Executed SQL is also saved in the left private history, and can be starred. 
+ +### Database + +Database is the management center where you inspect tables and relationships (foreign key constraints) visually. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image9.png) + +You can also create tables manually in `Database -> Tables`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image10.png) + +### Authentication + +Authentication manages sign-up/sign-in and permissions. It supports registration, login, password reset, email verification, and OAuth providers (Google/GitHub/others). User data is synced automatically into `auth.users`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image11.png) + +Provider options are visible in the Provider panel. By default, email login is enabled. For GitHub/Google login, extra provider config is required. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image12.png) + +In `Sign In / Providers`, you can configure registration behavior (for example, whether email confirmation is required). + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image13.png) + +You can also use third-party auth systems in `Third Party Auth` (for example Clerk). + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image14.png) + +You can enable rate-limiting policies in `Rate Limits` to control abusive traffic. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image15.png) + +### Storage + +Storage is Supabase file storage and is S3-compatible in concept. It stores files (images/videos/docs/audio), supports public/private access control, and supports permanent/temporary link generation. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image16.png) + +We cover concrete usage in later project sections. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image17.png) + +If needed, you can operate via S3-compatible settings. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image18.png) + +> Amazon Cloud (AWS) is a cloud platform. 
S3 is AWS's object storage service and has effectively become an industry standard for object storage APIs. +> +> **Why S3-compatible APIs matter:** there is a large ecosystem of SDKs/tools/docs. Compatibility dramatically reduces integration cost. + +### Edge Functions + +If you do not want to self-host a full backend, but still need secure server-side logic, use Edge Functions. They are globally distributed server functions managed by Supabase. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image19.png) + +A core use case is secure API proxying. Never expose sensitive keys (OpenAI/Stripe/etc.) in frontend code. Instead: + +- frontend calls your Supabase function +- function securely uses secrets stored in Supabase + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image20.png) + +Function secrets are injected as environment variables (for example through `Deno.env.get`), so keys are never exposed to browsers. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image21.png) + +Minimal Edge Function request example: + +```javascript +// Core config (replace with your own values) +const projectId = "your Supabase project ID"; +const functionName = "target Edge Function name"; +const supabaseKey = "Supabase anon_key"; + +async function callEdgeFunction() { + const url = `https://${projectId}.supabase.co/functions/v1/${functionName}`; + + try { + const response = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + "Authorization": `Bearer ${supabaseKey}` + }, + body: JSON.stringify({ order_id: "123", action: "refund" }) + }); + + const result = await response.json(); + console.log("Success:", result); + } catch (error) { + console.error("Failed:", error.message); + } +} + +callEdgeFunction(); +``` + +Edge Functions integrate with Supabase auth sessions and RLS. They can identify current users and operate with your security model. 
+ +Typical scenarios: + +- third-party webhooks +- email notifications +- PDF generation +- custom API endpoints and business rules + +Example: Clerk only manages auth identity. If you need user data synchronized into business tables, you can listen to Clerk webhooks via Edge Functions and write into Supabase automatically. + +### Realtime + +Realtime allows clients to receive DB changes instantly through WebSocket instead of polling. + +It includes: + +1. **Postgres Changes**: subscribe to row-level `INSERT`/`UPDATE`/`DELETE` +2. **Broadcast**: low-latency temporary channel messages +3. **Presence**: online status tracking/synchronization + +We will use it in project-based sections later. + +### Project Settings + +Project Settings is for deeper resource and parameter configuration. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image22.png) + +At the beginner stage, focus on: + +1. **Data API**: your Supabase URL (`https://xxx.supabase.co`) +2. **API Keys**: anon key vs service_role key + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image23.png) + +`anon` is for restricted client access under RLS. `service_role` is a high-privilege server-side key and must never be exposed publicly. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image24.png) + +## 2.2 Create Your First SQL Table + +After understanding the console, let's move to core DB operations. + +There are two common ways to create tables in Supabase: + +1. (recommended) generate SQL via LLM and run it in SQL Editor +2. visual creation via `Database -> Tables -> New table` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image25.png) + +You can define table name and column types in `Columns`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image26.png) + +Relational DBs rely on table relationships. Configure relations in `Foreign keys`. 
+ +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image27.png) + +Example (student table referencing class table): + +```sql +CREATE TABLE students ( + student_id INT PRIMARY KEY, + student_name VARCHAR(50), + class_id INT, + FOREIGN KEY (class_id) REFERENCES classes(class_id) +); +``` + +Visualized example: + +Classes table: + +| class_id | class_name | +| -------- | ---------- | +| 101 | Grade 1 Class 1 | +| 102 | Grade 1 Class 2 | + +Students table: + +| student_id | student_name | class_id | +| ---------- | ------------ | -------- | +| 2024001 | Zhang San | 101 | +| 2024002 | Li Si | 102 | +| 2024003 | Wang Wu | 101 | + +In Supabase, after adding a foreign key, choose referenced table and column directly. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image28.png) + +## 2.3 SQL Editor Basics and Core Database Operations + +Now we run a series of SQL scripts and practice CRUD step by step. + +All sample SQL files are available here: + +https://github.com/THU-SIGS-AIID/Project5-Supabase-Demos/tree/main/apps/sql-examples + +### 2.3.1 `CREATE` - Create the Table Structure + +`CREATE TABLE` defines schema, columns, data types, and constraints. + +```sql +-- Step 1: Create the 'orders' table +-- This file is fully independent and creates a sample table for later steps. +CREATE TABLE IF NOT EXISTS orders ( + id serial PRIMARY KEY, + user_id int NOT NULL, -- User ID + status text NOT NULL, -- Order status (e.g. paid, pending) + amount numeric(10, 2) NOT NULL, -- Order total amount + details jsonb, -- Item and extra details as JSON + placed_at timestamptz DEFAULT now(), -- Order creation time + is_paid boolean DEFAULT false -- Paid flag +); +``` + +After execution, check Table Editor: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image29.png) + +### 2.3.2 `INSERT` - Populate Initial Data + +After creating the table structure, the next step is to use `INSERT INTO` to add data rows into the table. 
+ +```sql +-- Step 2: Insert initial rows into the orders table +-- Provides realistic, varied data for demo/testing. All values are self-contained. +INSERT INTO orders (user_id, status, amount, details, placed_at, is_paid) VALUES + (2001, 'pending', 23.50, '{"items":[{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00}]}', now() - interval '2 days', false), + (2002, 'paid', 50.00, '{"items":[{"sku":"BGR002","name":"Chicken Burger","qty":2,"price":10.00},{"sku":"DRK001","name":"Lemonade","qty":2,"price":5.00}]}', now() - interval '1 day', true), + (2003, 'cancelled', 15.00, '{"items":[{"sku":"FRY001","name":"French Fries","qty":3,"price":5.00}], "reason":"Not available"}', now() - interval '45 days', false), + (2004, 'paid', 22.98, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":2,"price":9.99}], "promo":"SUMMER22"}', now() - interval '10 days', true), + (2005, 'pending', 18.75, '{"items":[{"sku":"SAL001","name":"Salad","qty":1,"price":6.75},{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00}]}', now() - interval '7 hours', false), + (2006, 'paid', 8.00, '{"items":[{"sku":"DRK002","name":"Cola","qty":2,"price":4.00}]}', now() - interval '3 hours', true), + (2007, 'refunded', 14.50, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":1,"price":9.99},{"sku":"FRY001","name":"French Fries","qty":1,"price":4.51}], "refund_reason":"Late delivery"}', now() - interval '15 days', false), + (2008, 'paid', 26.99, '{"items":[{"sku":"BGR002","name":"Chicken Burger","qty":2,"price":10.00},{"sku":"DRK001","name":"Lemonade","qty":1,"price":6.99}]}', now() - interval '12 days', true), + (2009, 'pending', 9.99, '{"items":[{"sku":"BGR003","name":"Veggie Burger","qty":1,"price":9.99}]}', now() - interval '30 minutes', false), + (2010, 'paid', 19.89, '{"items":[{"sku":"BGR001","name":"Beef Burger","qty":1,"price":12.00},{"sku":"DRK002","name":"Cola","qty":2,"price":3.95}]}', now() - interval '5 days', true), + (2011, 'cancelled', 0.00, '{"items":[], 
"reason":"User cancelled"}', now() - interval '2 days', false); + +-- Expected Output: +-- After running this script, SELECT * FROM orders will show 11 rows with varied user_id, status, amount, details (JSON), placed_at, and is_paid fields. +-- For example: +-- | id | user_id | status | amount | is_paid | placed_at | +-- |----|---------|-----------|--------|---------|---------------------| +-- | 1 | 2001 | pending | 23.50 | false | 2025-10-28 13:40:00Z| +-- | 2 | 2002 | paid | 50.00 | true | ... | +-- |... | ... | ... | ... | ... | ... | +``` + +After the script executes successfully, initial data is now inserted into the table. You can refresh Table Editor to see the result, or open a new SQL Editor tab and run `SELECT * FROM orders;` to view it directly: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image30.png) + +### 2.3.3 `SELECT` - Read and Query Data + +`SELECT` is used to query, filter, and format data: + +```sql +-- Example 1: Select all fields for all orders +SELECT * FROM orders; + +-- Example 2: Select only pending orders +SELECT id, user_id, amount FROM orders WHERE status = 'pending'; + +-- Example 3: Select paid orders +SELECT id, status, is_paid, amount FROM orders WHERE is_paid = true; + +-- Example 4: Extract JSON item list +SELECT id, details -> 'items' AS item_list FROM orders; +``` + +Example 2 result: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image31.png) + +Example 3 (paid orders): + +| id | status | is_paid | amount | +| --- | ------ | ------- | ------ | +| 2 | paid | true | 50.00 | +| 4 | paid | true | 22.98 | +| 6 | paid | true | 8.00 | +| 8 | paid | true | 26.99 | +| 10 | paid | true | 19.89 | + +Example 4 (JSON array extract): + +| id | item_list | +| --- | -------------------------------------------------------------------------------------------------------------------- | +| 1 | `[{"qty":1,"sku":"BGR001","name":"Beef Burger","price":12}]` | +| 2 | `[{"qty":2,"sku":"BGR002","name":"Chicken 
Burger","price":10},{"qty":2,"sku":"DRK001","name":"Lemonade","price":5}]` | +| 3 | `[{"qty":3,"sku":"FRY001","name":"French Fries","price":5}]` | +| ... | ... | + +### 2.3.4 `INSERT` - Insert a Single Row + +In 2.3.2, we demonstrated batch initialization inserts at the beginning. Now let's see how to insert a single new row. + +```sql +-- Step 4: INSERT a new order (single row) +-- Example: Add a new paid order for user 2012 containing an AIID Burger item +INSERT INTO orders (user_id, status, amount, details, is_paid) +VALUES ( + 2012, 'paid', 9.99, + '{"items":[{"sku":"BGR002","name":"AIID Burger","qty":100,"price":1000}]}', + true +); +-- Expected Output: +-- Before (table fragment): +-- | id | user_id | status | amount | is_paid | +-- | ...| ... | ... | ... | ... | +-- +-- After (last row): +-- | id | user_id | status | amount | is_paid | +-- | xx | 2012 | paid | 9.99 | true | +-- (where xx = next serial value) +``` + +Now run `SELECT * FROM orders;` again. You will see the `orders` table increase successfully from 11 rows to 12 rows. + +### 2.3.5 `UPDATE` - Modify Existing Data + +In practical work, we frequently update table data. We can use `UPDATE` to modify existing records in a table. + +```sql +-- Step 5: UPDATE example +-- Example: Mark order with id=1 as paid and update its status +UPDATE orders SET status = 'paid', is_paid = true WHERE id = 1; +-- Expected Output: +-- Before (row with id=1): +-- | id | status | is_paid | +-- | 1 | pending | false | +-- After (row with id=1): +-- | id | status | is_paid | +-- | 1 | paid | true | +-- All other rows remain unchanged. +``` + +### 2.3.6 `DELETE` - Delete Data + +`DELETE` can be used to remove records from a table, and with conditions, it can target only a specific subset of data. 
+ +```sql +-- Step 6: DELETE example +-- Example: Delete orders older than 2 days to clean up old data +DELETE FROM orders WHERE placed_at < now() - interval '2 days'; +-- Expected Output: +-- Before (filtered for affected rows): +-- | id | status | placed_at | +-- | 3 | cancelled | 2025-10-13 ... | <-- will be deleted +-- +-- After: +-- No such rows remain. SELECT * FROM orders WHERE placed_at < now()-interval '2 days' yields zero rows. +-- Other rows in orders table are unaffected. +``` + +Before executing, you can run `SELECT id, status, placed_at FROM orders WHERE placed_at < now() - interval '2 days';` to inspect the rows matching the condition. After running `DELETE`, execute the same query again: `SELECT id, status, placed_at FROM orders WHERE placed_at < now() - interval '2 days';`. It should return an empty result, which means those rows were deleted successfully. + +## 2.4 RLS (Row Level Security) + +After basic CRUD, we need one key security concept: **RLS (Row Level Security)**. + +RLS solves data isolation: + +- user A should see only user A's rows +- user B should not access user A's private rows + +For example, in `orders`, define a policy: users can read only rows whose `user_id` matches the currently authenticated user. + +Once RLS is enabled, every `SELECT`/`INSERT`/`UPDATE`/`DELETE` request is checked against the table's policies: rows that match no policy are hidden from reads and cannot be modified, and non-conforming inserts are rejected. + +Supabase provides `auth.uid()` to reference the current authenticated user id, making policy writing straightforward. + +You can configure policies in the Supabase RLS UI: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image32.png) + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image33.png) + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image34.png) + +In practice, policies are often created in initialization SQL: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image35.png) + +# 3. The First SQL Application + +Now we move to practical project exercises. 
We use a burger-shop scenario to practice Supabase end to end: DB initialization, app connection, auth, and RLS behavior. + +## 3.1 Clone and Run Supabase Demos + +Clone the demo repository: + +https://github.com/THU-SIGS-AIID/Project5-Supabase-Demos + +If you already configured SSH keys, prefer SSH clone: + +`git@github.com:THU-SIGS-AIID/Project5-Supabase-Demos.git` + +If network/SSH has issues, use **Download ZIP**. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image36.png) + +After cloning, ask Trae or Claude Code to run a target project directory directly. + +## 3.2 Project1 - burger-shop-menu-crud + +In `project-burger-shop-menu-crud-1`, we initialize Supabase with SQL scripts and connect frontend reads/writes to Supabase. + +### Create a Database Using Scripts + +First, we need to create the required tables in Supabase. In the Project1 directory, there is a folder named `scripts`, which contains one database script file `init.sql`. It can automatically create all related database resources (including table schemas and initial data). We will frequently use this file later to initialize tables in the database. + +```sql +...... + +-- ============================================================================ +-- 2. 
Create Menu Items Table +-- ============================================================================ + +create table if not exists public.menu_items ( + id uuid primary key default gen_random_uuid(), + name text not null, + description text, + category text check (category in ('burger','side','drink')) default 'burger', + price_cents int not null check (price_cents > 0), + available boolean default true, + emoji text, + created_at timestamptz not null default now(), + updated_at timestamptz not null default now() +); + +-- Comments for documentation +comment on table public.menu_items is 'Burger shop menu items for CRUD demo'; +comment on column public.menu_items.id is 'Unique identifier for each menu item'; +comment on column public.menu_items.name is 'Display name of the menu item'; +comment on column public.menu_items.description is 'Detailed description of the menu item'; +comment on column public.menu_items.category is 'Category: burger, side, or drink'; +comment on column public.menu_items.price_cents is 'Price in cents (integer) to avoid floating point issues'; +comment on column public.menu_items.available is 'Whether the item is currently available for order'; +comment on column public.menu_items.emoji is 'Optional emoji representation of the menu item'; +comment on column public.menu_items.created_at is 'Timestamp when the item was created'; +comment on column public.menu_items.updated_at is 'Timestamp when the item was last updated'; + +...... +``` + +After running the initialization SQL script in SQL Editor, you can see the created tables in Table Editor. The specific execution logic of the database initialization code is: + +1. Create the `menu_items` table: +2. This table stores all items in the burger shop menu. It includes fields such as `name` (product name), `description`, `price_cents` (price in cents to avoid floating-point precision issues), `category`, and `available` (whether it is currently sellable). 
This covers the information required by a menu item. +3. Create the `promo_codes` table: +4. This table manages promotions such as discount codes. It defines fields like `code`, `discount_type` (percentage or fixed amount), and `discount_value`. +5. Disable Row Level Security (RLS): +6. For convenience during development and testing, RLS is explicitly disabled in the script. But based on the RLS core logic we learned earlier: RLS is a key security capability in Supabase, and can precisely control "who can access/modify which data" through policies (for example, only admins can edit promo codes while regular users can only view menus). Therefore, in production, you must enable RLS and configure proper policies to block unauthorized access at the data layer. +7. Insert seed data: +8. To let the frontend display realistic menu and promo data right after startup (without manual test-data entry), the `init.sql` script also inserts seed data into `menu_items` and `promo_codes`. For example, you can see various burgers, sides, drinks, and multiple discount codes. + +### Set up the connection with database + +Once the database is ready, we need to connect this frontend project with Supabase so it can read data normally. We need to place the Supabase project URL and anon key into the expected configuration. This project provides two flexible approaches: + +1. Configure via environment variables + +Create a `.env` file in the project root and fill in your Supabase credentials: + +``` +NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co +NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key +``` + +2. Configure directly in the project page + +To make quick demos and switching among different Supabase projects easier, the homepage provides a Settings button in the upper-right corner. You can click it and directly input or paste the Supabase URL and anon key in the popup modal. 
+ +After clicking "Save", this information is used to dynamically create a Supabase client instance, similar to the following code: + +Client creation example: + +```JavaScript +import { createClient, type SupabaseClient } from '@supabase/supabase-js'; + +export function maybeCreateBrowserClient(): SupabaseClient | null { + const url = process.env.NEXT_PUBLIC_SUPABASE_URL; + const anon = process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY; + if (!url || !anon) return null; + return createClient(url, anon); +} +``` + +After creating the database and filling the Supabase link configuration, you can see an interface like the following. You can try CRUD operations on products and observe corresponding table changes in Supabase. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image37.png) + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image38.png) + +### 📚 Assignment + +1. Try adding and deleting items, then inspect changes in Table Editor. + +## 3.4 Project2 - burger-shop-auth-users + +Project1 focuses on menu CRUD and DB connection. Project2 adds user authentication and RLS permission control. + +The login page supports email/password registration and sign-in via Supabase Auth native methods: + +```javascript +const { error: err } = await supabaseClient.auth.signUp({ + email, + password, + options: { + data: { + full_name: fullName || null, + birthday: birthday || null, + avatar_url: avatarUrl || null + } + } +}); +``` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image39.png) + +After login, Supabase creates session automatically. With RLS, each user only sees their own account data. + +Initialize with `init.sql` first (if initialization fails, clean old tables or recreate the Supabase project). 
+ +After sign-up and email verification, you can enter the shop UI: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image40.png) + +To access the admin UI, change the corresponding role field to `admin` in the database: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image41.png) + +By default, each new email sign-up requires email confirmation. You can disable forced confirmation in `Authentication -> Sign In / Providers -> Confirm email`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image42.png) + +### 📚 Assignment + +1. Claim the starter pack and complete the purchase flow. +2. Locate the role-related table and set the role to `admin`, then modify product quantities on the admin page. +3. Locate the wallet balance table and modify values to increase the remaining wallet amount. + +# 4. Build Your First Supabase App + +Now that you understand DB operations, auth, and RLS, build your own app with database + user login. + +## 4.1 A Standardized Process for Connecting Any App to a Supabase Database + +Use this standardized process: + +1. Clarify requirements and tell AI clearly. + 1. Describe app function and required DB behavior (for example: local React Todo needs cloud sync with Supabase). + 2. Add constraints if needed (timestamp format, money precision, per-user visibility). + 3. Review AI output and correct missing fields. +2. Ask AI to generate `init.sql` based on confirmed schema; run in SQL Editor; if errors, feed error back and iterate. +3. Ask AI to refactor code according to SQL schema and communication logic. +4. Configure Supabase URL/key and test end-to-end. + 1. run app and test DB interactions + 2. inspect Table Editor sync behavior + 3. if failures occur, report exact symptoms to AI and iterate + +For auth pages, ask AI directly to integrate email sign-up/sign-in and define page routing expectations. + +You can also ask AI to migrate implementation patterns from an existing project path directly. 
+ +## 4.2 Case Study: Build an Online Snake Game + +Following the SOP above, use `Project5-Supabase-Demos/apps_snakegame` as concrete practice: add leaderboard + user auth. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image43.png) + +### 4.2.1 Analyze the Project and Identify Data Requirements + +First, similar to the standardized process above, we can clarify requirements with AI and let AI provide a corresponding modification plan based on our project and requirements. We then implement based on that plan. + +**You can use the following prompt to guide AI:** + +> "I have a snake game. The directory is at {paste the absolute path of the snake game here}. Now I want to add an online leaderboard with Supabase, and also support a user login system. The leaderboard should display rankings by username and email. +> +> Please help me analyze what tables I need to create to implement this feature. What fields should each table include?" + +You will then get a response similar to: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image44.png) + +### 4.2.2 Generate the `init.sql` Script + +Then ask AI to generate `scripts/init.sql` for Supabase initialization: + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image45.png) + +### 4.2.3 Refactor the Project Code + +Then ask AI to refactor game code for: + +- leaderboard as independent page +- auth via email +- registration/login required before game + +If conversation context gets too long, start a fresh chat and pass `init.sql` as context. + +If auth is unstable, reference: + +`Project5-Supabase-Demos/apps/project-burger-shop-auth-users-2` + +Successful result criteria: + +- users can register and sign in +- signed-in users can view leaderboard correctly + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image46.png) + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image47.png) + +### 📚 Assignment + +1. Integrate user auth into snake game demo. +2. Integrate user auth into your own application. + +# 5. 
Become Supabase Master + +The above covered basic operations. Next are advanced concepts and features: why Supabase is selected in this curriculum, and how to implement more complex interactions. + +You do not need to master everything immediately. Learn on demand as projects require. + +## 5.1 Why We Choose Supabase + +Why choose Supabase among many backend options? + +Startups face a common tension: + +- want full backend control +- must ship quickly + +Self-building backend from scratch often consumes months (DB/realtime/auth/API/storage/jobs/monitoring, etc.). Supabase packages these capabilities into ready-to-use services, letting teams focus scarce time on product features instead of infrastructure. + +Supabase alternatives exist (PocketBase, Appwrite, etc.), but Supabase is often stronger for full SQL ecosystem maturity and community scale. + +Compared with closed systems like Firebase, Supabase's open-source approach reduces vendor lock-in risk and supports self-hosting. + +Selection is context-dependent: + +- tiny personal experiments: ultra-light tools may be enough +- enterprise compliance scenarios: specialized enterprise identity stack may fit better +- MVP and early growth: Supabase is often sufficient and can scale with integrations (Stripe, Resend, Cloudflare, etc.) + +## 5.2 Google & GitHub Login Support + +Earlier we covered email sign-up/sign-in. In production UX, social login usually improves conversion and user convenience. + +This section explains full details for Google and GitHub OAuth and password reset. + +Reference project: +`Project5-Supabase-Demos/apps/project-burger-shop-auth-advanced-supabase-6` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image48.png) + +### 5.2.1 OAuth Flow: How Does Third-Party Login Work? + +Third-party login uses OAuth 2.0. Its essence is delegated authorization: users grant limited profile access without exposing provider passwords to your app. + +Typical flow: + +1. user clicks Google sign-in button +2. 
user is redirected to Google authorization page +3. user consents; Google returns one-time authorization code via callback URL +4. Supabase backend exchanges code for access token +5. Supabase fetches profile, creates/links account, and establishes session + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image49.png) + +### 5.2.2 Configure Google Cloud to Get a Client ID and Secret + +No matter which third-party login method you use, you normally need to configure a Client ID and Client Secret. For Google login, you first need to create an OAuth 2.0 Client ID in Google Cloud Platform to obtain these values. + +1. **Enter Google Cloud Console**: +2. Visit [Google Cloud Console](https://console.cloud.google.com/). +3. Create a new project or select an existing one. +4. **Configure OAuth consent screen**: +5. In the left navigation, go to `APIs & Services` -> `OAuth consent screen`. +6. Select the `External` user type, then click `Create`. +7. Fill required information such as app name and user support email. +8. In `Authorized domains`, add your Supabase project domain in the format `*.supabase.co`. +9. Save and continue. In the `Scopes` and `Test users` steps, you can skip for now and save directly. +10. **Create credentials**: +11. Go to `APIs & Services` -> `Credentials`. +12. Click `+ CREATE CREDENTIALS`, then select `OAuth client ID`. +13. Select `Web application` for `Application type`. +14. Give it a name, for example `Supabase Auth`. +15. In `Authorized redirect URIs`, click `ADD URI` and fill your Supabase callback URL. You can find this URL in Supabase Dashboard at `Authentication` -> `Providers` -> `Google`. The format is usually `https://your-project.supabase.co/auth/v1/callback`. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image50.png) +16. Click `CREATE`. +17. **Get Client ID and Client Secret**: +18. After creation succeeds, a popup shows your **Client ID** and **Client Secret**. Be sure to copy and store them immediately. 
+ +### 5.2.3 Configure GitHub to Get a Client ID and Secret + +Similarly, you need to register an OAuth application on GitHub. + +1. **Enter GitHub Developer Settings**: + 1. Sign in to your GitHub account. + 2. Click your avatar in the upper-right corner and enter `Settings`. + 3. At the bottom of the left navigation, find `Developer settings`. + +2. **Register a new application**: +3. Select `OAuth Apps`, then click `New OAuth App`. +4. Fill in an app name, for example `My Burger Shop`. +5. **Homepage URL**: fill your online app URL, or local development URL `http://localhost:3000`. +6. **Authorization callback URL**: fill in your Supabase project callback URL. You can find it in Supabase Dashboard at `Authentication` -> `Providers` -> `GitHub`. The format is `https://your-project.supabase.co/auth/v1/callback`. +7. Click `Register application`. +8. **Get Client ID and Client Secret**: +9. After registration, the page displays your **Client ID**. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image51.png) +10. Click `Generate a new client secret` to generate your **Client Secret**. Again, copy and store it immediately. + +### 5.2.4 Configure the Providers in Supabase + +Now configure the credentials you obtained in Supabase. + +1. **Enter Supabase Dashboard**: +2. Select your project, then go to `Authentication` -> `Providers`. +3. **Enable and configure Google**: +4. Find `Google` and enable it. +5. Paste the **Client ID** and **Client Secret** from Google Cloud into the corresponding fields. +6. Click `Save`. +7. **Enable and configure GitHub**: + 1. Find `GitHub` and enable it. + 2. Paste the **Client ID** and **Client Secret** from GitHub into the corresponding fields. + 3. Click `Save`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image52.png) + +At this point, your website can already support third-party account login. 
You can directly ask AI to use `Project5-Supabase-Demos/apps/project-burger-shop-auth-advanced-supabase-6` as reference and add user login support to your own project, integrating both GitHub and Google authentication with minimal cost. + +### 5.2.6 Implementing Password Reset + +Password reset is a core production auth feature. + +Reference project includes full implementation: +`project-burger-shop-auth-advanced-supabase-6` + +Core flow: + +1. user enters email; frontend calls `supabase.auth.resetPasswordForEmail()` with redirect URL +2. Supabase sends reset email +3. user clicks email link and is redirected to reset page +4. user submits new password through `supabase.auth.updateUser()` + +You can customize reset templates in: +`Authentication -> Email Templates` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image53.png) + +## 5.3 Realtime Function + +Supabase Realtime is one of its strongest capabilities. It is useful for collaborative docs, live dashboards, game lobbies, and customer-support systems. + +Project: +`Project5-Supabase-Demos/apps/project-burger-shop-realtime-orders-3` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image54.png) + +### 5.3.1 Realtime Database Changes: Postgres Changes + +Postgres Changes subscribes to row changes in specific tables/events. 
+ +Enable realtime replication with SQL: + +```sql +ALTER TABLE public.chat_messages REPLICA IDENTITY FULL; +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_publication_tables + WHERE pubname = 'supabase_realtime' + AND schemaname = 'public' + AND tablename = 'chat_messages' + ) THEN + ALTER PUBLICATION supabase_realtime ADD TABLE public.chat_messages; + END IF; +END $$; +``` + +Client subscription example: + +```typescript +const sub = supabase + .channel('chat_messages_channel') + .on('postgres_changes', { + event: 'INSERT', + schema: 'public', + table: 'chat_messages' + }, (payload: any) => { + console.log('New message received:', payload.new); + const newMessage = payload.new as Message; + }) + .subscribe((status: string) => { + console.log('Chat subscription status:', status); + }); +``` + +Key points: + +- `.channel(...)`: isolate communication scope +- `.on('postgres_changes', ...)`: subscribe event source and filter +- `payload.new`: newly inserted row content +- `.subscribe()`: activate channel + +### 5.3.2 信息广播同步 Broadcast & Presence + +For low-latency temporary states (for example cursor tracking), use Broadcast + Presence rather than DB writes. + +- Presence: shared online-state synchronization +- Broadcast: temporary low-latency message passing + +Presence implementation steps: + +1. Create presence-enabled channel + +```text +const ch = supabase.channel('lobby_presence', { + config: { + presence: { key: anonymousUser.id }, + } +}); +``` + +2. Subscribe and track current user + +```text +const me = { + id: anonymousUser.id, + name: anonymousUser.name, + color: anonymousUser.color +}; + +ch.subscribe(async (status) => { + if (status === 'SUBSCRIBED') { + await ch.track(me); + } +}); +``` + +3. 
Sync full online list + +```text +ch.on('presence', { event: 'sync' }, () => { + const state = ch.presenceState(); + const flat = {}; + Object.values(state).forEach((arr) => { + arr.forEach((u) => { flat[u.id] = { ...u }; }); + }); + setOnline(flat); +}); +``` + +4. Listen join/leave events + +```text +ch.on('presence', { event: 'join' }, ({ key, newPresences }) => { + console.log('User joined:', key, newPresences); +}); + +ch.on('presence', { event: 'leave' }, ({ key, leftPresences }) => { + console.log('User left:', key, leftPresences); +}); +``` + +Broadcast cursor example: + +Sender: + +```typescript +const handleMouseMove = (e) => { + const payload = { + id: anonymousUser.id, + x: e.clientX, + y: e.clientY, + name: anonymousUser.name, + color: anonymousUser.color + }; + + channelRef.current?.send({ + type: 'broadcast', + event: 'cursor', + payload + }); +}; + +document.addEventListener('mousemove', handleMouseMove); +``` + +Receiver: + +```typescript +ch.on('broadcast', { event: 'cursor' }, ({ payload }) => { + setOnline((prev) => ({ + ...prev, + [payload.id]: { + ...(prev[payload.id] || {}), + x: payload.x, + y: payload.y + } + })); +}); +``` + +Presence keeps "who is online"; Broadcast carries temporary shared states. + +## 5.4 Storage + +A real app handles not only structured data (orders/users), but also unstructured files (avatars, product images, documents). + +If such files are all stored in business servers directly, storage pressure and IO bottlenecks can become severe. + +In practice, files are stored in object storage systems (S3/OSS/etc.), and apps access files through URL addresses. + +Project: +`project-burger-shop-storage-uploads-4` + +This project demonstrates avatar upload flow and uses `Uppy` + `Tus` resumable upload against Supabase upload endpoint. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image55.png) + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image56.png) + +### 5.4.1. 
Bucket + +Storage is organized by buckets (like folders), each with independent policies and settings. + +Like DB RLS, Storage permissions are controlled with SQL policies on `storage.objects` and `storage.buckets`. + +Example: only allow authenticated users to upload image files under user-specific folder in `avatars` bucket: + +```text +CREATE POLICY "Allow authenticated uploads to avatars bucket" +ON storage.objects FOR INSERT +TO authenticated +WITH CHECK ( + bucket_id = 'avatars' AND + auth.uid() = (storage.foldername(name))[1]::uuid AND + (storage.extension(name) IN ('png', 'jpg', 'jpeg')) +); + +CREATE POLICY "Allow public read access to avatars" +ON storage.objects FOR SELECT +USING ( bucket_id = 'avatars' ); +``` + +### 5.4.2 获取可访问文件 URL + +In this project, create a public bucket named `avatars`. After upload, you get a storage path (for example `public/avatar1.png`) and need to convert it to HTTP-accessible URL. + +Two URL strategies: + +#### 1. 公开 URL (Public URL) - 永久链接 + +For files in public bucket: + +```typescript +const { data } = supabase.storage + .from('avatars') + .getPublicUrl('public/avatar1.png'); +const publicUrl = data.publicUrl; +``` + +Pros: + +- simple fixed URL structure +- cache-friendly (CDN/browser) + +Best for truly public resources (logo/public posters). + +Risk: + +- hotlink traffic abuse can increase bandwidth costs + +#### 2. 签名 URL (Signed URL) - 临时授权链接 + +Recommended for most production private/controlled assets: + +```typescript +const { data, error } = await supabase.storage + .from('avatars') + .createSignedUrl('private/user-invoice.pdf', 3600); +const signedUrl = data?.signedUrl; +``` + +Benefits: + +- expiring authorization +- safer permission boundaries +- much better anti-hotlink behavior + +For private assets (avatars, paid content, invoices), prefer signed URLs by default. + +## 5.5 Edge Function + +Edge Function is a core serverless pattern. 
"Serverless" does not mean no servers; it means you do not manage server provisioning/ops yourself. You write function logic, provider runs it on trigger and charges by usage. + +Common edge-function providers: + +- AWS Lambda@Edge +- Cloudflare Workers +- Vercel Edge Functions + +In Supabase, Edge Functions run on Deno + TypeScript and are deployed globally for low-latency execution close to users. + +Project: +`Project5-Supabase-Demos/apps/project-burger-shop-edge-function-5` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image57.png) + +### 5.5.1 LLM Chat 案例解析 + +If you want ChatGPT-like features, never expose model API keys in frontend code. Use edge function as secure proxy. + +```typescript +// scripts/llm-chat.ts +import "jsr:@supabase/functions-js/edge-runtime.d.ts"; +import { OpenAI } from "npm:openai"; + +const OPENAI_API_KEY = Deno.env.get("OPENAI_API_KEY"); + +Deno.serve(async (req) => { + try { + const openai = new OpenAI({ apiKey: OPENAI_API_KEY }); + const { prompt } = await req.json(); + + const stream = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [{ role: "user", content: prompt }], + stream: true, + }); + + return new Response(stream.toReadableStream(), { + headers: { "Content-Type": "text/event-stream" }, + }); + } catch (err) { + } +}); +``` + +Key idea: API key remains server-side in Supabase secrets. + +### 5.5.2 创建并部署函数 + +Supabase provides a very user-friendly interface, so you can complete deployment without touching the command line. + +1. **Open the Edge Functions panel**: +2. Sign in to your Supabase project Dashboard. +3. In the left navigation, click the code-like icon and enter `Edge Functions`. +4. **Create a new function**: +5. Click `Create a new function`. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image58.png) +6. Name the function, for example `llm-chat`. +7. **Paste code**: + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image59.png) +8. 
In the online editor popup, **delete all default placeholder code**. +9. Open your local `llm-chat.ts` file and **copy all content**. +10. **Paste** the copied code into the Supabase online editor. +11. **Configure environment variables (Secrets)**: + 1. Find `Secrets` in the sidebar. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image60.png) + 2. `Name`: enter `OPENAI_API_KEY`. + 3. `Value`: paste your own OpenAI API Key. + 4. Click `Save`. The secret set here is encrypted and securely injected into the runtime environment of your function. + +If a function needs to be updated, remember to run `Deploy updates` in the Edge Function section. Supabase will build and deploy this function in the cloud. After a few minutes, your function can be accessed online. + +Beyond being a secure proxy for language-model calls, Edge Functions are useful in far more scenarios. In fact, any task requiring server-side logic, from simple API calls and data validation to more complex computation, can be implemented with Edge Functions. It gives you a lightweight and scalable backend without managing server infrastructure. + +If you want to explore more possibilities, refer to other examples in this project. For example: + +- Image generation (`txt2img.ts`): this function shows how to call third-party text-to-image APIs (such as Stability AI or Midjourney) through Edge Functions to generate images dynamically. This is a typical compute-intensive or external-service-secure-call scenario. Just like `llm-chat`, the API key is stored securely in Supabase backend. The frontend only sends text prompts and displays generated images, making the flow secure and efficient. +- Send email (`send-email.ts`): sending welcome emails, transaction notifications, or password-reset emails is a common requirement. The `send-email.ts` example demonstrates integrating email services (such as Resend or SendGrid) through Edge Functions. 
You do not need to expose sensitive email-service API keys in client code. Just create a function and let the frontend trigger email sending through this function. + +## 5.6 Clerk Login + +Clerk is a specialized identity/auth platform. It covers registration, login, MFA, session, permission management, and more. + +This part explains full integration with Supabase. + +Project: +`project-burger-shop-auth-advanced-clerk-7` + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image61.png) + +### 5.6.1 Create a Clerk Application and Get API Keys + +Before using this project, you need a Clerk account and an application. + +1. Register and create: + 1. Visit [dashboard.clerk.com](https://dashboard.clerk.com/) and register an account. + 2. Click `Create application`. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image62.png) + 3. Enter your application name (for example, `Burger Shop`). + 4. In `How will your users sign in?`, keep `Email`, `Google`, and `GitHub` selected by default. + 5. Click `Create application`. +2. Get API keys: + 1. After creation, you will be guided to the API Keys page. + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image63.png) + 2. Find the Publishable key (starts with `pk_`) and Secret key (starts with `sk_`). + ![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image64.png) + 3. Copy them into your `.env.local` file (refer to this project's `.env.example`): + + ```bash + NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_test_... + CLERK_SECRET_KEY=sk_test_... + ``` + +### 5.6.2 Configure the Native Supabase and Clerk Integration + +Supabase and Clerk provide native integration: + +1. In Clerk dashboard: + 1. go to Integrations + 2. activate Supabase integration + 3. copy Clerk Domain (`https://your-app.clerk.accounts.dev` or custom domain) +2. In Supabase dashboard: + 1. go to Authentication -> Providers + 2. add Clerk provider + 3. paste Clerk Domain + 4. save + +### 5.6.3 Sync User Data to Supabase via Webhooks + +Native integration only handles authentication and authorization. 
It does not sync already-registered Clerk users into Supabase. For easier management, we also need to keep a backup of user data in Supabase `public.users` for relational queries or data analysis. We can implement this with Clerk Webhooks. The full flow is: + +1. **Clerk sends notifications**: when a user registers or updates profile in Clerk, Clerk sends a POST request to the configured Webhook URL. +2. **Supabase receives and writes**: an Edge Function receives the request, verifies the signature (for security), and then updates user data into Supabase tables. + +Before we start, we need to configure the table used for synchronization: + +```sql +-- File: init.sql + +-- 1. Create `users` table for synced Clerk users +-- This table will store user data pushed from Clerk Webhooks. +CREATE TABLE public.users ( + id TEXT NOT NULL PRIMARY KEY, -- Corresponds to Clerk User ID + email TEXT, + first_name TEXT, + last_name TEXT, + image_url TEXT, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 2. Enable Row Level Security (RLS) on the table +-- This is an important security measure to ensure users cannot access any data by default. +ALTER TABLE public.users ENABLE ROW LEVEL SECURITY; + +-- 3. Create RLS policies +-- Policy 1: Allow authenticated users to read their own user info. +-- `auth.jwt()->>'sub'` extracts the user ID from the JWT provided by Clerk. +CREATE POLICY "Authenticated users can view their own user record" +ON public.users FOR SELECT +TO authenticated +USING ( (SELECT auth.jwt()->>'sub') = id ); + +-- Policy 2: Allow users to update their own info. 
+CREATE POLICY "Authenticated users can update their own user record" +ON public.users FOR UPDATE +TO authenticated +USING ( (SELECT auth.jwt()->>'sub') = id ); +``` + +Then enable the corresponding Edge Function in Supabase: + +```JavaScript +// File path: supabase/functions/clerk-webhooks/index.ts + +import { serve } from 'https://deno.land/std@0.177.0/http/server.ts' +import { Webhook } from 'npm:svix' +import { createClient } from 'https://esm.sh/@supabase/supabase-js@2' + +// Get Clerk Webhook signing secret from environment variables +const CLERK_WEBHOOK_SECRET = Deno.env.get('CLERK_WEBHOOK_SECRET') + +if (!CLERK_WEBHOOK_SECRET) { + throw new Error('CLERK_WEBHOOK_SECRET is not set in environment variables') +} +const supabaseAdmin = createClient( + Deno.env.get('SUPABASE_URL')!, + Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')! +) + +serve(async (req) => { + try { + // 1. Get Svix signature info from request headers + const headers = Object.fromEntries(req.headers) + const svix_id = headers['svix-id'] + const svix_timestamp = headers['svix-timestamp'] + const svix_signature = headers['svix-signature'] + + if (!svix_id || !svix_timestamp || !svix_signature) { + return new Response('Missing Svix headers', { status: 400 }) + } + + const payload = await req.json() + const body = JSON.stringify(payload) + + // 2. Verify Webhook signature validity using the secret + const wh = new Webhook(CLERK_WEBHOOK_SECRET) + const evt = wh.verify(body, { + 'svix-id': svix_id, + 'svix-timestamp': svix_timestamp, + 'svix-signature': svix_signature, + }) + + const { id } = evt.data + const eventType = evt.type + console.log(`Received webhook event: ${eventType} for user: ${id}`) + + // 3. 
Execute database operations based on event type + switch (eventType) { + case 'user.created': { + const { id, first_name, last_name, image_url, email_addresses } = evt.data + const { error } = await supabaseAdmin.from('users').insert({ + id, + first_name, + last_name, + image_url, + email: email_addresses[0]?.email_address, + }) + if (error) throw error + console.log(`User ${id} created in Supabase.`) + break + } + case 'user.updated': { + const { id, first_name, last_name, image_url, email_addresses } = evt.data + const { error } = await supabaseAdmin + .from('users') + .update({ + first_name, + last_name, + image_url, + email: email_addresses[0]?.email_address, + updated_at: new Date().toISOString(), // Update timestamp + }) + .eq('id', id) + if (error) throw error + console.log(`User ${id} updated in Supabase.`) + break + } + case 'user.deleted': { + // For delete events, ID might be at the top level + const deletedId = id + if (!deletedId) { + return new Response('Deleted user ID not found', { status: 400 }) + } + const { error } = await supabaseAdmin.from('users').delete().eq('id', deletedId) + if (error) throw error + console.log(`User ${deletedId} deleted from Supabase.`) + break + } + } + + return new Response('Webhook processed successfully', { status: 200 }) + } catch (err) { + console.error('Error processing webhook:', err.message) + return new Response(`Webhook Error: ${err.message}`, { status: 400 }) + } +}) +``` + +After initializing the Supabase table and function, you still need to enable Webhooks in Clerk: + +- In Clerk Dashboard -> **Webhooks**, add an Endpoint and fill in the Supabase Edge Function URL. +- Check events such as `user.created`, `user.updated`, and `user.deleted`. + +![](/zh-cn/stage-2/backend/2.2-database-supabase/images/image65.png) + +Once the setup succeeds, you can see different request attempts in `Message Attempts`. Click each one to inspect detailed response payloads. 
If a webhook call to Edge Function fails, you can quickly identify the cause from the returned details. It is recommended to compare request logs from both Clerk and Supabase to verify each function setting is correct. + +### 5.6.4 Third-Party Login Support in Clerk + +Before configuring, distinguish between: + +- development environment (local/internal testing) +- production environment (public real users) + +Clerk separates these for security and policy reasons. + +1. **Development quick verification** + +- In Clerk dashboard -> SSO connections -> Add connection -> For all users +- choose GitHub/Google and add +- Clerk shared credentials handle local testing quickly + +2. **Production custom credentials** + +When switching to production instance, shared credentials are not enough. Configure custom OAuth credentials: + +- copy callback/redirect URL from Clerk +- configure OAuth app on provider side +- paste client ID/secret back into Clerk + +2.1 GitHub production steps: + +- GitHub Developer Settings -> OAuth Apps -> New OAuth app +- set application name/homepage/callback URL +- generate client secret +- paste into Clerk SSO connection + +2.2 Google production steps: + +- Google Cloud Console -> APIs & Services -> Credentials +- create OAuth client (Web application) +- set authorized origins and redirect URI +- copy client ID/secret to Clerk + +Notes: + +1. avoid WebView login for Google OAuth +2. testing mode has user limits; switch publishing status to production after review +3. configure sub-address handling policy if needed +4. optionally integrate Clerk Google One Tap component + +3. test social login + +- use Clerk Account Portal sign-in page +- test GitHub/Google sign-in redirect and callback behavior + +# 6. From Supabase to More Backend Components (Advanced) + +So far we viewed backend capabilities through Supabase. From a broader engineering perspective, each Supabase module has specialized alternatives in the market. 
+ +Why understand alternatives: + +- decide when all-in Supabase is enough +- replace only one module when scaling/compliance/cost changes +- broaden system design trade-off understanding + +This section compares common alternatives by features, pricing, ease of use, and community traction. + +## Similar BaaS Platforms + +| Platform/Service | Type | Free Tier/Pricing | Features / Use Cases | +| ------------------------ | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| Firebase (Google) | Fully managed BaaS (Auth + Firestore + Storage + Functions + Hosting) | Spark free tier; Blaze pay-as-you-go | Most mature ecosystem, great docs, fast onboarding, strong realtime; but complex billing and stronger lock-in | +| Supabase | Open-source BaaS (Postgres + Auth + Storage + Edge Functions + Realtime) | Free: 500MB DB, 1GB storage, limited function calls; Pro by plan | SQL-first Firebase-like experience; modern DX, can self-host | +| Appwrite Cloud | Open-source all-in-one BaaS | Free basic tier, paid by resources | modern UX, unified APIs, self-host option; ecosystem smaller than Firebase/Supabase | +| Nhost | Postgres + GraphQL + Auth + Storage + Functions | Free: 1GB DB, 1GB storage, limited function calls | Similar to "Supabase + Hasura"; GraphQL-native | +| AWS Amplify | AWS full-stack backend suite | Free quotas for hosting/cognito/functions | strong enterprise reliability; steeper learning curve | +| Xata | Multi-model DB + Auth + Edge Functions | Free: 250k records, 15GB bandwidth | strong DX and UI, but less all-in-one than Firebase/Supabase | +| Convex | Managed DB + Auth + Functions (frontend-first) | Free developer tier; paid by usage | very fast MVP development; higher platform binding risk | + +## Authentication (Auth) + +| Tool/Platform | 
Features | Free Tier/Pricing | Fit and Trade-offs | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| Firebase Authentication | email/password, phone, social, anonymous, etc. | Spark up to 50k MAU | easy integration, rich docs, but Firebase lock-in | +| Auth0 (Okta) | enterprise SSO/MFA/rules/extensibility | free 25k MAU then paid | enterprise-grade but can become expensive | +| AWS Cognito | AWS-native identity service | free 10k MAU/month then pay-as-you-go | strong AWS integration, higher complexity | +| Logto | open-source auth platform | self-host free, cloud free 50k MAU | strong emerging alternative, smaller ecosystem | +| Keycloak | open-source IAM/SSO | free self-host | powerful and extensible, higher ops complexity | + +## File Storage (Storage) + +| Platform/Service | Type | Free Tier/Pricing | Features/Use Cases | +| ---------------------------------------- | -------------------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| Amazon S3 | cloud object storage | AWS free tier: 5GB + request quotas | industry standard object storage, high reliability | +| Google Cloud Storage / Firebase Storage | cloud object storage | Spark free + Blaze paid | strong Firebase integration, fine-grained rules | +| Tencent COS / Aliyun OSS | China-based cloud object storage | pay-as-you-go + newcomer quotas | strong integration with the mainland China cloud ecosystem | +| MinIO | open-source S3-compatible storage | free self-host | lightweight S3-compatible storage for private deployment | +| Cloudinary / Imgix | media storage + CDN | basic free 
plans | strong media transformation capabilities | + +## Edge Functions + +| Platform/Service | Features | Free Tier/Pricing | Fit and Trade-offs | +| -------------------------------------- | ------------------------------------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Cloudflare Workers | globally distributed JS/Wasm runtime | free 100k req/day | ultra-low latency edge execution | +| Vercel Edge Functions | deep Next.js integration | hobby free quotas | excellent frontend integration | +| Netlify Edge / Functions | Node functions + edge routes | free credit-based quotas | easy git-integrated deployment | +| AWS Lambda@Edge / CloudFront Functions | AWS edge compute | lambda free quotas + cloudfront pricing | powerful but more complex setup | + +## Realtime Communication (Realtime) + +| Platform/Service | Features | Free Tier/Pricing | Fit and Trade-offs | +| -------------------------------------- | ------------------------------------------------ | ----------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| Firebase Realtime DB / Firestore | realtime DB push updates | spark free + blaze paid | easy realtime listening, weaker complex querying | +| Ably | pub/sub realtime messaging platform | free 6M messages/month | robust global realtime service | +| Pusher Channels | event-push channels | sandbox free tier | quick chat/notification integrations | +| Self-host WebSocket/Socket.IO | custom realtime infra | self-host infra cost | highest flexibility, highest ops burden | + +## Databases + +| Platform/Tool | DB Type | Free Tier/Pricing | Key Features | +| ---------------------------- | --------------------------------------- | 
----------------------------------------------------- | ------------------------------------------------------------------- | +| Neon | serverless PostgreSQL | free tier + branch compute limits | modern serverless Postgres with branching workflow | +| Aiven PostgreSQL | managed relational DB | small free plans + paid | managed operations across cloud providers | +| CockroachDB Cloud | distributed SQL (Postgres-compatible) | free storage quota | horizontal scaling and consistency | +| TiDB Cloud | distributed relational (MySQL-compatible) | free cluster quotas | strong distributed MySQL-compatible architecture | +| MongoDB Atlas | document NoSQL | free M0 cluster | flexible document modeling | +| SQLPub | multi-database platform | free request/storage quotas | one-stop multi-DB service | + +Different options optimize different dimensions: flexibility, cost, ease of use, compliance, ecosystem fit, and scalability. + +# Summary + +In today's lesson, we systematically learned foundational database concepts, Supabase core definitions, and practical operation details. During later project practice, you can always come back to this document as a reference based on your specific application scenario and requirements. + +Please always remember one key principle: **Ship first, perfect later.** You do not need to achieve everything in one step. Through continuous iteration and optimization, we can gradually approach better outcomes. Wish you smooth progress in your upcoming project practice. + +# 📚 Homework + +1. Build an application with user management + database support. + Try to include additional Supabase features (Realtime / cloud storage / Edge function). 
diff --git a/docs/en/stage-2/backend/2.3-ai-interface-code/index.md b/docs/en/stage-2/backend/2.3-ai-interface-code/index.md new file mode 100644 index 0000000..97c7c88 --- /dev/null +++ b/docs/en/stage-2/backend/2.3-ai-interface-code/index.md @@ -0,0 +1,178 @@ +# Using LLMs to Write API Code and API Documentation + +In the previous chapters, we learned how to use tools like Figma to create UI drafts, how to use AI to quickly generate static frontend pages, and how to use Supabase to build databases and basic authentication. That naturally leads to a new question: when someone clicks those lively buttons on the frontend, how does the data actually get stored in Supabase? And when we need more complex business logic such as concurrent payments, scheduled pushes, or sensitive data processing, is it still safe to let the frontend talk directly to the database? + +That question introduces one of the most important parts of modern web architecture: the **backend API**. + +In the past, backend developers often wrote hundreds or thousands of lines of routing, controller, and validation logic by hand. Today, we can hand much of that repetitive scaffolding to large language models. In this chapter, we will move beyond vague "AI-generated code" and look at a real workflow for using strong prompts to guide an LLM into writing solid Node.js backend interfaces, plus the corresponding documentation and test cases. + +> 💡 **Prerequisites** +> +> Before starting this chapter, it helps to understand: +> - [From Database to Supabase](../2.2-database-supabase/) for basic database and data-model concepts +> - [Git and GitHub Workflow](../2.4-git-workflow/) for project collaboration and version control +> - [What Is the Terminal / Command Line](/en/appendix/2-development-tools/command-line-shell) for project initialization and startup commands + +# What you will learn + +1. **What an API is**: Understand the bridge between frontend and backend, plus basic RESTful design. +2. 
**How LLMs help service construction**: Use structured prompts to generate a clean Node.js + Express starter project. +3. **Interface logic development**: Guide the model to generate CRUD APIs with proper business validation and Supabase integration. +4. **Automatic API documentation**: Ask the model to reverse-generate OpenAPI/Swagger docs from your code. +5. **Testing and integration loops**: Use the model to create Postman collections and Jest unit tests to protect code quality. + +--- + +# 1. Why do we need APIs? + +Traditionally, the frontend is "the visible part" and the database is "the storage room." But something is missing between them: a coordinator. + +If you imagine the application as a restaurant: + +- The **frontend (client)** is the menu and ordering table, where customers browse and make requests. +- The **database (Supabase, etc.)** is the kitchen storeroom, where ingredients and records are kept. +- The **backend API** is the waiter. Customers should not run straight into the kitchen to grab ingredients. Instead, they tell the waiter what they want through an HTTP request. The waiter checks the request, verifies permissions, talks to the kitchen, and brings the result back through an HTTP response, usually in JSON. + +Through APIs, we achieve a clean **frontend-backend separation**: the frontend focuses on rendering, while the backend focuses on business logic, data processing, and security. + +--- + +# 2. Project architecture and initialization + +A clear project skeleton is a prerequisite for getting high-quality code from an LLM. Before you ask AI to write code, you should already have a mental model of the structure you want. + +## 2.1 A common API project structure + +Even if an LLM is generating the code, you should not dump everything into one `server.js` file. 
A maintainable Node.js backend usually looks something like this: + +```text +my-api-project/ +├── .env # Sensitive environment variables such as API keys and DB URLs +├── server.js # Project entry point: boot server, register global middleware +├── package.json # Dependency management +├── src/ +│ ├── routes/ # Route layer: define URLs and HTTP methods +│ ├── controllers/ # Controller layer: process request params, call services, return responses +│ ├── services/ # Service layer: database access and core business logic +│ └── middlewares/ # Middleware: auth, global error handling +└── docs/ # API documentation +``` + +## 2.2 Use AI to initialize the project + +Instead of manually running `npm init` and installing packages one by one, you can give the model the structure above in prompt form: + +> 🗣️ **Prompt example** +> "Help me scaffold a Node.js backend project that can connect to Supabase. Keep the structure clean and easy to maintain later." + +If the prompt is good, the code you get back can already give you a backend app with a solid foundation running on `localhost:3000`. + +--- + +# 3. Core practice: using LLMs to develop APIs + +This is the heart of the chapter. When LLM-generated code feels superficial or unsafe, the root cause is usually missing context. **LLMs are not afraid of complex requirements. They are afraid of vague ones.** + +Take the `menu_items` insert API from the [database chapter](../2.2-database-supabase/) as an example. + +## 3.1 Give the model full context + +Before asking the model to write an API, provide both the **database schema** and the **business constraints**. + +> 🗣️ **High-quality prompt template** +> "Help me write an API for creating a menu item. Each item includes a product name, price, category (burger, snack, drink), and whether it is listed. Product name and price are required. Price cannot be negative. Return helpful validation errors when the user input is invalid." 
+ +## 3.2 Review the generated code + +A good model will often separate responsibilities clearly, for example: + +```javascript +// services/menuService.js +const { createClient } = require('@supabase/supabase-js'); +const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY); + +exports.createMenuItem = async (menuData) => { + // Push data into the table via the Supabase SDK + const { data, error } = await supabase + .from('menu_items') + .insert([menuData]) + .select(); + + if (error) throw new Error(`Database insert failed: ${error.message}`); + return data[0]; +}; +``` + +You can see that, with enough context, the model generates something structurally cleaner: Supabase initialization is separated, errors are handled, and the code is easier to reason about. That is very different from the spaghetti code you usually get from a vague request like "write a create endpoint." + +--- + +# 4. Free your hands: generate API documentation automatically + +For a development team, an undocumented API is a blind box. Frontend engineers cannot guess what parameters are required or what the response shape will be. The most common API description standard in the industry is **OpenAPI** (formerly often called Swagger). + +Writing Swagger YAML or JSON by hand used to be painful and error-prone. Now it is one of the areas where LLMs help the most. + +You can select your `routes` and `controllers` code and ask: + +> 🗣️ **Documentation prompt** +> "Generate API documentation from the code above. Clearly explain what every parameter means and what data the endpoint returns, so the frontend team can integrate it easily." + +You can even ask the model to fill in descriptions and mock example values such as `price_cents: 1200` for a $12.00 item. That reduces a lot of back-and-forth communication. + +--- + +# 5. 
Safeguards: generate tests and Postman collections + +After the code and docs are ready, there is still one more step: verifying that everything actually works. + +## 5.1 Generate Postman or Apifox test configurations + +When developing APIs, we often use tools like Postman to simulate HTTP requests. Without AI, you usually have to fill in URLs, headers, and JSON request bodies manually. + +You can simply tell the model: + +> "Convert this API documentation into a Postman-importable format and include both successful and failing request examples." + +Once you save the returned JSON as something like `menu_api.json` and import it into Postman, you instantly get a ready-to-use testing panel. + +## 5.2 Write automated unit tests + +If you want stricter engineering quality, you can also ask the model to write tests with `Jest` or a similar framework. That is especially useful for boundary conditions, such as ensuring a negative price is rejected before data reaches the database. + +--- + +# 6. Backend API best practices you still need to know + +Even with AI support, you are still the gatekeeper of the system. You need to review the generated code against a few important principles: + +1. **RESTful path naming** + - Good: `GET /api/users` for listing users, `POST /api/users` for creating users + - Bad: `POST /api/getUser` or `POST /api/createUser` + The URL should represent the resource. The action belongs to the HTTP method. + +2. **Correct HTTP status codes** + - `200/201`: request succeeded / resource created successfully + - `400`: bad request, invalid parameters or missing required fields + - `401/403`: unauthorized / forbidden + - `404`: resource not found + - `500`: server error, such as backend exceptions or database failures + Do not expose full backend stack traces to the frontend. + +3. **Never trust user input** + Frontend input can be forged. All important validation must run again on the backend. + +# 7. 
Summary + +After this chapter, your role should start to feel different. You are no longer just a typist trapped in syntax and punctuation. You are becoming a **system designer and architecture coordinator**. + +You have now learned: + +1. The core systems thinking behind **APIs and frontend-backend separation** +2. How to dramatically improve LLM-generated backend code by providing **good context and layered structure** +3. How to turn tedious **documentation writing** and **test creation** into automation tasks that AI handles well +4. How to combine this with what you already learned about **Supabase** to complete the full flow from frontend request to database update + +::: tip Next Step +Once your data flow and backend service are ready, they still only run locally on your own machine. In the next chapter, we will learn how to **deploy** that service to a public server so your product can be accessed by real users. +::: diff --git a/docs/en/stage-2/backend/2.4-git-workflow/index.md b/docs/en/stage-2/backend/2.4-git-workflow/index.md new file mode 100644 index 0000000..e0773e2 --- /dev/null +++ b/docs/en/stage-2/backend/2.4-git-workflow/index.md @@ -0,0 +1,255 @@ +# Git and GitHub Workflow + +In previous chapters, we learned how to use web-based vibe coding tools to write code. Each conversation could generate a new version of the code. But that raises an important question: if we want to return to an earlier version, is there a convenient way to do it? Is there a tool that can record our code at different stages so we can switch between versions freely? + +That is exactly why version control software exists. In this chapter, we will introduce the most famous version control system, **Git**, and the most popular code hosting platform, **GitHub**. You will learn how to manage code with Git, how to download code from GitHub, how to upload your own work, and how to collaborate with others on larger projects. 
+ +Whether you are tracking changes in a personal project, synchronizing code with teammates, or contributing to open source, Git and GitHub are essential tools for modern developers. Once you understand them, you can manage code more confidently, create checkpoints whenever needed, move between different stages of a project, and keep every change traceable. + +> 💡 **Prerequisites** +> +> Before learning Git, it helps to understand: +> - [What Is the Terminal / Command Line](/en/appendix/2-development-tools/command-line-shell) +> - [What Is Git](/en/appendix/2-development-tools/git-version-control) +> +> This chapter focuses on the GitHub workflow and hands-on usage, while the links above cover the core fundamentals. + +# Quick start with Git + +Before using Git, make sure you already understand the basics of the command line and Git itself. This chapter assumes you have that foundation and moves directly into installation, configuration, and practical GitHub collaboration. + +## How to install Git + +We will briefly walk through installation on the three major operating-system families. + +### Windows + +1. Go to the [official Git download page](https://git-scm.com/download/win) and download the installer that matches your system. In most cases, the x64 installer is recommended. +2. Double-click the installer and follow the setup wizard: + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image5.png) + 1. In most cases, keeping the default settings is fine. If you customize them, pay attention to: + - **Default editor**: you can keep Vim, or choose Visual Studio Code if you already have it installed. + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image6.png) + - **How Git is used from the command line**: a practical default is the option that adds Git to the command line and third-party software without overcomplicating the system setup. + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image7.png) +3. After installation, right-click on the desktop. 
If you see `Git Bash Here`, the installation succeeded. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image8.png) + +### macOS + +On macOS, you can first run `git --version` in Terminal to check whether Git is already installed. If it is not, macOS often prompts you to install the developer tools automatically. + +1. Method 1: install with Homebrew + If you have [Homebrew](https://brew.sh/), open Terminal and run `brew install git` +2. Method 2: install Xcode tools + You can also install Xcode or the Xcode Command Line Tools from Apple. Git is included as part of that toolchain. + +### Linux + +Most Linux distributions install Git through the system package manager: + +- Ubuntu / Debian: + +```bash +sudo apt update +sudo apt install git +``` + +- CentOS / RHEL: + +```bash +sudo yum install git +``` + +To verify the installation, run `git --version`. If a version number appears, Git is ready. + +## Initialize Git identity + +After installing Git, the first thing you should do is configure your user information. Run the following commands in the terminal and replace the values with your own: + +```bash +# Set the global username shown in commit history +git config --global user.name "Your Name" + +# Set the global email, ideally the same one you use on GitHub +git config --global user.email "your.email@example.com" +``` + +Git writes this information into every commit as the author identity. When you inspect the version history, you can clearly see who changed what and communicate more easily in collaborative projects. + +You can confirm the configuration with: + +```bash +git config --list +``` + +# What is GitHub? + +GitHub is a code hosting platform built on top of Git. It provides remote storage for Git repositories and adds collaboration tools such as Issues, Pull Requests, and Projects. In simple terms, Git is the local version-control tool, while GitHub is the remote code warehouse and collaboration layer. 
+ +GitHub is also the world's largest and most influential open-source community. The idea of open source is that anyone can download and run the source code of a project. That allows people around the world to inspect each other's work, improve it, and build new things on top of it. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image9.png) + +Large companies often open-source tools and tutorials on GitHub as part of their technical strategy. In the GitHub ecosystem, the number of `stars` a project receives is one of the most visible indicators of trust and influence. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image10.png) + +In this course, many supporting resources and assignments are also published in GitHub repositories. By learning to upload your own work there, you gradually build the workflow you will use for real application development later. + +## Create a GitHub account + +1. Visit [GitHub](https://github.com/) and click `Sign up` in the top-right corner. + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image11.png) +2. Enter your email address, create a password, and complete the verification steps. +3. Confirm your email, and your account is ready. + +## Create your first repository on GitHub + +Next, let's create your first repository, often shortened to `repo`. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image12.png)![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image13.png) + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image14.png) + +When creating a repository, the main fields mean: + +1. **Repository name**: the public-facing name of the repository +2. **Description**: a short explanation of what the repository is for +3. **Visibility**: + - `Private`: only you and people you explicitly invite can see it + - `Public`: anyone can see it +4. **README**: it is good practice to add a README. Think of it as the repository's introduction and usage guide. +5. **.gitignore and license**: + 1. 
`.gitignore` tells Git which files or folders should not be tracked, such as temporary files, dependency folders, or local secrets. + 2. `license` determines how others are allowed to use your open-source code. + +For your first repository, it is reasonable to check `Add README`, set the visibility to `Private`, and fill in a name and description you like. Then click `Create repository`. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image15.png) + +You will now have a clean repository, ready for your files. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image16.png) + +To download a repository, you use `git clone`, which requires the repository URL. You can find that by clicking the green `Code` button. GitHub usually shows both HTTPS and SSH options. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image17.png) + +In general, HTTPS is fine for temporary downloads or quick testing, but for your own daily development workflow, SSH is usually the better experience. + +## Bind local SSH to GitHub + +In GitHub, "binding SSH" means connecting your local machine's SSH public key to your GitHub account so GitHub can recognize your device through the SSH protocol. Once set up, you can `clone`, `pull`, and `push` securely without re-entering passwords every time. + +In plain language: it is like giving your device a special access card for GitHub. + +> 💡 What is SSH? SSH (Secure Shell) is an encrypted network protocol for securely logging in to, and exchanging data with, a remote machine or service. + +### Why use SSH authentication? + +GitHub supports two major protocols for repository operations: + +- **HTTPS**: usually requires a password or Personal Access Token for pushes +- **SSH**: uses a key pair, so you do not need to repeat authentication constantly + +SSH binding is the prerequisite for using GitHub with SSH. You must upload your local SSH public key to GitHub so GitHub can verify your machine. + +### The core logic: SSH key pairs + +SSH authentication depends on a key pair: + +1. **Private key**: stored on your local machine, never shared +2. 
**Public key**: uploaded to GitHub + +When you perform a Git operation over SSH: + +- Your machine signs the request with the private key +- GitHub checks it against the public key you uploaded +- If the match succeeds, the operation is allowed + +### The actual steps + +The core workflow is simple: **generate a key pair → upload the public key to GitHub**. + +1. **Generate an SSH key pair locally** + 1. **Use Trae to help generate it** + Prompt: + `Help me create the SSH key needed for GitHub login. My email is your_email@gmail.com. Please return the public key for me to copy.` + + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image18.png) + + After entering the prompt, you may still need to press `Enter` in the terminal pane so the command can continue. Once Trae finishes, it will show you the public key to copy. + + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image19.png) + + 2. **Generate it manually** + Open your terminal and run `ssh-keygen -t ed25519 -C "your_email@example.com"` + Press `Enter` to accept the defaults unless you want a custom path or passphrase. This creates: + + - `id_ed25519`: your private key, which must stay local + - `id_ed25519.pub`: your public key, which you will upload to GitHub + +2. **Upload the public key to GitHub** + + This is the binding step itself. + + 1. Copy the public key: + - On Windows, open `C:\Users\<your-username>\.ssh\id_ed25519.pub` + - On macOS/Linux, run `cat ~/.ssh/id_ed25519.pub` + 2. In GitHub, go to your avatar → `Settings` → `SSH and GPG keys` → `New SSH key` + ![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image20.png)![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image21.png) + 3. Enter a title and paste the public key. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image22.png) + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image23.png) + +3. **Verify the binding** + +Run `ssh -T git@github.com` + +If you see a message similar to `Hi [your GitHub username]! 
You've successfully authenticated...`, the setup worked. + +### Important notes + +- If you use multiple devices, create a separate SSH key pair for each one and upload each public key to the same GitHub account. +- Never share your private key. +- After setting up SSH, use SSH repository URLs such as `git@github.com:username/repository.git`, not HTTPS URLs. +- If you cloned a repository over HTTPS earlier, you can switch it with `git remote set-url origin git@github.com:username/repository.git` + +# Use Trae for GitHub operations + +Now that we have covered Git, GitHub, SSH, and the setup process, you can start asking Trae to help with Git operations. + +## `git clone`: download an existing repository + +You can directly tell Trae which repository URL you want to clone. + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image24.png) + +## `git pull`: fetch the latest remote updates + +Before editing, especially in a shared repository, you should pull the latest changes first. + +**Always include the folder name and its relative or absolute path so you do not pull in the wrong repository by mistake.** + +Prompt: +`Help me pull this repository AIID-TEST in ./AIID-TEST.` + +## `git commit` and `git push`: stage, save, and upload your updates + +After you modify files locally, you can ask Trae to detect the changes and help you push them to GitHub. + +Prompt: +`I finished. Commit and push to the repository AIID-TEST in ./AIID-TEST.` + +![](/zh-cn/stage-2/backend/2.4-git-workflow/images/image25.png) + +If the push succeeds, you will be able to see the updated content on GitHub immediately. 
+ +# References + +- Pro Git book: https://git-scm.com/book/en/v2 +- GitHub Docs: https://docs.github.com/en diff --git a/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md b/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md new file mode 100644 index 0000000..6ed8a83 --- /dev/null +++ b/docs/en/stage-2/backend/2.5-zeabur-deployment/index.md @@ -0,0 +1,517 @@ +# How to Deploy Web Applications + +In this tutorial, we will walk through how to deploy your web application to the internet so other people can access it. We will introduce four common deployment platforms: **Tencent Cloud CloudBase**, **Vercel**, **Netlify**, and **Zeabur**. The goal is to help you go from "I finished writing the code" to "other people can visit my site online." + +# What does "deployment" mean? + +Before we begin, let's clarify what deployment actually is. + +For any website to be visited by external users, it must have a publicly reachable network address. That can be an IP address such as `123.45.67.89`, or a domain such as [google.com](https://google.com/). But the address alone is not enough. Your code, such as HTML, CSS, JavaScript, or React/Vue projects, as well as images and video assets, must live on a server that stays online 24/7 and can answer incoming requests. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image1.png) + +Image source: https://www.hostinger.com/tutorials/what-is-cloud-hosting + +The full process of uploading resources, configuring the runtime environment, and making the service run is called **deployment**. + +In simple terms: if your website runs only on your own computer, then only you can visit it locally because the files only exist on your hard drive. Deployment means moving your code and assets to a public-facing server, configuring that server properly, and making sure it knows how to respond when someone visits your domain. + +If you deploy everything manually, a project usually involves many steps: + +1. 
**Prepare a server** + You first need to buy or rent a cloud server from a provider such as Alibaba Cloud, Tencent Cloud, or AWS EC2. Then you choose its region, CPU, memory, and storage, and learn how to connect to it remotely, often through SSH. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image2.png) + +2. **Configure the runtime environment** + Web apps only run under the correct environment. A Node.js project needs Node installed. A Python project needs Python and its dependencies. If the versions do not match, the app may fail to start. + +3. **Upload your files** + You need to move your local code and assets to the server, often via Git or file-transfer tools. Large projects can make this step frustrating if uploads break halfway through. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image3.png) + +4. **Start the service and test it** + After upload, you need to start the app and check whether the assigned address works. If not, the problem may be a firewall-blocked port, or it may be an application bug. In that case, you need to inspect logs. + +5. **Maintain and update** + Every code update usually means another upload and restart. If the server crashes, you may need to restart services manually or configure a process manager to keep them alive. + +Platforms such as CloudBase, Vercel, Netlify, and Zeabur exist to eliminate much of that complexity. They automate the boring parts: + +- buying and provisioning servers +- configuring runtimes +- pulling code +- starting services +- monitoring uptime + +In many cases, you just connect a GitHub repository or upload your code, and the platform does the rest. 
+ +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image4.png) + +--- + +# Deployment platform comparison + +| Platform | Main strengths | Best for | Free tier | +|------|------|----------|----------| +| **Tencent Cloud CloudBase** | Fast access within mainland China, strong WeChat ecosystem integration | China-focused users, WeChat Mini Program support | Yes | +| **Vercel** | Excellent support for frontend frameworks, tight GitHub integration | Modern React/Vue/Next.js frontend projects | Yes | +| **Netlify** | Broad feature set, great Git workflow, form handling, auth support | Static sites that also need forms or auth | Yes | +| **Zeabur** | Flexible service combinations and many templates | More complex projects, including tools like Dify and n8n | About $5/month in free quota | + +--- + +# 1. Tencent Cloud CloudBase + +Tencent Cloud CloudBase is Tencent's integrated cloud backend platform and is especially friendly for developers targeting domestic Chinese users. + +Its advantages include: + +- **Fast domestic access** +- **WeChat ecosystem integration** +- **An all-in-one backend solution** including static hosting, cloud functions, databases, and storage +- **A practical free tier** + +## Deploy a web app with CloudBase + +### Step 1: Register and log in + +Visit the [Tencent Cloud CloudBase Console](https://console.cloud.tencent.com/tcb) and log in with WeChat or QQ. + +### Step 2: Create an environment + +Click `Create Environment` and choose an environment name such as `my-web-app`. + +> ⚠️ **Note**: the free trial version of CloudBase often requires a redemption code. You usually need to follow the CloudBase official account and obtain a code there. + +### Step 3: Enable static website hosting + +Inside the environment management screen, enable the `Static Website Hosting` feature. Once enabled, you will receive a default public domain. 
+ +CloudBase supports several deployment methods: + +- upload a local build output +- deploy from a template +- deploy from a Git repository + +### Step 4: Deploy your code + +CloudBase offers three main workflows: + +**Option 1: upload a local project** + +- choose `Local Project Deployment` +- upload your built static files such as HTML, CSS, and JS +- typically upload a `dist` or `build` directory + +**Option 2: use a template** + +- start from a preset project template +- common options include React and Vue starter templates + +**Option 3: deploy from Git** + +- connect a GitHub repository +- set the build command, such as `npm run build` +- every push can trigger an automatic redeploy + +> 💡 **Tip**: you can also deploy from the command line: +> +> ```bash +> # Install CloudBase CLI +> npm install -g @cloudbase/cli +> # Log in +> tcb login +> # Deploy +> tcb hosting deploy ./dist -e your-env-id +> ``` + +### Step 5: Add a custom domain (optional) + +CloudBase also supports binding your own domain and applying a free HTTPS certificate. + +--- + +# 2. Vercel + +Vercel is one of the most popular frontend deployment platforms in the world and is especially good for React, Vue, and Next.js projects. + +Its main strengths: + +- **Deep GitHub integration** +- **Automatic preview deployments for pull requests** +- **Global CDN distribution** +- **Support for serverless functions** + +> ⚠️ **Note**: in some mainland-China network environments, Vercel may be less stable than domestic options such as CloudBase. + +## Deploy a web app with Vercel + +### Step 1: Register + +Visit [Vercel](https://vercel.com) and sign in with GitHub. + +### Step 2: Import a project + +1. Click `Add New Project` +2. Select the GitHub repository you want to deploy +3. 
If needed, adjust GitHub app permissions + +### Step 3: Configure build settings + +Vercel often detects the framework automatically: + +| Framework | Build command | Output directory | +|------|----------|----------| +| React | `npm run build` | `build` | +| Vue | `npm run build` | `dist` | +| Next.js | `next build` | - | +| Plain HTML | - | project root | + +If detection fails, configure it manually: + +- **Build Command** +- **Output Directory** +- **Install Command** + +### Step 4: Deploy + +Click `Deploy` and wait for the build to complete. A successful project receives a `xxx.vercel.app` domain. + +### Step 5: Add a custom domain (optional) + +Use the `Domains` section in project settings to bind your own domain. HTTPS is handled automatically. + +--- + +# 3. Netlify + +Netlify is another strong frontend deployment platform, especially for static sites and single-page applications. + +Its strengths: + +- **Feature-rich hosting**, including form handling, auth, and edge/serverless functions +- **Strong Git integration** +- **Preview links for branches** +- **Global CDN** +- **Built-in form handling** +- **Built-in user authentication tools** + +> ⚠️ **Note**: Netlify may not be as fast as CloudBase for domestic Chinese users. + +## Deploy a web app with Netlify + +### Step 1: Register + +Visit [Netlify](https://www.netlify.com) and sign up with GitHub, GitLab, Bitbucket, or email. + +### Step 2: Import a project + +1. Click `Add new site` → `Import an existing project` +2. Choose your Git provider +3. Authorize Netlify +4. Select the repository + +### Step 3: Configure build settings + +| Framework | Build command | Publish directory | +|------|----------|----------| +| React | `npm run build` | `build` | +| Vue | `npm run build` | `dist` | +| Angular | `ng build` | `dist/` | +| Next.js | `next build` | `out` | +| Plain HTML | - | `.` | + +### Step 4: Deploy + +Click `Deploy site`. Once it succeeds, you will receive a `xxx.netlify.app` domain. 
+ +### Step 5: Add a custom domain (optional) + +1. Open the site settings +2. Go to `Domain management` +3. Add your custom domain +4. Follow the DNS instructions + +### Useful Netlify features + +#### 1. Form handling + +Netlify can capture form submissions without requiring a dedicated backend. + +```html +
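<form name="contact" method="POST" data-netlify="true">
+  <div>
+    <input type="text" name="name" placeholder="Name" />
+  </div>
+  <div>
+    <input type="email" name="email" placeholder="Email" />
+  </div>
+  <div>
+    <textarea name="message" placeholder="Message"></textarea>
+  </div>
+  <div>
+    <input type="submit" value="Send" />
+  </div>
+</form>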
+``` + +After deployment, Netlify automatically stores submission data and can forward it to email or other services. + +#### 2. Netlify Functions + +Netlify also supports serverless functions, which are useful for small APIs without maintaining a full backend. + +For example: + +```javascript +exports.handler = async (event, context) => { + return { + statusCode: 200, + body: JSON.stringify({ message: "Hello from Netlify!" }) + }; +}; +``` + +After deployment, the function is accessible at: + +`https://your-domain/.netlify/functions/hello` + +#### 3. Local development support + +Netlify provides a CLI: + +```bash +# Install Netlify CLI +npm install -g netlify-cli + +# Log in +netlify login + +# Start local development +netlify dev + +# Test functions locally +netlify functions:serve +``` + +This lets you simulate Netlify forms and function behavior locally before deploying. + +--- + +# 4. Zeabur + +Zeabur is a newer deployment platform that is especially useful for more complex projects involving multiple services. + +Its main strengths: + +- **Many built-in service templates** +- **Support for multiple deployment methods** +- **Flexible multi-service composition** +- **Usage-based billing** + +## Deploy Dify with Zeabur + +In earlier chapters, we already touched on Dify briefly. Now we can launch a full Dify service through [Zeabur](https://zeabur.com/projects) very easily. + +First, open the [console page](https://zeabur.com/projects): + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image5.png) + +In that interface, you will see a set of service blocks. At the top are options such as `Agent`, `Servers`, `Docs`, and `Templates`: + +1. **Agent**: Zeabur's built-in assistant for operational questions +2. **Servers**: add or buy cloud servers +3. **Docs**: official documentation +4. **Templates**: built-in application templates + +> An **image** can be understood as a packaged runtime environment + application state. 
If a service has already been configured successfully on one machine, it can be packed into an image and reused elsewhere. + +In the upper-right corner, you can also see your balance. By default, Zeabur usually gives you a small monthly free quota, roughly around 5 USD worth of usage. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image6.png) + +You can click the balance to inspect daily usage: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image7.png) + +Now let's create a Dify service. + +Start by clicking `New Project` on the [console homepage](https://zeabur.com/projects): + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image8.png) + +Zeabur supports several ways to create a service: + +1. **GitHub** + Connect your GitHub account and deploy directly from a repository. +2. **Template** + Start from a built-in app template such as Dify or n8n. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image9.png) +3. **Databases** + Deploy databases such as MySQL or MongoDB. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image10.png) +4. **Functions** + Deploy JavaScript or Python functions. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image11.png) + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image12.png) +5. **Local Project** + Upload a local folder and let Zeabur detect how to run it. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image13.png) +6. **Docker Image** + Deploy from an already built Docker image. + ![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image14.png) +7. **Cursor** + Deploy directly from a project you are editing in Cursor. + +If you want to deploy Dify, the easiest path is **Template**. Search for `dify`, choose a version you like, and continue. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image15.png) + +Then choose any project name. Zeabur will generate a temporary domain based on that name. 
+ +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image16.png) + +After creation, you will see multiple services starting one after another. Dify is not a single program, but rather a group of coordinated services, so you need to wait until they are all running. + +In many setups, you can click the main Dify app to get the access address. In this example, however, the final entry point is exposed through `nginx`, so you need to open the `nginx` service and find the public service address there. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image17.png) + +After waiting a bit, you should see the Dify login screen. Register an account with your email and password, and your own Dify service is ready. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image18.png) + +You can also launch `n8n` in a similar way if you want another AI workflow tool: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image19.png)![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image20.png) + +## Deploy a Snake game with Zeabur and Trae + +To explore Zeabur's more advanced usage, let's deploy something simpler first: a Snake game generated with Trae. + +### Deploy an HTML-based version + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image23.png) + +Trae can generate a browser-based Snake game from plain HTML very easily. Once the project is created locally, you can upload the whole folder to Zeabur using the local-project deployment method described above. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image24.png)![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image25.png)![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image26.png) + +After deployment, you will enter the service details page: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image27.png) + +Click `Network` on the left, find `Public Address`, and click `Generate Domain` to create a public URL. 
+ +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image28.png) +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image29.png) + +Once that address is generated, opening it in the browser will let you play your Snake game publicly: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image30.png) + +This same method works well for other static HTML-based web apps too. + +### Deploy a React version + +Now let's deploy a React app instead of a plain HTML app. Compared with static HTML, React is a more modern and component-based frontend framework, and it is common in production applications. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image31.png) + +#### Refactor into a React architecture + +In Trae, you can simply say: + +`Help me refactor this code into a React architecture.` + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image32.png) + +However, React apps are a bit more demanding to deploy because they rely on a build toolchain and a more structured project layout. + +One especially important issue is the **port**. A local React development server often listens on port `3000` by default. Zeabur, however, expects the deployed app to listen on port `8080`. + +If your React app still listens on `3000`, the deployment may fail because Zeabur cannot route traffic to it correctly. + +#### What is a port? + +You can think of the IP address as the building address and the port number as the room number. Together, `IP:port` points to a specific service. + +Most websites do not explicitly show a port because browsers automatically assume the default ports: + +- `80` for HTTP +- `443` for HTTPS + +But for app-specific services such as React development servers (`3000`) or Zeabur deployments (`8080`), the port becomes important. + +#### What does "listening on a port" mean? + +When a program listens on a port, it is telling the operating system: + +`I am waiting here for incoming network requests. 
Send them to me.` + +In the building analogy, the IP is the building address, and the port is the room number. The React dev server opens room `3000` and tells the building manager, "Any requests addressed to room 3000 should be delivered to me." + +When you run `npm start` locally, React commonly chooses port `3000`. Zeabur, however, is designed to work with apps listening on `8080`, so you need to change the default. + +#### Change the default listening port + +The easiest way is simply to ask Trae: + +`Please help me change the default port of this React project to 8080.` + +Trae can modify the relevant configuration for you. After that, rebuild the project and upload it to Zeabur again. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image33.png) +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image34.png) + +Once you configure the public network address just as you did for the HTML project, the React app can also be served successfully. + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image35.png) +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image36.png) + +The same idea applies to any other app that needs a port adjustment before deployment. + +--- + +# ⚠️ How to pause or delete a Zeabur project + +Because server resources cost money, you should always get in the habit of stopping services you are no longer using. + +Open the project's `Settings`: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image21.png) + +Scroll to the bottom, and you will see controls like the following: + +![](/zh-cn/stage-2/backend/2.5-zeabur-deployment/images/image22.png) + +You can: + +- click `Suspend All Services` to pause everything and reduce cost +- click `Restart All Services` to restart services if something is stuck +- click `Delete Project` if you are sure you no longer need it + +--- + +# Summary + +In this tutorial, we introduced four common deployment platforms: + +1. 
**Tencent Cloud CloudBase**: good for domestic Chinese users and strong WeChat integration +2. **Vercel**: excellent for modern frontend frameworks and GitHub-driven workflows +3. **Netlify**: strong for static sites that also need forms, auth, and other hosting features +4. **Zeabur**: very useful for more complex projects with multiple services and templates + +Which one you choose depends on your needs: + +- For primarily domestic Chinese audiences, **CloudBase** is often the best first choice +- For React, Next.js, and similar stacks, **Vercel** or **Netlify** are strong options +- For static sites that also need forms or auth, **Netlify** is especially useful +- For Dify, n8n, and other multi-service setups, **Zeabur** is often the easiest + +No matter which platform you choose, the deployment workflow is conceptually similar: + +**prepare the code → choose a platform → configure the build → deploy it** + +Once you understand that loop, you can start publishing your own projects for the world to use. diff --git a/docs/en/stage-2/backend/2.6-modern-cli/index.md b/docs/en/stage-2/backend/2.6-modern-cli/index.md new file mode 100644 index 0000000..c87a73a --- /dev/null +++ b/docs/en/stage-2/backend/2.6-modern-cli/index.md @@ -0,0 +1,702 @@ +# CLI AI Coding Tools + +In this tutorial, we introduce AI coding agents that run directly in the command line. They are different from the agents we used earlier in Trae and Cursor. CLI AI coding tools can only be used in the terminal. Compared with agents integrated into AI IDEs, they usually have longer context windows, faster tool-calling speed, and compatibility with a wider range of large models. In the latest AI Vibe Coding practice, we often prioritize CLI AI coding tools over built-in IDE coding agents. + +## Starting from the CLI + +Do you still remember the CLI we introduced before? 
CLI means using pure text commands in a terminal or command prompt to operate software applications, instead of relying on a graphical user interface (GUI). You can simply think of a GUI as the clickable interface with buttons on a computer or phone, where you do not need to type commands. + +> On Windows, common terminals include Command Prompt (`cmd`) and PowerShell. You can type `cmd` or `powershell` in the Run/Search box to launch them. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image1.png)![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image2.png) + +The CLI is naturally good for text-command workflows. Among a small group of geeks (programming enthusiasts pursuing extreme efficiency), CLI is even more popular than GUI. They want to complete everything with the keyboard and feel that moving the mouse can slow down coding efficiency. + +In industry, CLI is also often the most common interface form, because GUI requires the operating system to draw interfaces and manage windows, which demands more computer resources. CLI only needs to pass received commands to the system for execution. So when connecting to large-scale server clusters, we usually interact only through CLI. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image3.png) + +For many learners with no CLI experience, command-line operations can feel complicated, with too many commands, and even the fear of "accidentally breaking the computer." No need to worry. Remember how, in previous tutorials, we often asked Trae to help with basic operations? We can use exactly the same idea here. We can ask CLI coding tools to perform all CLI operations for us: entering specific folders, searching and processing files, running or copying open-source projects, and so on. The whole process can be completed through conversation with the CLI AI coding tool. + +## How Is It Different from an AI IDE + +We can compare CLI AI coding tools to z.ai and Trae that we used before.
In a sense, CLI AI coding tools can be seen as a special kind of z.ai: they also only need a simple chat entry, and then they automatically perform the required operations (sometimes you just need to open a browser manually to check the final result). If compared to AI IDEs, CLI AI coding tools can be seen as the Agent module inside an IDE, which is the side chat panel. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image4.png)![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image5.png) + +However, because different AI IDEs implement agents in different ways, their capability gaps are large, and AI coding quality is often unstable. CLI AI coding tools are usually developed directly by major tech companies, such as Anthropic behind Claude and OpenAI behind ChatGPT. + +Compared with other AI coding agents, directly using products from these major companies is often a better practice. Claude Code in particular is a tool used by Anthropic's own R&D teams, designed from the start around "meeting real engineer needs." 
+ +To compare more intuitively, we can look at the difference between Claude Code and one AI IDE agent (Cursor as an example): + +| Feature | Claude Code | Cursor | Better Choice | +| ------------------ | ----------------- | ------------------- | ------------- | +| Automatic execution | ✅ Very strong | ❌ Limited | Claude Code | +| IDE integration | ❌ CLI only | ✅ Native VS Code | Cursor | +| Real-time completion | ❌ None | ✅ Excellent | Cursor | +| Multi-file operations | ✅ Very strong | ⚠️ Pretty good | Claude Code | +| GitHub integrated workflow | ✅ Can commit directly | ⚠️ More manual | Claude Code | +| Learning cost | ⚠️ Medium | ✅ Easy to start | Cursor | +| Context length | ✅ Very long | ⚠️ Good | Claude Code | +| Debug assistance | ✅ Automated | ⚠️ More manual work | Claude Code | + +Table source: https://northflank.com/blog/claude-code-vs-cursor-comparison + +In short, CLI AI coding tools usually can: + +- Support much longer continuous conversations (they can even "work for you all day"). +- Provide longer context windows (you no longer need to frequently say "continue"). +- Respond faster (with support for more custom model APIs). + +For coding-related operations, they are usually smarter and more stable than most IDE built-in agents. + +## Common CLI AI Coding Tools + +Although there are many open-source implementations now, in practice we only recommend two major types of CLI AI coding tools as the "preferred combo." You can choose either one based on your habits, and we strongly recommend trying both before deciding which suits you best. + +- Codex uses GPT-5 and is stronger overall in capability. +- Claude Code, routed through GLM 4.6 compatible APIs, offers an experience close to Claude 4 at a lower cost. + +However, which one works better in your real project can only be determined by hands-on testing. Mastering multiple AI coding tools is always beneficial. 
Once you are skilled, you can switch flexibly among Claude Code, Codex, or Trae in different scenarios. If one tool does not perform well after multiple tries, just switch to another tool or model and continue experimenting. + +At the same time, because model versions update very quickly, we recommend prioritizing whichever option currently performs best in cost-performance (quality / cost). + +### Claude Code + +Claude Code is an AI coding tool developed by Anthropic based on Claude model capabilities. Its primary interaction happens in the terminal, and it can also be used as a VS Code extension. Similar to an agent inside an AI IDE, it can deeply understand a developer's repository and complete end-to-end development tasks through natural language instructions, including code editing, bug fixing, running and fixing tests, managing Git workflows (such as resolving merge conflicts and creating PRs), explaining complex code, and executing terminal commands. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image6.png) + +Claude Code's main advantages are: very long context windows (it can handle whole files or even small projects), proactively clarifying ambiguous requirements, automatically planning and allocating execution tasks, and deeply understanding and explaining the entire codebase. Compared with ordinary IDE agents, it is better suited for immersive vibe-coding workflows. + +In actual use, you can ask it through chat to create new projects, perform CLI operations (such as organizing folders, bulk renaming files, deploying open-source projects), and configure development environments (such as installing and debugging Python environments). If you find some code difficult to understand, or a folder structure unclear, you can directly ask Claude Code to generate structured analysis documentation or explain specific parts step by step. 
+ +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image7.png)![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image8.png) + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image9.png)![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image10.png) + +If you want to systematically learn Claude Code, you can refer to the course jointly launched by Andrew Ng and Anthropic: +https://www.bilibili.com/video/BV176t2zSEpr + +Next, we will learn how to use Claude Code. Because directly using the official Claude Code is often very expensive (as shown below), we will instead use API platforms that are compatible with the Claude Code protocol but backed by other large models. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image11.png) + +You need to learn the different options below (it is best to try all of them), and finally choose the one that suits you best as your main path. + +The first approach is to directly use APIs that are "Anthropic-interface compatible." As Claude Code becomes more popular, more model providers now support Anthropic-style invocation. Common providers include GLM, Kimi, DeepSeek, and SiliconFlow. They all provide compatible API interfaces. We will explain specific configuration details later. + +One thing to note: Claude Code usually consumes a lot of tokens. If you are worried about high API costs, you can consider GLM monthly plans (about 20 RMB/month) to control cost. If you first want to estimate actual spending, you can also top up 10 RMB for small-scale experiments. + +Another approach is using the "Claude Code Router" project. It is an open-source tool that supports all common API invocation interfaces and allows fine-grained model configuration for different scenarios, including local model access. But this option is more complex to configure, so we suggest starting with the first approach.
+ +#### Use Zhipu GLM as the Backend (Recommended) + +GLM (General Language Model) is a series of large language models independently developed by Zhipu AI. GLM-4.6 is currently the latest version in the GLM family. Its core highlight is strong coding performance (benchmarked against Claude Sonnet 4 in public benchmarks and real-world tasks, and regarded as top-tier among Chinese models). + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image12.png) + +It also extends the context window to 200K, allowing easier handling of long text and large codebases, while strengthening reasoning and tool-calling capabilities, achieving a good balance between performance and cost. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image13.png) + +Before connecting GLM, we first need to install Claude Code. + +If command-line installation feels troublesome, or errors appear midway, you can directly ask Trae's Agent to complete installation for you. + +```bash +# Install Claude Code +npm install -g @anthropic-ai/claude-code + +# Enter your project +cd your-awesome-project + +# Start Claude Code +claude + +# Press Ctrl+C to exit Claude +``` + +Next, we need to change Claude Code's default API request endpoint so it supports GLM's API service. You can copy the content below and ask Trae to create the corresponding environment variables for you. You can also choose to write them permanently into system environment variables (if issues occur, you can also ask Agent to help modify them). + +First, you need to obtain your GLM API key and store it in whatever way is most convenient for you.
+ +- Domestic URL: https://bigmodel.cn/usercenter/proj-mgmt/apikeys +- International URL: https://z.ai/manage-apikey/apikey-list + +If you are using the **domestic GLM** service, use the following variable configuration: + +```bat +REM Run the following commands in Cmd +REM Replace your_zhipu_api_key with the API key you just obtained +setx ANTHROPIC_AUTH_TOKEN your_zhipu_api_key +setx ANTHROPIC_BASE_URL https://open.bigmodel.cn/api/anthropic +``` + +If you are using the **international GLM** service, use this configuration: + +```bat +REM Run the following commands in Cmd +REM Also replace your_zai_api_key +setx ANTHROPIC_AUTH_TOKEN your_zai_api_key +setx ANTHROPIC_BASE_URL https://api.z.ai/api/anthropic +``` + +You can directly enter a prompt like this in Trae: + +⚠️ If you configure "permanent environment variables" through Trae, then after configuration you **must restart Trae**. Otherwise environment variables in Trae's built-in terminal will not refresh, which may cause login failures or network connection errors. + +```text +Based on my environment variable settings: +setx ANTHROPIC_AUTH_TOKEN your_zai_api_key +setx ANTHROPIC_BASE_URL https://api.z.ai/api/anthropic + +and my key (replace it with your own key): +681fea485851d29060cc.13gfaendggaFOhb + +please help me configure and start Claude Code +``` + +You will see output similar to the following: + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image14.png) + +> 💡 What is an environment variable? +> +> Environment variables are essentially key-value configuration entries stored in the operating system, usually in the form "variable name = specific value." If configured in advance in terminal or system settings, programs can read these variables at any time to obtain relevant information. Because environment variables can be written directly in terminal without modifying code, we usually store large-model access keys in environment variables to avoid leakage.
Programs only need to read corresponding environment variables to complete model invocation. +> +> In Windows, besides storing model access keys, environment variables are also commonly used to store executable "path locations" for command-line tools. +> +> We know the terminal itself is also a program. Sometimes we want to launch an external program from terminal. For example, typing `claude` in terminal to launch Claude Code. The reason this works is that terminal reads system environment variables, and the PATH variable contains the directory where Claude Code executable resides, so terminal can find and execute it (equivalent to pasting that program's absolute path into terminal and pressing Enter). +> +> A typical environment variable may look like this: `PATH=C:\Windows\system32;C:\Program Files\Python`. Then we can execute those programs from any directory, for example directly typing `python` in command line to start the Python interpreter. +> +> If you want to view current system environment variables, type "environment variables" in Windows Search, then in the "Edit the system environment variables" window you can see all variables and their values. Some store model keys, while others add program directories for invocation from any path. + +Now you can use the latest GLM for Claude Code development. You can try rerunning previous projects, or retry tasks that Trae did not complete well, and compare the experience differences. + +🎉 Rebuilding repeatedly is not a waste of time. Every repetition makes your skills more solid. + +Using exactly the same logic as with GLM, you can also connect other interfaces that support Anthropic-compatible formats. + +#### Use Kimi K2 as the Backend (Recommended) + +Kimi K2 is a new-generation large language model released by Moonshot AI, with excellent performance in code understanding and generation. 
Kimi K2 supports ultra-long context windows (up to 200K tokens), and can easily handle large repositories and complex projects. + +**Core advantages:** +- **Ultra-long context**: Supports 200K context window, enabling one-pass handling of whole-project code +- **Strong coding ability**: Performs very well in generation, refactoring, and debugging +- **Better Chinese understanding**: More accurate understanding of Chinese programming requirements +- **Stable tool invocation**: Supports reliable function-calling and tool usage + +**Get API Key:** + +Visit https://platform.moonshot.cn/console/account to register and obtain an API key. + +**Configuration method:** + +Reference docs: https://platform.moonshot.cn/docs/guide/agent-support + +```bash +export ANTHROPIC_BASE_URL=https://api.moonshot.cn/anthropic +export ANTHROPIC_AUTH_TOKEN=sk-YOURKEY +``` + +#### Use Minimax as the Backend (Recommended) + +Minimax is a new-generation large language model released by MiniMax, with excellent performance on programming tasks. Minimax models are known for strong reasoning and code-generation quality, especially suitable for complex programming scenarios. + +**Core advantages:** +- **Strong reasoning**: Performs well in complex logic reasoning and code architecture design +- **High code quality**: Generated code is clear in structure and readable +- **Multi-language support**: Supports code generation and conversion across multiple languages +- **Fast response speed**: API responds quickly, suitable for high-frequency invocation scenarios + +**Get API Key:** + +Visit https://platform.minimax.io/ to register and obtain an API key. 
+ +**Configuration method:** + +```bash +export ANTHROPIC_BASE_URL=https://api.minimax.io/anthropic +export ANTHROPIC_AUTH_TOKEN=YOUR_MINIMAX_API_KEY +export ANTHROPIC_MODEL=MiniMax-M2.7 +``` + +#### Use DeepSeek as the Backend (Recommended) + +DeepSeek is an open-source large language model released by DeepSeek, popular among developers for strong coding capabilities and high cost-performance. DeepSeek Coder is specially optimized through training for programming tasks. + +**Core advantages:** +- **Outstanding coding capability**: Strong performance in code generation, understanding, and bug fixing +- **Open-source and customizable**: Open-source model, can be fine-tuned based on needs +- **High cost-performance**: Relatively low API pricing, suitable for high-frequency use +- **Good Chinese support**: Accurate understanding of Chinese programming scenarios + +**Get API Key:** + +Visit https://platform.deepseek.com/usage to register and obtain an API key. + +**Configuration method:** + +```bash +export ANTHROPIC_BASE_URL=https://api.deepseek.com/anthropic +export ANTHROPIC_AUTH_TOKEN=YOUR_DEEPSEEK_API_KEY +export API_TIMEOUT_MS=600000 +export ANTHROPIC_MODEL=deepseek-chat +export ANTHROPIC_SMALL_FAST_MODEL=deepseek-chat +export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 +``` + +#### Use Volcano Engine Coding Plan as the Backend (Recommended) + +Volcano Engine is ByteDance's cloud service platform, providing enterprise-level AI model services. Volcano Engine's Coding Plan is specially optimized for coding scenarios, offering stable and efficient code-generation capability.
+ +**Core advantages:** +- **Enterprise-grade stability**: Provides SLA guarantees for service stability +- **Coding-scenario optimization**: Specifically optimized for programming tasks +- **Rich model choices**: Supports multiple models including Doubao-pro and Doubao-lite +- **Fast domestic access**: Domestic node deployment with faster access speed + +**Get API Key:** + +Visit https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey to register and obtain an API key. + +**Configuration method:** + +```bash +export ANTHROPIC_BASE_URL=https://ark.volces.com/api/anthropic +export ANTHROPIC_AUTH_TOKEN=YOUR_VOLCANO_API_KEY +export ANTHROPIC_MODEL=doubao-pro-32k +``` + +#### Other Anthropic-Compatible APIs + +Siliconflow: + +```bash +export ANTHROPIC_BASE_URL="https://api.siliconflow.cn/" +export ANTHROPIC_MODEL="moonshotai/Kimi-K2-Instruct-0905" # You can change to the model you need +export ANTHROPIC_API_KEY="YOUR_SILICONCLOUD_API_KEY" # Replace with your API key +``` + +Aliyun DashScope (Aliyuncs): https://help.aliyun.com/zh/model-studio/get-api-key + +```bash +export ANTHROPIC_BASE_URL="https://dashscope.aliyuncs.com/apps/anthropic" +export ANTHROPIC_API_KEY="YOUR_DASHSCOPE_API_KEY" +``` + +::: details Use Claude Code Router as the Backend (Advanced Usage) + +Above we explained how to replace Claude Code's Anthropic interface with the official GLM API. Next, let's look at how Claude Code Router allows Claude Code to adapt to more model APIs. + +[Claude Code Router](https://github.com/musistudio/claude-code-router) is an intelligent routing enhancement tool designed specifically for Claude Code. Its core function is helping users distribute AI requests to models across different platforms as needed, with a high degree of customization. It supports access to dozens of platforms including OpenRouter, DeepSeek, Ollama, Gemini, and more. It can also route tasks to specific models by scenario, such as GLM-4.5, Kimi-K2, and Qwen3-Coder.
For example, you can route background tasks to local Ollama to save cost, route long text / long code tasks to Gemini-2.5-Pro, and route code explanation to DeepSeek. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image16.png) + +This tool also provides convenient UI/CLI configuration management and uses converters to adapt API formats from different platforms. It supports automation integration such as GitHub Actions and custom extensions, solving the problems of "one single model cannot cover all scenarios" and "frequent platform switching is troublesome," helping users use AI tools more flexibly and at lower cost. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image17.png) + +Below is a quick introduction to installing Claude Code Router. The rough steps are as follows (you can also ask Trae to execute them) to prepare the environment: + +```markdown +npm install -g @anthropic-ai/claude-code +npm install -g @musistudio/claude-code-router +``` + +After installation, you need to confirm the `ccr` command is available locally. If you see output similar to the following, installation is successful: + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image18.png) + +Next, there are two ways to initialize and configure models: + +- Use CCR's built-in UI and configure on its browser page. +- Directly edit CCR's default configuration file (the UI essentially edits the config file as well, just with a more intuitive interface). + +If you choose CCR UI, you will see an interface similar to this: + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image19.png) + +At this point, click the "Add Provider" button to see the following interface. You need to: + +1. Enter the provider name in Name; +2. Fill in that provider's OpenAI-compatible endpoint in API Full URL; +3. Fill in the corresponding platform API key in API Key; +4. Fill model names in Models area, then click "Add Model"; +5. Finally click "Save" to persist configuration. 
+ +(If you scroll downward there are many advanced options, but you can ignore them for now.) + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image20.png) + +Here are configuration examples for DeepSeek and Kimi: + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image21.png) + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image22.png) + +After saving model configuration, you also need to specify the default model in the Router area on the right. Select from the dropdown and set it to `kimi` (recommended), then click `Save and Restart` in the top-right corner. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image23.png) + +After that, simply run `ccr code` in terminal to start Claude Code workflow through Claude Code Router. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image24.png) + +::: + +#### Advanced Usage of Claude Code + +Many people initially use Claude Code only as a normal chat tool. But in fact it has many built-in capabilities that can make your workflow more efficient and flexible. 
Here are common commands and usage examples: + +Reference docs: + +https://docs.claude.com/en/docs/claude-code/cli-reference +https://docs.claude.com/en/docs/claude-code/slash-commands + +| Command | Purpose | Example | +| ----------------- | ----------------------------------------- | ---------------------------------------- | +| claude | Start interactive mode | `claude` | +| claude "query" | Run one-off task and output result | `claude "explain this project"` | +| claude -p "query" | Ask one-off question and auto-exit | `claude -p "explain this function xxxx"` | +| claude -c | Continue most recent session | `claude -c` | +| claude -r | Resume previous session | `claude -r` | +| /resume | Switch to previous session in current chat | `claude -c`, `/resume` | +| /plugin | Manage plugins and install submit/review extensions | `/plugin` | +| /init | Initialize project description with CLAUDE.md | `/init` | +| /clear | Clear current context to prevent overload | `/clear` | +| /compact | Compress history and reduce context token usage | `/compact` | +| /cost | View current cost usage | `/cost` | +| /model | Switch model (usually ignorable with compatible APIs) | `/model` | +| /memory | Manage CLAUDE.md memory file | | +| /help | Show available command list | `/help` | +| exit or Ctrl+C | Exit Claude Code | `exit` or `Ctrl+C` | +| /agents | Advanced feature, explained later | | +| /mcp | Advanced feature, explained later | | + +**CLAUDE.md** + +Reference: https://www.anthropic.com/engineering/claude-code-best-practices + +`CLAUDE.md` is a special file that Claude automatically reads and includes in context at the beginning of a session. So it is very suitable for recording: + +- Common bash commands +- Core files and utility functions +- Code style conventions +- Testing method notes +- Repository collaboration conventions (for example branch naming, merge vs rebase, etc.) 
+- Development environment setup notes (for example whether to use pyenv, preferred compiler, etc.) +- Behaviors or pitfalls that need extra attention in the project +- Any information you want Claude to "remember" + +`CLAUDE.md` itself has no strict format requirement, as long as it is concise and human-readable. For example: + +``` +# Bash commands +- npm run build: Build the project +- npm run typecheck: Run the typechecker + +# Code style +- Use ES modules (import/export) syntax, not CommonJS (require) +- Destructure imports when possible (eg. import { foo } from 'bar') + +# Workflow +- Be sure to typecheck when you’re done making a series of code changes +- Prefer running single tests, and not the whole test suite, for performance +``` + +#### Internal Principles of Claude Code + +Reference: https://github.com/shareAI-lab/analysis_claude_code + +If you are curious why Claude Code performs better than Trae or Cursor agent tools in many scenarios, we can briefly look at its internal working mechanism. + +The overall implementation style of other CLI AI coding tools is broadly similar. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image25.png) + +Claude Code decomposes coding tasks into a continuous "perceive - think - act - verify" loop and invokes different tools in the loop to complete work. It imitates human developer workflow: continuously "write code -> run -> inspect result -> improve again." Internally, a main task loop continuously executes steps. In each cycle, Claude can call different tools, such as reading/writing files, executing commands, and searching code, then decide next actions based on real tool outputs. + +Several key characteristics are worth noting: + +- **Stream Processing**: Claude can think while outputting results, instead of waiting to finish all code before execution. +- **Intelligent Compression**: Long conversations can make context too large. 
Claude compresses history into key information to reduce "forgetting," and distinguishes long-term vs short-term memory to keep execution efficient. +- **Concurrency Control**: Internal parallel design allows multiple tasks to proceed simultaneously without interference. +- **Sub-agent Management**: In real work it is not just one single "role" handling everything. You can manage multiple sub-agents collaboratively, each responsible for different tasks, such as dedicated testing or documentation agents. + +### Codex + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image26.png) + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image27.png) + +Similar to Claude Code, Codex is an AI collaborative coding tool developed by OpenAI. You can think of it as the "OpenAI version of Claude Code." Its biggest advantage is efficient adaptation to GPT-5. + +From practical experience, GPT-5 currently responds faster and makes fewer mistakes (higher success probability in complex multi-round tasks). One drawback is that explanations can feel more "academic" and technical, sometimes too rigorous and information-dense, which can be slightly harder for beginners. + +You can install Codex with the following command: + +``` +npm i -g @openai/codex +``` + +#### Use Official OpenAI API as the Backend + +If you directly use the official OpenAI entry for Codex, setup is very simple. Once you have OpenAI subscription access or corresponding API quota, you only need to run `codex` in command line and follow the prompts to complete login. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image28.png) + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image29.png) + +#### Use Relayed OpenAI API as the Backend + +Because official OpenAI API can have issues such as high cost and strict network requirements, we can also avoid those restrictions by routing through other API gateway services. 
+ +With this approach, we only need to buy corresponding Codex API quota on a third-party relay platform, and we can get an experience close to native OpenAI Codex. + +Reference: https://open-dev.feishu.cn/wiki/PAqUwWG4IiuwTvkQ2sGcaQuPnXc +Recharge URL: https://api.zyai.online/account/topup/recharge + +One thing to note: after obtaining token quota, we still need to configure the API key locally. + +In key-group settings, make sure you choose the item specifically for Codex. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image30.png) + +Next, we need to fill the key you obtained into the prompt below, then give the entire prompt to Trae so it can complete the whole configuration process for you: + +````bash +My API key is: [Paste your obtained sk-xxxxx key here] + +Please help me complete the following configuration tasks: + +1. Create configuration directory + - Create a `.codex` folder under my user directory + - Windows path should be: `C:\Users\[My Username]\.codex` +2. Back up existing configuration (if it exists) + - Check if `.codex\config.toml` exists + - If it exists, rename it to `config.toml.bak.[current timestamp]` (timestamp format: yyyyMMddHHmmss) +3. Create configuration file + - Create `config.toml` in the `.codex` directory + - Write the following complete content: + ```toml + preferred_auth_method = "apikey" + + [model_providers.myrelay] + name = "My Relay Station" + base_url = "https://api.zyai.online/v1" + env_key = "MYRELAY_API_KEY" + wire_api = "responses" + request_max_retries = 4 + stream_max_retries = 10 + stream_idle_timeout_ms = 300000 + + [profiles.myrelay] + model_provider = "myrelay" + model = "gpt-5" + model_reasoning_effort = "medium" + + [tools] + web_search = true + ``` + +4. Set system environment variable +Variable name: MYRELAY_API_KEY +Variable value: The key I gave you + +5.
Confirm completion and report back: + +The full path of the configuration file +Whether the environment variable was set successfully +I can use the command `codex --profile myrelay` to run it +```` + +After configuration, you can launch Codex with relayed API through `codex --profile myrelay`. Usage afterward is similar to Claude Code: just keep entering your ideas and requirements in chat at any time. + +## More Use Cases for CLI AI Coding Tools + +### Use AI to Write Requirement Documents: Learn to "Concretize Requirements" + +For large language models, abstract requirements should be "concretized." For example, "I'm hungry" is an abstract requirement. We need to convert it into: "My stomach is a little hungry. I may need a red-bean bun and a cup of soy milk." That becomes an executable, concrete requirement. + +But turning abstract requirements into concrete ones is actually a process that costs a lot of energy. If we have not seen enough examples, it is hard to quickly think of how to break abstract problems into detailed modules. In such cases, the best approach is to let AI help with this "concretization" step. +For example, if I want to build a daily planner app, the most basic idea might be: + +`Please help me write a daily planning app where I can write my plans each day and get reminders.` + +AI can indeed split tasks directly from this requirement and complete them step by step, but errors or understanding drift can occur in the middle. To reduce this risk, we can ask AI to expand the requirement first: + +`Based on my needs, please elaborate and provide a more detailed Product Requirement Document for reference. My idea is: Please help me write a daily planner app that supports daily plan-writing and provides reminders .` + +At this point, AI may produce a complete PRD like the following: + +``` +Product Requirements Document (PRD): “Today’s Plan” App +Document Version: 1.0 +Creation Date: October 27, 2023 +Author: (Your Name/Product Manager) + +1. 
Product Introduction +1.1 Product Name +Today's Plan +1.2 Product Vision +“Today’s Plan” is a minimalist and highly efficient daily planning and reminder tool, dedicated to helping users eliminate procrastination and forgetfulness, plan every day clearly, and ensure tasks are carried out through an intelligent reminder system—ultimately enabling users to gain a stronger sense of control and achievement over their time. +1.3 Target Users (User Personas) +We mainly serve three types of users: +Students (Xiao Ming): +Characteristics: Multiple tasks such as courses, assignments, club activities, exam prep, needing organized time arrangement. +Pain Points: Easily forget small tasks or assignment deadlines; feel overwhelmed switching between tasks; want to build regular study and life habits. +Needs: A simple tool to list daily to-dos and provide reminders before class/self-study. +Office Workers (Zhang Wei): +Characteristics: Fast-paced work, many meetings, reports, project milestones, and personal affairs (fitness, picking up children). +Pain Points: Easily forget important meetings or work milestones; get interrupted by urgent tasks and forget the original plan; feel busy but inefficient at end of day. +Needs: Need a tool to quickly record and schedule daily work and send strong reminders at key times (e.g., 15 minutes before meetings). +Freelancers/Self-disciplined Seekers (Li Na): +Characteristics: High freedom of time, but strong self-management required for work output and personal growth. +Pain Points: Easily procrastinate, lack external supervision; start the day without a clear plan, leading to low time utilization. +Needs: Need a tool to help build a daily fixed routine (Morning Routine) and review daily achievements for positive feedback. + +2. User Stories +As a user, I want to quickly create today’s plan list so I have an overview of all my tasks for the day. 
+As a user, I want to set specific start and end times for each task so I can create a visual timeline. +As a user, I want to receive push notification reminders before a task starts so I won’t miss any important arrangements. +As a user, I want to customize the reminder time (such as 5, 15, or 60 minutes in advance) so reminders better fit my habits. +As a user, I want to easily mark completed tasks so I can feel accomplished and clearly see my progress. +As a user, I want to see a summary of my completed plans at the end of each day for reviewing and self-motivation. +As a user, I want to conveniently edit and delete tasks to handle last-minute changes. +As a user, I want to view plans and achievements from previous days to review my efficiency and habits. + +3. Feature Breakdown +Core Features (MVP - Minimum Viable Product) +Module 1: Plan Management +3.1.1 Daily Plan Homepage +Interface: “Today” as the core view, current date shown at the top. +View: Timeline list, clearly showing tasks scheduled from morning to evening. Tasks without a time can be listed in the top or bottom “To-do List” section. +Interactions: +Click the “+” button in the bottom right to quickly create a new task. +Pull down to refresh the page. +Swipe left/right to view yesterday’s and tomorrow’s plans. +3.1.2 Create/Edit Task +Entry: Click “+” on the homepage or a time slot in the list. +Fields: +Task title (required): Briefly describe the task, e.g., “10 AM Weekly Product Meeting.” +Task time (optional): +Set “start time” and “end time.” +Provide “all-day” option for unspecified time tasks. +Default time picker should be quick and convenient. +Reminder setting (required, with default value): See Module 2. +Notes (optional): Add further descriptions, links, or location info. +Actions: Save, cancel, delete task. +3.1.3 Task Interaction +Mark as complete: Checkbox before each task; checking adds a strikethrough and gray background, indicating completion. Can unmark if needed. 
+Edit task: Click the task itself to enter edit page. +Delete task: Swipe left on a task to reveal “Delete” button. +Module 2: Smart Reminder System +3.2.1 Reminder Trigger +Mechanism: Based on task’s set “start time” and the user’s “reminder lead time,” send a push notification from device. +Offline Support: Locally scheduled reminders must trigger even if user is offline. +3.2.2 Reminder Content & Format +Notification title: App name “Today’s Plan.” +Body: “Reminder: [Task Title] will start at [Start Time].” E.g., “Reminder: Product Meeting will start at 10:00.” +Sound: Use system default or offer several simple, effective tones. +3.2.3 Reminder Settings +Global Settings (in Settings page): +User can set a default reminder time, e.g., “15 minutes before task starts.” New tasks adopt this by default. +Single Task Settings (in create/edit page): +Users can override global settings for important tasks, choosing specific reminder times like "on time," "5 minutes early," "30 minutes early," or "1 hour early." +Provide “no reminder” option. +Subsequent Features (V1.1, V2.0) +3.3 Daily Review & Statistics +Push a summary notification at a set time every night (e.g., 22:00): “How was your day? Take a look at your achievements!” +Generate a simple daily report card: shows total planned tasks, completed tasks, completion rate, plus an encouraging message. +3.4 History Review +Calendar view to click on any past day and check its plans and completion status. Days with high completion rates marked with a special color. +3.5 Templates +Allow users to save a successful daily plan as a template, e.g., “Efficient Workday,” “Relaxing Weekend.” +When creating tomorrow’s plan, one-click import a template, modify slightly to save time. +3.6 Themes & Personalization +Offer dark mode. +Allow changing several primary color themes. + +4. Non-Functional Requirements +4.1 Performance +Response: App launch time under 2 seconds; adding/editing tasks must be smooth and lag-free. 
+Resource Use: Low battery and memory consumption in background; do not over-consume resources waiting for reminders. +4.2 Usability +Minimal & intuitive: UI must be minimal, primary functions accessible within 3 clicks. No tutorial needed for new users. +Error tolerance: Offer undo (e.g. brief undo after mistakenly deleting a task). +4.3 Reliability +Reliable reminders: Reminder function is the product’s lifeline; must guarantee 99.99% timely and accurate delivery. +Data loss-free: User plans must be reliably stored locally. Future versions can support cloud sync to prevent data loss on device change. +4.4 Compatibility +Platform: Support major iOS and Android versions (latest 3-4 releases). +Screen: Layout must fit various phone screen sizes. + +5. Roadmap +V1.0 (MVP): +Goal: Validate core value—planning & reminders. +Features: Complete all “Core Features” described above (Plan management, smart reminders). +V1.1 (Quick Optimization): +Goal: Improve retention and achievement. +Features: Add “Daily Review & Statistics,” “History Review.” +V2.0 (Enhanced Experience): +Goal: Increase efficiency and personalization. +Features: Add “Templates,” “Themes & Personalization,” and start developing “Cloud Sync.” +``` + +Compared with our initial sentence "help me write an app where I can record plans and get reminders every day," this document is now far more detailed. You can add, remove, and revise content based on real needs. For modules you are unsure about, you can keep asking AI for more alternatives, then select and merge them into a final version. + +In this way, we can easily turn abstract ideas into concrete descriptions. For AI development, "concrete" means productivity. The more concrete the requirement is, the easier it is to get stable structure and higher-quality project output. You can try redoing one of your previous small projects in this way and compare the difference. 
+ +If you feel this kind of "requirement prompt" is too long, a very natural approach is to write it into a standalone Markdown document as your requirement document / development document / PRD. Then each time you ask AI to build a project, you only need to ask it to "refer to this document" instead of retyping long prompts every time. You can also continuously improve this document across iterations so future projects benefit directly. + +Below are some other common use cases: + +### Manage Folders + +We can try using CLI AI coding tools to manage various files in the current folder. For example, if you have a pile of messy files that need sorting and grouping, you can tell Claude Code or Codex: + +`Please help me organize the contents of the current folder. I want to group files with the same content together & I want to group files from the same time period together. Please help me handle this.` + +### Develop New Projects + +This is almost exactly the same as how we previously used z.ai and Trae. We can directly use CLI AI coding tools to develop brand-new projects from scratch. Of course, it is best to prepare a requirement document in advance. + +The more detailed the requirement document, the better the final result. You can optimize that document across multiple rounds as your ideas evolve. The more complete the document, the more stable and mature the implementation usually becomes. + +### Deploy Open-Source Projects (for example Dify) + +For learners who are new to computers, deploying an open-source project from GitHub is often difficult. But we can fully hand this over to Claude Code, just as we did in the Dify tutorial: + +https://github.com/langgenius/dify + +If I want to run my own local Dify, I only need to give this link to Claude Code, then type: + +`I want to deploy this GitHub project https://github.com/langgenius/dify.
Please help me clone the project and run it.` + +After receiving your request, Claude Code will automatically complete a series of operations, including pulling code from GitHub, configuring runtime environments, and starting the project. If any step fails or startup status is abnormal, you only need minor manual handling based on prompts. Beyond Dify, you can also ask Claude Code to deploy most common open-source GitHub projects for you. You just need one chat box and the time to drink a cup of coffee ☕️. + +![](/zh-cn/stage-2/backend/2.6-modern-cli/images/image31.png) + +### Explain Code and Write Documentation + +For some complex projects, or large projects generated by AI, you may feel the code is too long and logic is too dense to understand. At this time, you can ask CLI AI coding tools to "read code" for you. You can ask like this: + +- Please explain this project to me: how to run it, how to use it, and how to modify and continue developing it later. +- Please explain the overall workflow of this project: how does the program run, and what actions can users perform in the interface? +- Please write complete documentation for this project, including development docs and run docs. +- Based on everything in my current folder, write a detailed explanation and save it into a specified Markdown document. + +### More Use Cases + +Of course, CLI AI coding tools can do far more than what we listed above. Do not treat them only as "code-writing tools." Treat them as intelligent agents with independent action capabilities. You can ask them to: + +- Manage and organize local files; +- Write journals and summaries; +- Analyze and fix system errors; +- Execute various repetitive command-line tasks. + +In the near future, it may become your most important and most understanding AI companion on your computer. 
diff --git a/docs/en/stage-2/backend/2.7-stripe-payment/index.md b/docs/en/stage-2/backend/2.7-stripe-payment/index.md new file mode 100644 index 0000000..d594a04 --- /dev/null +++ b/docs/en/stage-2/backend/2.7-stripe-payment/index.md @@ -0,0 +1,3 @@ +# How to Integrate Stripe and Other Billing Systems + +> This chapter is currently being written. Stay tuned... diff --git a/docs/en/stage-2/frontend/2.0-lovart-assets/index.md b/docs/en/stage-2/frontend/2.0-lovart-assets/index.md new file mode 100644 index 0000000..c38723c --- /dev/null +++ b/docs/en/stage-2/frontend/2.0-lovart-assets/index.md @@ -0,0 +1,951 @@ + + +# Starting from NanoBanana: Build Your Own Asset Production Agent + +## Chapter 1: Generate Your First Image Asset in 1 Minute + +Before we discuss design, style, or prompt engineering, let's generate the first image with the fewest possible steps. + +### 1.1 Meet NanoBanana + +Before discussing design style and prompt engineering, let's solve a more important thing first: **confirm that you can actually generate an image.** + +Mainstream large models now already support image generation and editing. These are usually called **generative models**. + +To keep the process as simple as possible, this tutorial uses a model with stable image generation and editing capabilities as the example: NanoBanana. It is an image generation model from Google. Its formal name is **Gemini 3.1 Flash Image Preview**. It supports direct image generation from natural language, and also supports editing based on existing images. 
+ +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image1.png) + +In terms of core capability, it is not fundamentally different from other models you may have heard of (such as GPT-4o, Claude, Qwen, Midjourney, and others): **you provide the description, and the model generates the result.** + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image2.png)![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image3.png)![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image4.png) + +You can think of it as a "brush." In this chapter we care about only one thing: +👉 **can this brush draw its first stroke in your hands?** + +In practical usage, NanoBanana can be used directly through official platforms like **Google AI Studio**, and it can also be integrated into development workflows via **API**. This tutorial uses the API approach. A NanoBanana 2 model is also available now, and you can try the latest model as well. + +### 1.2 A "Hello World" Level Generation + +Before we start, you only need to complete these three steps: + +1. Create a new folder in Trae + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image5.png) + +2. Create a new Python file + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image6.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image7.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image8.png) + +3. Paste the full code below + +Trae will automatically complete environment setup and dependency installation. No extra configuration is needed. + +The code uses a NanoBanana API Key. We will not expand on the application process here. As long as you can obtain the key and fill in the corresponding parameter, that is enough. **At this stage, you do not need to understand every line of code. 
It only needs to run successfully.** + +```Python +# /// script +# dependencies = [ +# "gradio>=4.0.0", +# "pillow>=10.0.0", +# "requests>=2.31.0", +# ] +# /// + +import gradio as gr +import requests +import base64 +from PIL import Image +import io +import os +import time +import re +from typing import Optional, Dict, Any, List + +# 配置 API 信息 +NANOBANANA_API_URL: str = "YOUR API URL" +NANOBANANA_API_KEY: str = "YOUR API KEY" +OUTPUT_DIR: str = "outputs" + +# 确保输出目录存在 +os.makedirs(OUTPUT_DIR, exist_ok=True) + +def image_to_base64_data_uri(image: Image.Image) -> str: + """ + 将 PIL 图像转换为 OpenAI API 兼容的 data URI 格式。 + """ + buffer = io.BytesIO() + # 统一转为 PNG 以保证兼容性 + image.save(buffer, format="PNG") + encoded = base64.b64encode(buffer.getvalue()).decode('utf-8') + return f"data:image/png;base64,{encoded}" + +def base64_to_image(base64_str: str) -> Optional[Image.Image]: + """ + 将纯 base64 字符串转换为 PIL Image。 + """ + try: + image_bytes = base64.b64decode(base64_str) + return Image.open(io.BytesIO(image_bytes)) + except Exception as e: + print(f"Base64 解码失败: {e}") + return None + +def extract_base64_from_response(content: Any) -> Optional[str]: + """ + 核心解析逻辑:从 API 返回的 content 中提取图片 Base64 数据。 + 兼容 Markdown 格式和结构化列表格式。 + """ + if not content: + return None + + base64_data = None + + # 1. 
尝试结构化提取 (List) + # 对应返回格式: [{"type": "image_url", "image_url": {"url": "data:..."}}] + if isinstance(content, list): + for part in reversed(content): # 倒序查找,通常最新的图片在最后 + if isinstance(part, dict): + # 检查 image_url 或 output_image 字段 + img_field = part.get("image_url") or part.get("image") or part.get("output_image") + if isinstance(img_field, dict): + url = img_field.get("url", "") + if url.startswith("data:image/") and "," in url: + return url.split(",", 1)[1].strip() + + # 如果列表中没有结构化图片,尝试把列表里的文本拼起来找 Markdown + text_parts = [ + str(p.get("text", "")) + for p in content + if isinstance(p, dict) and p.get("type") in ["text", "input_text"] + ] + content_str = "".join(text_parts) + else: + content_str = str(content) + + # 2. 尝试 Markdown 正则提取 (String) + # 对应返回格式: "Here is your image: ![img](data:image/png;base64,AAAA...)" + pattern = re.compile(r"!\[.*?\]\((data:image/[^;]+;base64,[^)]+)\)", re.IGNORECASE) + match = pattern.search(content_str) + + if match: + data_url = match.group(1) + if "," in data_url: + return data_url.split(",", 1)[1].strip() + + return None + +def synthesize(prompt: str, input_image: Optional[Image.Image]) -> Optional[Image.Image]: + """ + 调用 Nanobanana API 进行生成。 + """ + if not prompt or not prompt.strip(): + gr.Warning("请输入提示词") + return None + + print(f">>> 开始任务: {prompt[:50]}...") + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {NANOBANANA_API_KEY}" + } + + # 构造符合 OpenAI Vision / Chat 标准的 payload + messages = [] + + if input_image is not None: + # 图生图/多模态输入模式 + print(">>> 检测到输入图片,使用多模态模式") + img_base64 = image_to_base64_data_uri(input_image) + messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": img_base64}} + ] + }) + else: + # 纯文生图模式 + messages.append({ + "role": "user", + "content": prompt + }) + + payload = { + "messages": messages, + # 使用第一段代码中验证可用的模型 + "model": "gemini-2.5-flash-image", + # 可选参数,视 API 支持情况而定 + "stream": False 
+ } + + try: + # 增加超时时间,图片生成通常较慢 + response = requests.post(NANOBANANA_API_URL, headers=headers, json=payload, timeout=120) + + # 检查 HTTP 状态 + if response.status_code != 200: + error_msg = f"API 请求失败: {response.status_code} - {response.text}" + print(error_msg) + gr.Error(error_msg) + return None + + result = response.json() + # Debug: 打印返回结果的前一部分,方便调试 + print(f"API 原始响应 (截取): {str(result)[:200]}...") + + # 提取 Content + content = None + if "choices" in result and len(result["choices"]) > 0: + content = result["choices"][0].get("message", {}).get("content") + + if not content: + gr.Warning("API 返回结果中没有 content 字段") + return None + + # 使用之前验证过的逻辑提取 Base64 + base64_str = extract_base64_from_response(content) + + if base64_str: + output_image = base64_to_image(base64_str) + if output_image: + return output_image + + # 如果没提取到图片,可能是模型拒绝了或只返回了文本 + text_content = str(content) if not isinstance(content, list) else " ".join([str(x) for x in content]) + gr.Info(f"未生成图片,模型返回文本: {text_content[:100]}...") + return None + + except requests.exceptions.Timeout: + gr.Error("请求超时,请稍后重试") + return None + except Exception as e: + import traceback + traceback.print_exc() + gr.Error(f"发生未知错误: {str(e)}") + return None + +# Gradio 界面配置 +with gr.Blocks(title="Nanobanana Image Generator") as app: + gr.Markdown("# 🍌 Nanobanana Text/Image to Image") + gr.Markdown("基于 Gemini-2.5-Flash-Image 模型,支持文生图与图生图。") + + with gr.Row(): + with gr.Column(): + prompt_input = gr.Textbox( + label="提示词 (Prompt)", + placeholder="例如: A cyberpunk cat holding a neon sign...", + lines=3 + ) + image_input = gr.Image( + label="参考图 (可选,用于图生图)", + type="pil", + height=300 + ) + submit_btn = gr.Button("开始生成", variant="primary") + + with gr.Column(): + image_output = gr.Image(label="生成结果", format="png") + + submit_btn.click( + fn=synthesize, + inputs=[prompt_input, image_input], + outputs=image_output + ) + +if __name__ == "__main__": + app.launch(share=True) +``` + +When Trae indicates successful execution, click the 
local link it provides (usually `http://127.0.0.1:7860`). + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image9.png) + +If everything is correct, you will see a working AI drawing interface. + +This interface looks simple, but it already includes two of the most important capabilities in commercial-grade drawing tools: text-to-image and image-to-image. + +* **Left side: instruction area (Input Zone)** - this is where you issue commands. +* **Prompt (prompt box):** Enter your creative description (English is recommended). +* **Input Image (reference image box):** + * **Text-to-image mode:** keep it **empty**. + * **Image-to-image mode:** drag a local image here, and AI will create based on it. +* **Submit button:** click to send instructions and start generation. +* **Right side: display area (Output Zone)** - this is where results appear. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image10.png) + +Now we can try generating your first image. + +The example prompt used here is: + +> **A red apple** + +This is intentionally simplified, without style details or parameter constraints. + +#### Actual Process + +After running the code, the flow can be summarized in three steps: + +1. Send the text description to the model +2. The model generates the corresponding image +3. The image is saved as a local file + +After a few seconds, you will see generated results locally. Because model generation is stochastic, the same prompt can produce different outputs. You can generate multiple times and choose the image you prefer. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image11.png)![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image12.png) + +You can also enrich your prompt with more constraints and descriptions. For example, the prompt below tends to generate a more distinctive result: + +```Plain +"A hyper-realistic close-up of a fresh red apple with water droplets on its skin, sitting on a dark rustic wooden table.
Cinematic dramatic lighting, rim light, shallow depth of field, bokeh background, 8k resolution, macro photography." +(一个超写实的带水珠的新鲜红苹果特写,放在深色粗糙木桌上。电影级戏剧光效,轮廓光,浅景深,背景虚化,8k分辨率,微距摄影。) +``` + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image13.png) + +Click download in the Output Image area to save the image locally. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image14.png) + +### 1.3 Common Material-Generation Scenarios for Image Models + +In real work, large-model image generation is more often used for **efficiently producing design assets**, rather than creating one-off art pieces. + +If you look at high-engagement cases from design marketing accounts, you will find that most outputs are concentrated in two scenarios: + +* **Text-to-image (0 to 1)** +* **Reference-image generation (1 to N)** + +#### 1) Text-to-Image: Quickly Get Design Assets + +This category is about efficiency. When you need to fill visual blanks in design (such as empty states, avatars, and illustrations), AI essentially acts as an **instant stock-image library**. + +1. ##### Generate UI Design Assets + +* Trend: frosted-glass and clay-style 3D icons, common on Dribbble +* Typical appearance: translucent materials, glowing edges, candy-like color palettes for functional or weather icons + +**Example Prompt:** + +> A set of 3D weather icons (sun, cloud, rain), glassmorphism style, frosted glass texture, soft pastel gradient colors, soft studio lighting, isometric view, transparent background, 4k. + +(一套 3D 天气图标,毛玻璃风格,磨砂质感,柔和渐变色,影棚光,等轴视图) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image15.png) + +2. 
##### Generate Logos + +* Trend: minimalist lines and geometric combinations with a tech feel +* Typical appearance: black-and-white color schemes, negative space, clear brand identity + +**Example Prompt:** + +> Minimalist vector logo design for a tech brand "Coffee Code", combining a coffee cup with coding brackets < >, flat design, solid black lines, white background, Paul Rand style, svg. + +(极简矢量 Logo,结合咖啡杯与代码符号,扁平设计,纯黑线条) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image16.png) + +3. ##### Generate Website User Avatars + +* Trend: SaaS websites often use 3D virtual avatars to avoid real-person copyright risk +* Typical appearance: friendly expressions, cartoon proportions, Pixar- or Memoji-like styles + +**Example Prompt:** + +> Close-up portrait of a friendly young tech professional, smiling, Memoji 3D style, clay render, bright colors, soft lighting, solid plain background, Pixar character design. + +(友好的年轻科技从业者,3D Memoji 风格,黏土渲染) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image17.png) + +4. ##### Generate Article Illustrations + +* Trend: abstract flat illustrations commonly used in tech-company blogs +* Typical appearance: purple-blue palettes, exaggerated character proportions, floating UI elements + +**Example Prompt:** + +> Editorial flat illustration representing remote work, a person sitting on a giant globe using a laptop, corporate memphis art style, vibrant colors (purple and teal), vector texture. + +(远程办公主题扁平插画,企业孟菲斯风格) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image18.png) + +#### 2) Reference-Image Generation: Keep Visual Consistency + +This category focuses more on **scalability**. Use it when you already have a satisfactory key visual and need to generate a full set of assets in the same style. + +5. ##### Generate a Similar Set of Buttons or Interaction Assets from a Key Visual + +In game development, UI consistency is very important. 
Suppose you already have a main-screen **"PLAY"** button and now need to expand a full set of function buttons in a unified style (such as pause, settings, home). With pure manual drawing, it is hard to keep gloss, perspective, and color values fully consistent across every button. + +**Basic workflow:** + +1. Save the existing blue "PLAY" button image + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image19.png) + +2. Drag it into the **Input Image** area as the reference master +3. Keep style descriptions in the prompt unchanged and only modify the subject content + +With this flow, you can get different functions in the same style by only changing subject descriptions. + +**Example Prompt:** + +**Variant A: Pause Button (icon type)** + +> A capsule-shaped game UI button with a white pause icon (two vertical bars) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality. + +(胶囊形游戏 UI 按钮,白色暂停图标,蓝色果冻质感) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image20.png) + +**Variant B: Settings Button (complex icon)** + +> A capsule-shaped game UI button with a white gear icon (settings symbol) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality. + +(胶囊形游戏 UI 按钮,白色齿轮图标,蓝色果冻质感) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image21.png) + +**Variant C: Replay Button (shape variation)** + +If you need to change the button shape, describe that shape directly in the prompt. The model will try to change the structure while keeping material characteristics. + +> A round game UI button with a white circular arrow icon (replay symbol) inside. Same glossy blue jelly style, shiny plastic texture, white thick outline, vector illustration, high quality.
+ +(圆形游戏 UI 按钮,循环箭头图标,蓝色果冻质感) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image22.png) + +With this set of operations, you can not only change button function and icon, but also button shape, while keeping high consistency in material, color, and lighting. This is exactly the core value of large models in design-asset scaling scenarios. + +## Chapter 2: A More Controllable Image Generation Assistant - Lovart as an Example + +In the first part, we directly called NanoBanana with code and experienced the basic "input -> generate" flow. This works when requirements are simple. But as tasks include more constraints, for example: + +* multiple images with consistent style +* repeated iteration on existing results +* dynamically adjusting generation direction based on user input + +the one-shot calling pattern gradually becomes insufficient. + +At this point, we need to introduce an **AI Agent**. This section uses **Lovart** as an example to show how the overall workflow changes when image generation gains a "thinking layer." Note: this is not an advertisement. It is only to help everyone quickly grasp the convenience of AI Agents. + +### 2.0 First Look at Lovart: Your AI Design Agent + +Lovart is an agent-based web design tool. Compared with ordinary image generation tools, it adds one extra layer of "thinking and planning" before generation. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image23.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image24.png) + +After entering Lovart, you mainly need to understand the following controls: + +#### Model Selection + +Click the cube icon below the input box to view currently available generation models (such as GPT Image, Flux, etc.). + +To stay consistent with earlier examples, this section still uses NanoBanana as the underlying generation model. 
+ +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image25.png) + +#### Thinking Mode + +This is Lovart's core switch: + +* **Fast Mode (⚡):** close to native API behavior, fast response, suitable for single images with clear instructions +* **Thinking Mode (💡):** agent mode, where AI first decomposes requirements and rewrites prompts, then generates + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image26.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image27.png) + +#### Internet Capability + +After enabling the globe icon, the agent can retrieve online information during generation (for example design trends and color styles) as auxiliary input. + +### 2.1 Why Is Native API Still Not Enough? + +Even if you can already generate good images via Python, native APIs still have limitations in complex tasks. The key reason is that native APIs are fundamentally imperative. If you ask for a concrete object, they can execute directly. But when the input becomes "plan a complete set of game assets," they will not proactively decompose that goal into executable substeps. + +Lovart's core difference is its agent mechanism. Between user input and the image generation model, it adds a logic layer for understanding and planning: first identify user intent, then decompose tasks and rewrite prompts, and only then execute generation. + +### 2.2 Practical Demo: Build a Full IP Sticker Pack in 5 Minutes + +Take **"create an IP sticker pack of a programmer duck"** as an example and look at how the agent participates in the full workflow. + +#### Step 1: Planning (Agent Thinking Capability) + +**Native API issue:** +You need to think through character settings and emotional states yourself, and write separate prompts for every image. + +**Lovart approach:** + +1. Turn on 💡 **Thinking Mode** +2. Input one instruction: + +> 设计一套程序员鸭子的 IP 表情包,风格要扁平化、可爱 + +AI does not draw immediately. 
It first searches online for relevant programmer-duck references, then outputs a decomposed plan, automatically creates scenarios such as Debug, Coffee Break, Panic, and generates multiple visual descriptions. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image28.png)![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image29.png) + +At this step, AI shifts from "executor" to "planner." After AI analyzes the requirement, you can see programmer-duck images with multiple styles and contents on the Lovart canvas and start selecting your preferred style. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image30.png) + +#### Step 2: Consistency (Reference-Based Visual Anchoring) + +In Lovart, images are not only outputs. They are also inputs for follow-up generation. + +##### Full Reference Image + +* Choose your favorite "standard duck" from drafts and click the image on the canvas +* The image automatically appears in the dialogue area as a reference + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image31.png) + +* Input a new action (such as happy) and generate + +The generated result will inherit color palette, proportions, and detail characteristics from the master reference. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image32.png) + +##### Local Reference / Multi-Image Composition + +Besides using full images as references, Lovart also supports: + +* **selecting only local regions** (for example, only reference a hat or expression) + +Click the left tab on the canvas, choose "Mark," and annotate the local region in the target image. That part is automatically synced into the dialogue box. For example, we can change only the background color here. 
+ +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image33.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image34.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image35.png) + +You can see the newly generated image only changes the background color, which matches our requirement. + +* **referencing sub-elements from multiple images** and combining them into a new result + +For example: you can keep the main character from image A, while replacing only the hat with the style from image B. The agent automatically merges these visual constraints in the background. + +Using programmer ducks as an example, we can keep the duck from the first image and replace the subject element in the second image. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image36.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image37.png) + +The final effect is also very strong. You can try other combinations too. + +#### Step 3: Delivery (Agent Tool Calling) + +After generation, you can directly execute operations such as upscale, background removal, and erasing. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image38.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image39.png) + +These are not simple filters. They are results from the agent orchestrating different tools automatically. + +After style direction is confirmed, you can quickly generate a full set of sticker images. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image40.png) + +What we finally get is production-ready assets that can be delivered directly, not just one showcase image. + +### 2.3 Usage and Pricing Notes + +Lovart uses a subscription model. Different plans correspond to different usage quotas and feature permissions. Refer to the official site for specific details. + +This tutorial does not recommend or compare any specific plan. If you need it in actual use, choose paid upgrades based on your own situation. 
+Currently, payment methods include **Alipay** and others. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image41.png) + +#### Summary + +Lovart does not replace underlying models. Instead, through an agent mechanism, it upgrades image generation from "single execution" to a "continuous workflow." + +When tasks involve planning, consistency, and delivery, the advantage of this type of tool becomes very clear. + +## Chapter 3: Build an Intelligent Drawing Assistant by Yourself + +Besides using Lovart directly, we can also implement a simplified drawing assistant ourselves. + +In this chapter, we use "automatic illustration for articles" as an example. Starting from a real problem, we build a minimal practical agent with a thinking layer step by step. + +### 3.1 Pain Point: Why Sending Long Articles Directly to an Image Model Does Not Work + +If you directly send a long article to NanoBanana and ask for illustration, the result is usually not ideal. The issue is not that the model "cannot draw." The issue is that **it is not good at understanding long text**. + +Image generation models are better at short and clear visual descriptions. But when the input becomes an article with structure, key points, and contextual relationships, the model cannot determine which parts should be represented visually. This often causes off-topic images, or results that capture only scattered details without overall summarization. + +In essence, image models have "execution" capability but lack an analysis-and-selection process for long text. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image42.png) + +### 3.2 Solution: Use an Agent to Split "Understanding" and "Execution" + +To solve this, the key is not a more complicated prompt. The key is **to think clearly before drawing**. So we introduce an independent "thinking layer" into the generation flow, and use it to build the simplest practical agent. 
+ +This agent has only one core objective: **make the final generated image match the user's true intent as closely as possible.** + +The full flow can be summarized as: +**long-text input -> language-model understanding and intent judgment -> generation of suitable visual prompt -> image-model execution -> output image** + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image43.png) + +How can our agent understand user intent? + +Here we use a simplified **thinking layer** with three intents: invalid input, direct drawing instruction, and long text that needs understanding. + +In this agent, role division can be summarized in four points: + +1. **Language model as decision core** + It understands article content, judges user intent, routes tasks to suitable generation paths, and decides "what to do next" and how to generate visual prompts. +2. **Image model as executor** + The image model does not do understanding or intent judgment. It only receives prepared visual instructions and focuses on rendering. +3. **User as interactive guide** + Besides entering text directly, users can manually adjust generated prompts or add reference images to guide and fine-tune final results. +4. **Gradio and backend APIs as application carrier** + They connect UI, model invocation, and result display to ensure the full agent can run stably as a complete web app. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image44.png) + +### 3.3 Practical Preparation: Obtain APIs + +Looks fun, right? To run the full flow above, we only need two types of APIs. + +#### Hand: NanoBanana API (Image Generation) + +Directly reuse the API Key and API URL already configured in Chapter 1. No additional setup is required. + +#### Brain: SiliconFlow API (Text Thinking) + +We need a large language model to handle the "thinking layer." 
This tutorial uses model services provided by SiliconFlow: +[https://cloud.siliconflow.cn](https://cloud.siliconflow.cn/) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image45.png) + +SiliconFlow provides interfaces compatible with OpenAI API conventions, so it can be called conveniently via standard network requests. Here we use the free `Qwen2.5-7B-Instruct` model. Everything needed for invocation is already included in the prompt below. Before you start, you only need to register an account and create an API Key on the official site. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image46.png) + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image47.png) + +This key will be used for later model calls. + +### 3.4 Build the Agent + +In this experiment we mainly use Trae to help write code. The tutorial uses `Gemini-3-Pro-Preview`. The overall approach is: create a new project, copy the full prompt below into the dialogue box, replace API keys step by step, run code, and complete testing. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image48.png) + +#### Step 1️⃣: Gradio Blocks Base Framework and UI Layout + +In this step, our main goal is to build the "appearance" of the whole agent first and complete the front-end page design. Copy the prompt below into Trae. After implementation, you will get a local URL (usually `http://127.0.0.1:7860`) to view the interface and verify the result.
+ +```Plain +板块 1:Gradio Blocks 基础框架与界面布局 +1、任务目标 +·基于 Gradio 4.0.0+ 的 Blocks 布局,实现「LLM+Nanobanana 文生图」项目的基础界面,严格遵循固定左右分栏布局,初始化所有 UI 组件并设置正确的初始状态。 + +2、技术栈要求 +·必须使用 Gradio 4.0.0+ 的 Blocks 模式开发,禁止使用 Interface 模式; +·依赖:gradio>=4.0.0,pillow>=10.0.0(仅导入,暂不实现图片处理逻辑); +·代码需是完整可运行的 Python 文件,包含所有必要的导入语句。 + +3、界面布局规则(核心约束,融合实战细节) +·整体布局: +页面标题:LLM 驱动的文生图全流程工具; +固定左右分栏:左侧占 60% 宽度,右侧占 40% 宽度,使用 gr.Row 和 gr.Column 实现比例控制。 +·左侧 60%(提示词生成流程区)组件清单: +input_text:gr.Textbox,标签「输入文本(教程段落 / 绘图指令)」,lines=6,占位符「请输入需要配图的教程文本或直接绘图指令...」; +identify_intent_btn:gr.Button,value="识别意图",初始状态正常可点击; +intent_status:gr.Textbox,标签「意图类型 / 处理状态」,lines=2,interactive=False,初始值「未识别意图」; +system_prompt:gr.Textbox,标签「System Prompt(仅文章配图意图可编辑)」,lines=4,interactive=False,占位符「LLM 生成提示词的约束规则...」; +confirm_prompt_btn:gr.Button,value="确认生成生图提示词",interactive=False(初始禁用防误触); +generation_prompt:gr.Textbox,标签「生图提示词(可编辑)」,lines=3,interactive=True,初始值为空,占位符「生成的英文生图提示词将显示在此,支持手动修改...」。 +·右侧 40%(Nanobanana 生图功能区)组件清单: +ref_image:gr.Image,标签「参考图(可选,图生图)」,type=filepath,height=300,允许上传; +generate_btn:gr.Button,value="生成图片",interactive=False(初始禁用,无提示词不可点击); +result_image:gr.Image,标签「生成结果」,type=pil,height=300,初始为空,interactive=False。 + +4、交互逻辑要求 +·所有组件的 interactive 初始状态严格按上述配置,后续通过函数动态更新; +·按钮禁用状态需直观(置灰),避免用户误操作。 + +5、输出要求 +·生成完整的 Python 代码,仅实现界面布局和组件初始化,不包含任何业务逻辑; +·代码注释清晰,组件命名与实战版一致(input_text/identify_intent_btn 等); +·代码可直接运行,界面结构与描述完全一致。 +``` + +After opening `http://127.0.0.1:7860` in the browser, you can see Trae generated the page according to requirements. It is generally aligned, and we can move on to the next step. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image49.png) + +#### Step 2️⃣: LLM Intent Recognition Module (SiliconFlow API) + +When using VLMs for drawing in daily work, there are usually three common input cases: + +1. Meaningless content, such as "hello" or "have you eaten today," which cannot map to drawable requirements. +2. 
Articles/long text, such as a structured paragraph around 200 words, where you must first understand structure/content before generating an image that summarizes the text. +3. Direct drawing instructions, such as "draw a dog taking a bath," where requirements are already specific enough for immediate generation. + +As before, copy the prompt below into Trae and add the API obtained in earlier steps. + +```Plain +板块 2:LLM 意图识别模块(Siliconflow API) +1、任务目标 +在已实现的 Gradio 界面基础上,为「识别意图」按钮添加点击逻辑,调用 Siliconflow API 完成意图识别,并联动组件状态。 + +2、技术栈要求 +基于 Gradio 4.0.0+ Blocks; +依赖:requests>=2.31.0,openai; +输出完整可运行 Python 文件,包含板块 1 界面 + 本模块逻辑。 + +3、核心业务规则(绝对不可偏离) +·意图分类规则(仅 3 类,严格返回数字 + 描述) +1 = 无意义内容:仅闲聊、寒暄、无关对话,没有任何绘图或配图需求(如 “你好”“今天吃了吗”); +2 = 文章 / 长文本配图需求:用户输入一段完整文章、教程、段落、说明性文字,内容偏叙事 / 说明 / 讲解,隐含需要为这段内容生成配图的意图,不需要用户明确说 “为这段文字配图”; +3 = 直接绘图指令:用户输入简短、明确的画图命令,没有长文本背景,直接要求画某个内容(如 “画一只 Apple 风格的猫”)。 +·LLM 调用约束(融合实战版模板) +接口地址:https://api.siliconflow.cn/v1/chat/completions; +模型:Qwen/Qwen2.5-7B-Instruct; +temperature=0.1; +统一定义代码: +python +运行 +LLM_BASE_URL = "https://api.siliconflow.cn/v1" +LLM_API_KEY = "" # 用户自行替换 +LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"# 实战验证的意图识别模板(固化到代码中) +INTENT_PROMPT_TEMPLATE = """你需要识别用户输入文本的意图,仅返回以下 3 类结果中的一种(格式:数字 + 中文描述): +1 = 无意义内容;2 = 文章 / 长文本配图需求;3 = 直接绘图指令。 + +用户输入:{user_input} + +识别结果: +仅提取返回结果中的数字和描述,禁止额外内容。""" + +4、组件联动规则 +·结果为 1:intent_status 显示「1 = 无意义内容:无绘图需求」,system_prompt 保持禁用,confirm_prompt_btn 禁用; +·结果为 2:intent_status 显示「2 = 文章 / 长文本配图需求:为输入内容生成配图」,启用 system_prompt 并填充默认规则,激活 confirm_prompt_btn; +·结果为 3:intent_status 显示「3 = 直接绘图指令:根据指令生成图片」,system_prompt 禁用且填充默认规则,激活 confirm_prompt_btn。 + +5、异常处理 +API 异常、解析异常均给出友好提示,不崩溃,组件恢复初始状态。 + +6、输出要求 +生成完整可运行代码,替换 LLM_API_KEY 即可使用,逻辑清晰注释完整,意图识别模板严格使用实战版。 +``` + +Refresh `http://127.0.0.1:7860` and test whether it correctly detects all three cases. + +1. Meaningless content: try inputting "你好", "谢谢", and so on. It should be recognized correctly. 
+ +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image50.png) + +2. Article/long text: here we use a paragraph about AI generated by Doubao. You can also test with your own paper paragraph. + +```Plain +人工智能正在以前所未有的深度和广度重塑教育生态系统。通过自适应学习算法,AI系统能够构建每个学生的认知图谱,实时追踪他们的知识掌握轨迹,并动态调整教学内容的难度和呈现方式。在传统课堂环境中,教师往往难以同时满足不同学习风格和能力水平的学生需求,而基于深度学习的教育平台可以分析学生在交互式模拟实验中的行为模式,识别他们在量子力学或微积分等复杂概念理解上的微妙障碍,并提供精准的认知支架。 + +高级自然语言处理引擎驱动的虚拟导师不仅能够解构开放性问题,如"如何评价法国大革命对现代民主制度的影响",还能引导苏格拉底式对话,激发批判性思维。当学生撰写关于气候变化对极地生态系统影响的论文时,AI写作助手可以分析其论证逻辑的严密性,指出数据引用中的时效性问题,并建议更精准的科学术语。在特殊教育领域,计算机视觉技术使AI能够识别自闭症谱系儿童在社交互动中的非语言线索,调整干预策略,而情感计算算法则帮助检测在线学习时的挫折感,及时提供鼓励性反馈。 + +然而,这种技术融合引发了一系列伦理困境。算法偏见可能无意中边缘化特定文化背景的学生,数据采集的透明度问题引发了对学术隐私的关切,而过度依赖自动化评分系统可能削弱教师对学生思维过程的深层理解。更复杂的是,当AI开始生成高度逼真的虚拟实验室体验时,我们需要重新定义"实践经验"在教育中的价值。未来教育的范式可能演变为人类教师专注于培养创造力、同理心和道德判断力,而AI系统则承担知识传递、技能训练和个性化评估的职能,形成一种协同进化的教育共生体,既能发挥机器的计算优势,又能保留人类教育的独特温度. +``` + +This is also detected successfully. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image51.png) + +3. Direct drawing instruction: here we input "我要画一只猫", and it is also correctly detected. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image52.png) + +At this point, we have successfully completed step 2: intent recognition. + +#### Step 3️⃣: Prompt Generation Module (Second LLM Call) + +After intent recognition, for articles or long text there is one more crucial step: generating the drawing prompt. This is exactly the core of this agent. 
+ +```SQL +板块 3:生图提示词生成模块(LLM 二次调用) +1、任务目标 +在意图识别基础上,实现「确认生成生图提示词」按钮逻辑,调用 LLM 将文本优化为适合绘图的英文视觉提示词,填充到编辑框并联动「生成图片」按钮。 + +2、技术栈要求 +同板块 2,输出完整代码 = 板块 1 + 板块 2 + 本模块; +共用板块 2 定义的 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL,不新增密钥。 + +3、核心业务规则(融合实战版 Prompt 组装逻辑) +·提示词生成输入规则(必须严格遵循) +生图提示词生成不再是简单字符串拼接,而是构建标准 Chat 消息列表,代码结构如下: +python +运行 +messages=[# System角色:网页上用户最终确认/编辑后的system_prompt内容{"role": "system", "content": final_system_prompt},# User角色:承载待处理数据,明确任务目标{"role": "user", "content": f"请为以下内容生成视觉提示词:\n\n{user_input}"}] +意图为 2 时:System 内容取用户编辑后的 system_prompt 最终版本; +意图为 3 时:System 内容取禁用状态下填充的默认规则 +user_input 为用户最初输入到 input_text 框的原始文本。 +·实战验证的 System Prompt 预设(固化到代码中) +python +运行 +SYSTEM_PROMPT_DEFAULT = """你现在是一个创建NanoBanana画图提示词的助手。 +需要根据我的内容处理,我这个图片的作用是能说明这一段在说什么,并且让大家知道这段话的上下结构就是整体说的是什么意思。 +里面可能会类似PPT有一些讲解(如:左上角展示核心观点,右下角展示数据)。 +设计风格要求:简约,Apple设计思维(Apple Design Philosophy)。 +约束:请直接返回NanoBanana可用的英文提示词,不要返回任何解释、前缀或多余的废话。""" +·LLM 调用约束 +与板块 2 共用同一套 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL; +temperature=0.7(保证提示词的创意性与适配性); +max_tokens=200(限制输出长度,匹配提示词约束); +严格使用上述标准 Chat 消息列表结构,禁止字符串拼接。 +·示例输入输出(核心参考) +输入示例 1(文章配图意图):原始文本:「AI 如何改变教育:随着人工智能技术的发展,教师的角色从知识传授者转变为引导者,AI 助手可辅助学生完成个性化学习,课堂上人机协作成为常态。」最终 System Prompt:SYSTEM_PROMPT_DEFAULT(未修改)输出预期:"Minimalist illustration, Apple Design Philosophy, 1024x1024. Top left shows 'AI + Education' core concept, bottom right shows data of teacher-student-AI collaboration, soft color palette, clean lines, no redundant elements." +输入示例 2(直接绘图指令):原始文本:「画一只 Apple 风格的猫,坐在 MacBook 旁边」最终 System Prompt:SYSTEM_PROMPT_DEFAULT(禁用状态)输出预期:"Minimalist cat, Apple style, 1024x1024, sitting next to a silver MacBook, clean white background, soft shadows, geometric shapes, no extra details." 
+·提示词输出强制约束 +纯英文,无中文; +必须包含 Apple Design Philosophy/Apple style + 1024x1024; +长度 50–200 字符,代码内校验; +无额外解释、前缀或废话,仅返回提示词本身。 + +4、组件联动规则 +生成成功:将提示词填入 generation_prompt 框,激活 generate_btn,intent_status 追加「提示词生成成功,可修改后生成图片」; +生成失败:提示具体原因(如 API 调用失败、长度不达标),generate_btn 保持禁用,generation_prompt 框为空; +用户手动修改 / 清空 generation_prompt 框: +清空时自动禁用 generate_btn; +非空时保持 generate_btn 激活。 + +5、异常处理 +API 调用失败:友好提示「提示词生成失败:{具体错误信息}」,不崩溃; +提示词校验失败:明确提示原因(如 “未包含 Apple style”“长度仅 40 字符”),允许重试; +响应解析失败:提示「无法解析 LLM 返回结果,请重试」。 + +6、输出要求 +完整可运行代码,替换 LLM_API_KEY 即可使用; +代码结构清晰、注释完善,界面美观简洁; +严格实现标准 Chat 消息列表结构,参数与示例逻辑一致; +包含提示词长度、内容校验逻辑,错误提示友好。 +``` + +Use the same long text from step 2 for testing. + +It is worth noting that the default System Prompt we preset for prompt generation is: + +> 你现在是一个创建NanoBanana画图提示词的助手。 +> 需要根据我的内容处理,我这个图片的作用是能说明这一段在说什么,并且让大家知道这段话的上下结构就是整体说的是什么意思。 +> 里面可能会类似PPT有一些讲解(如:左上角展示核心观点,右下角展示数据)。 +> 设计风格要求:简约,Apple设计思维(Apple Design Philosophy)。 +> 约束:请直接返回NanoBanana可用的英文提示词,不要返回任何解释、前缀或多余的废话。 + +If you want to switch to other preset templates, you can modify the earlier prompt or directly modify it through Trae dialogue. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image53.png) + +Besides changing underlying code, we can also edit quickly on the webpage. For example, I added one line, "add 'Pic Prompt' at the beginning." You can see the new generated prompt also starts with it. This design is for quickly adjusting the system prompt for generation, so we can switch styles fast. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image54.png) + +#### Step 4️⃣: NanoBanana Text-to-Image / Image-to-Image Module + +Finally we are at the last step. Without connecting an image model, it is not a complete agent. 
+ +```Bash +板块 4:Nanobanana 文生图 / 图生图模块(最终版) +1、任务目标 +实现「生成图片」按钮逻辑,调用真实 Nanobanana API,支持文生图 / 图生图,解析 Base64 并展示图片。 + +2、技术栈要求 +基于 Gradio 4.0.0+ Blocks; +依赖:requests, pillow, base64, io, re; +完整代码 = 板块 1+2+3 + 本模块。 + +3、核心 API 配置(实战验证固化) +固化代码配置: +python +运行 +# 固化到代码中的API配置 +NANOBANANA_API_URL = "https://api.zyai.online/v1/chat/completions" +NANOBANANA_MODEL = "gemini-2.5-flash-image" +NANOBANANA_API_KEY = "" # 用户自行替换 +鉴权方式:Header Authorization: Bearer {NANOBANANA_API_KEY}。 + +4、图片预处理要求(必须实现)实现函数 image_to_base64_data_uri (ref_image_path),核心逻辑: +将 PIL 图片转为 PNG 格式; +自动缩放到 1024x1024 分辨率; +透明通道转为白色背景; +编码为 Base64,返回格式:data:image/png;base64,...。 + +5、请求构建规则(严格按实战版分支逻辑) +·核心函数定义实现函数 generate_image (prompt, ref_image_path): +入参:prompt(generation_prompt 框内容)、ref_image_path(ref_image 上传的文件路径); +返回:PIL Image(展示到 result_image)或错误提示。 +·逻辑分支 1:纯文生图(ref_image_path 为空) +python +运行 +messages = [{"role": "user", "content": prompt}] +·逻辑分支 2:图生图(ref_image_path 有值) +python +运行 +# 先调用图片预处理函数 +image_base64 = image_to_base64_data_uri(ref_image_path) +messages = [{"role": "user","content": [{"type": "text", "text": prompt},{"type": "image_url", "image_url": {"url": image_base64}}]}] + +6、响应解析要求(必须兼容两种格式)从 choices [0].message.content 中提取图片 Base64,支持: +结构化 JSON 返回的 image_url 字段; +Markdown 格式 +![image](data:image/png;base64,...); +统一提取 Base64 编码,解码后转换为 PIL Image 返回。 + +7、组件联动与异常处理 +生成成功:将 PIL Image 展示到 result_image,intent_status 提示「图片生成成功」; +生成 / 解析 / 上传失败:在 intent_status 显示清晰文字提示(如 “Base64 解析失败”“API 调用超时”),不崩溃。 + +8、输出要求 +完整可运行代码,替换 LLM_API_KEY 和 NANOBANANA_API_KEY 即可直接运行,全流程可用,分支逻辑严格匹配实战版。 +``` + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image55.png) + +So exciting. We finally generated the first image of this agent. Looking closely, the generated image matches both our text and prompt. At this point, you have basically implemented your own agent. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image56.png) + +We also added image-to-image.
Upload an image you like, and AI will automatically borrow style cues. + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image57.png) + +It is also worth mentioning that prompts generated in earlier steps can be edited directly on the webpage, and generation always uses the final prompt at click time. Even if I change it here to "a cute cat," the final output will be just a cute kitten. + +## Chapter 4: Summary + +![](/zh-cn/stage-2/frontend/2.0-lovart-assets/images/image58.png) + +**Whew, finally finished.** +Honestly, when I finished the last line, I let out a deep breath myself, and you have followed the whole path to get here. Running through this full workflow is already impressive by itself. It means you really put your hands on the keyboard and completed things step by step. Bravo. + +During the writing of this tutorial, I kept asking what we really want to leave behind. The answer is not model names, parameter values, or fixed tricks. It is helping you gradually build a feel for division of labor: what AI can safely understand and plan for you, and where you only need to decide direction. Once this division is established, many workflows that once looked complex start becoming smooth. + +Looking back, this path is not actually complicated. Clarify the problem you want to solve, let a language model decompose long text, then pass organized visual intent to an image model for rendering, and finally package the full process into your own assistant. At that point, you are no longer simply "using models." You are building a system that can work with you over the long term. That is exactly what this tutorial most wants to deliver. + +You have already done a great job. If you have made it this far, you already have a solid initial grasp of Vibe Coding. Give yourself a short break.
+ + diff --git a/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md b/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md new file mode 100644 index 0000000..428ea1e --- /dev/null +++ b/docs/en/stage-2/frontend/2.1-figma-mastergo/index.md @@ -0,0 +1,307 @@ +# Figma and MasterGo Basics + +::: tip Core Question +**How do you start using modern design tools from scratch to build web prototypes?** +::: + +--- + +## 1. Why learn frontend design tools? + +Before we begin, we need to answer a simple question: why bother learning frontend design tools at all? If you can already build pages with HTML and CSS, is it really necessary to learn one more tool? + +In practice, "making a page run" and "designing a good product" are two different things. Code focuses on how something renders in the browser and how it behaves across devices. Design tools focus on how information is arranged, how interactions are sequenced, and how visual priority is communicated. With a single canvas, you can compare layout, information hierarchy, and interaction patterns on one screen before writing code. + +If you jump straight into implementation or ask AI to generate a full frontend page immediately, the user experience is often rough. Serious products think carefully about comfort, hierarchy, and communication across different screens. A better workflow is to arrange the interface first from the user's perspective, then convert or generate the code. + +From a collaboration standpoint, design tools also reduce coordination cost. Designers, product managers, and developers no longer need to imagine the same screen from vague explanations or abstract code. Everyone can discuss versioning, requirement changes, and feedback around a visible, annotatable, iterative canvas. Modern design tools are no longer just drawing software either. 
They can generate part of the code, manage design systems and component libraries, and automate repetitive work such as alignment, annotation, exporting, and style changes. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image8.png) + +### 1.1 The evolution of frontend design tools + +Frontend design tools are the result of a long evolution. In the 1990s, Photoshop dominated with local bitmap editing. Around 2010, Sketch introduced vector-first, component-oriented workflows. After 2016, Figma pushed collaboration into the cloud and turned solo design work into real-time teamwork. By 2025, AI had become a practical part of these tools, from "generate a draft from one sentence" to "turn a design into runnable frontend structure." "Design as code" and "human-AI co-creation" are no longer just slogans. + +In this chapter, we will focus on two representative modern design tools: Figma and MasterGo. They both cover the core abilities needed for modern UI and UX work, including vector editing, component systems, auto layout, and developer handoff. They have also both added practical AI features that help turn a prototype into a runnable interface without changing the overall design intent. + +### 1.2 How this toolchain emerged + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image9.png) + +Before dedicated interface tools existed, UI design was largely handled by "general-purpose" design tools such as Photoshop. Designers built entire interfaces locally using layered PSD files, then handed those heavy source files to frontend engineers. To recreate the design accurately, frontend engineers had to do three tedious but essential jobs manually. + +The first was **asset slicing**: extracting buttons, icons, logos, backgrounds, and other visual elements one by one from a PSD file, then exporting them as PNG or JPG files the web could actually load.
+ +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image10.png) + +The second was **measuring dimensions**: manually checking widths, heights, and spacing between elements to ensure everything matched the design pixel by pixel. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image11.png) + +The third was **reading annotations by hand**: pulling out the "invisible but required" design parameters such as font size, font weight, line height, RGB or HEX colors, shadows, and so on. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image12.png) + +Only after that did actual frontend implementation begin. Whether the stack is plain HTML/CSS/JS or frameworks like Vue and React, the core process is similar. The frontend rebuilds the page around containers, based on the hierarchy and semantics of the design. A container is a layout boundary that organizes child elements without directly being the final content itself. Structural blocks such as top navbars, sidebars, article lists, and footers rely on containers; inside each block, smaller containers arrange finer elements such as titles, descriptions, timestamps, or thumbnails. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image13.png) + +In modern frontend frameworks, these structural blocks are typically implemented as **components**. A component is a reusable interface unit with clear boundaries. It includes both layout containers and interaction logic. Any repeated piece of design, such as a consistent button style or a reusable article card, can be abstracted into a component so it can be reused across different pages while keeping layout and styling consistent. + +The styling layer then restores the visual appearance. Exported image assets become `<img>` tags or background images. Measured dimensions become CSS properties such as `width`, `height`, `margin`, `padding`, and `line-height`.
Typography, color, shadow, border radius, and hover or active states become CSS, CSS Modules, CSS-in-JS, or Tailwind rules. At this point, exported assets and annotations provide the visual parameters, while components and structural blocks provide the code organization that makes the interface maintainable and reusable. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image14.png) + +But the local-file workflow was fundamentally inefficient. Versions were sent through email or cloud drives, old and new drafts were easy to confuse, and collaboration required a lot of manual coordination. + +As mobile interfaces became more complex and iteration speed increased, Photoshop's "do everything" model became too heavy. Sketch appeared in this phase. It focused on UI work itself, introduced Symbols for highly reusable elements such as buttons and form controls, and paired well with tools like Zeplin for automatic annotations and style snippets. Sketch brought component thinking into design workflows. Still, it remained a desktop tool built around local files, so real-time collaboration never became native. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image15.png) + +Figma truly changed the game. Starting in 2016, it unified UI design, prototyping, comments, and version history in the browser, with multi-user cursors, online comments, timeline history, and shareable links. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image16.png) + +From that point on, interface design was no longer scattered across separate machines. It became a shared online canvas that updated in real time. Once that happened, the boundary between design and frontend code became easier to blur through automation and AI. + +At first, plugins could only semi-automatically export components and style information into code snippets such as React or Vue skeletons and CSS variables. 
Later, design platforms began to support MCP, the Model Context Protocol, which gives language models a standard, controlled way to access design files, plugin interfaces, and project metadata. That makes exporting designs into code much more direct. + +The next step after plugins and MCP is native design-to-code generation. Today, some tools can generate project skeletons, component hierarchies, style systems, and real code directly from a design. That frees designers and frontend engineers from manually transferring details and gives them more time to focus on user experience and feature iteration. + +--- + +## 2. Figma basics + +Now let's move from concepts to hands-on work. Because of time, we will only cover Figma's core interaction model. The goal is simple: even if you have never used a design tool before, you should be able to follow along and complete the exercise. If you want a more complete walkthrough, you can study Figma's official beginner documentation: + +https://help.figma.com/hc/en-us/sections/30880632542743-Figma-Design-for-beginners + +You can also look at Figma's site-building examples: + +https://help.figma.com/hc/en-us/sections/35895585621655-Figma-Sites-collection + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image17.png) + +On the left is project creation and resource management. In the top-right area, you will see several common entry points. `Make` lets AI generate a rough interface draft from one sentence. `Design` is the main workspace where you build app and web interfaces, components, and prototypes. `FigJam` works like a team whiteboard for notes, flows, and early discussions. `Buzz` is for brand-scale asset production. `Site` is for publishing designs as accessible websites or documentation pages. + +At first glance, Figma looks complex. But tools like this become familiar through repetition. You do not need to be afraid of making mistakes, and you do not need to get everything right on the first try. 
The key is to start playing with it. + +In this tutorial, we will focus on the `Design` workspace. + +### 2.1 Create a new Design file + +From the homepage or the top-right entry, choose **Design** to create a new file. You will enter a blank canvas. + +This interface is roughly divided into four areas: + +- The left side shows pages and layers so you can inspect the structure of the page and the hierarchy of elements. +- The middle area is the canvas where you view and arrange the current design. +- The right side is the properties panel where you change shape, color, and style details. +- The toolbar lets you switch between selection, shapes, text, comments, and plugins. After selecting a tool, you can press `Esc` to return to the default pointer. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image18.png) + +### 2.2 Create your first Frame + +Before placing elements, we need a clear page boundary. In Figma, that boundary is handled by a Frame. You can select the Frame tool or press `F`, then drag out a rectangular region on the canvas. + +1. Use the Frame tool in the toolbar or press `F`. +2. Drag a rectangle on the canvas and set its width to something like `1440` and height to `900` in the right-side panel. +3. Rename the Frame in the layer list to something like `My First Page` or your project name. + +This Frame becomes the container for one complete screen. Your title, text, buttons, and images should all live inside it instead of floating freely on the canvas. Working inside a Frame helps later with scrolling, responsiveness, exporting, and prototyping. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image19.png) + +### 2.3 Add text and basic elements inside the Frame + +Now that we have a container, let's place the most basic interface elements: a title, subtitle, button, and placeholder image block. + +1. Choose the text tool (`T`) and click inside the Frame to add a title such as `My Portfolio`.
Increase the font size and weight in the right panel. +2. Add one line of supporting text under the title. Use a smaller font size and slightly larger line height so it reads more comfortably. +3. Sketch out a button: + Use the rectangle tool to draw something around `200 x 48`, give it a noticeable fill color, and add some border radius. + ![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image20.png) +4. Add button text on top, such as `Get Started`, then select both the rectangle and the text and align them horizontally and vertically. +5. Add a larger light-gray rectangle beside or below the button as a placeholder image area. + +At this point, you already have a very rough but structurally complete homepage draft: a title, a piece of body text, a button, and a main display area. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image21.png) + +### 2.4 Use Auto Layout to organize elements + +If all elements are positioned manually, the page becomes messy very quickly. One of Figma's most important concepts is **Auto Layout**, which turns a group of elements into a rule-based container. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image22.png) + +Select the main title, subtitle, and button together, then click **Add Auto layout** in the right panel. + +Those elements are now wrapped inside a container, and you can adjust several useful properties: + +- Whether the elements are arranged vertically or horizontally +- The spacing between elements +- The padding between the content block and the edge of the container + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image23.png) + +You can use Auto Layout inside the button as well. That gives you a button whose width adjusts automatically when the text changes. + +Select the button background and button text, add Auto Layout, and turn them into a button container. Then set both width and height to **Hug contents**. 
Once you do that, the text stays centered and the button width grows or shrinks with the text. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image24.png) + +### 2.5 Turn the button into a reusable component + +Now let's learn another important concept: components. A component is an element designed for repeated reuse. Buttons are a perfect example. + +Starting from the button that already has Auto Layout: + +1. Select the entire button container. +2. Right-click and choose **Create component**. + ![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image25.png) + +The button is now promoted from a set of ordinary layers to a component master. When you need the same button style somewhere else, you can drag it out from the Assets panel. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image26.png) + +Every inserted button is now a synchronized instance of that master. If you later change the master's color, corner radius, or spacing, all instances update together. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image27.png) + +At this point, you already understand the basic usage of Figma. You do not need to master every function on day one. Just build your first simple page, get comfortable with the core operations above, and explore more capabilities over time. + +--- + +## 3. MasterGo basics + +Once you understand the basic Figma workflow, MasterGo is much easier to approach. You can think of MasterGo as a China-focused counterpart to Figma with a few differences in product behavior. Overall, it follows a very similar layout and interaction model: canvas, layer tree, property panel, components, styles, auto layout, and multi-person collaboration. 
For more detail, you can refer to the official MasterGo tutorial: + +https://mastergo.com/tutorials/12?%E5%85%A8%E7%A8%8B%E9%AB%98%E8%83%BD%EF%BC%8CMasterGo%20%E6%9C%80%E5%AE%8C%E6%95%B4%E5%AE%9E%E7%94%A8%E6%95%99%E7%A8%8B%EF%BC%8C%E8%AE%A9%E4%BD%A0%E4%BB%8E%E9%9B%B6%E5%88%B0%E7%B2%BE%E9%80%9A%EF%BC%81 + +### 3.1 Create a new design file + +1. **Enter the MasterGo workspace** + 1. Open the MasterGo website and sign in. + 2. After entering, you will see a homepage similar to a file list or project list, where your design files are managed. + ![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image28.png) + +2. **Create a new file** + 1. Click the `+ Design File` button in the top-right corner, or choose to import files such as Figma files. + 2. After clicking, you will enter a blank canvas, which is MasterGo's design workspace. + +3. **Understand the major interface regions** + Once you know Figma, MasterGo feels very similar. The main areas are: + + ![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image29.png) + 1. The top toolbar: file location and name on the left, common tool buttons in the middle, and online collaborators, sharing, zoom, and preview controls on the right. + 2. The left panel: layers and assets, including the page list and the structure of the current page. + 3. The central canvas: the workspace where Frames, components, and graphics are actually placed and arranged. + 4. The right properties panel: used to inspect and edit the selected object's size, position, alignment, fill, stroke, border radius, and more. If nothing is selected, it shows canvas-level settings. + +### 3.2 Create your first Frame + +Before placing content, we need a page container to define the boundary and size of the interface. In MasterGo, this is usually called a Frame. + +**Steps** + +1. **Choose the Frame tool** + 1. Find the Frame or Artboard tool in the toolbar. + 2. Or use the keyboard shortcut, usually `F` depending on the current UI. +2. 
**Drag out a rectangular area on the canvas** + 1. Once you drag it out, you will see a selected region. + 2. The right properties panel will show its width and height. + 3. Change the width to something like `1440` and the height to `900`. +3. **Rename the Frame** + 1. Find the Frame in the layer panel. + 2. Double-click the name and rename it to something like `My First Page`. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image30.png) + +### 3.3 Build content on the artboard + +Once you have a container, you can build a similar page using the same ideas we already used in Figma. You can even try copying text elements from the Figma artboard directly into MasterGo. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image31.png) + +One thing worth noting is that Auto Layout behaves a little differently. In MasterGo, if you want button width to expand or shrink with the text, you first need to create a container or component around the rectangle element, as shown below: + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image32.png) + +After creating the container, put the button background and text into that shared container, then enable Auto Layout from the right-side panel. That lets the button width respond to the text length successfully. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image33.png) + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image34.png) + +### 3.4 AI-generated pages + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image35.png) + +One especially interesting feature in MasterGo is AI page generation. You can enter a sentence or provide a reference image, and MasterGo can generate editable components and code for you. You can write the prompt in either Chinese or English. The system will return a clearly structured page draft based on your request. 
+ +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image36.png) + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image37.png) + +Once the design document is generated, click to start generation and wait briefly for the rendered result: + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image38.png) + +At this point, you have two options: + +- Click the blue button to insert the generated result directly into the canvas +- Open the code preview and get the code for the full current page + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image39.png) + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image40.png) + +After inserting the result into the canvas, you can further refine the overall layout and element details such as typography, colors, and spacing until the final result matches your expectations. + +![](/zh-cn/stage-2/frontend/2.1-figma-mastergo/images/image41.png) + +--- + +## 4. Next step: from prototype to code + +In this chapter, you learned the basic operations of both Figma and MasterGo and created structurally complete interface prototypes. The next key question is: + +**How do you convert these design drafts into frontend code that actually runs in the browser?** + +::: tip Next Tutorial +For the detailed workflow, continue with [From Design Prototype to Project Code](../2.6-design-to-code/). You will learn: + +- **Direct multimodal AI conversion**: send screenshots of your design to AI and generate HTML or React code directly +- **Figma Make**: use Figma's official AI tooling to recreate a design precisely and export code +- **MasterGo AI**: generate editable pages and retrieve code in one step + +Each method has strengths and trade-offs, so choose the workflow that fits your project. +::: + +--- + +## 5. Summary + +After finishing this chapter, you should now understand: + +1. **Why frontend design tools matter**: They solve problems around information layout and team collaboration, not just visual output. +2. 
**Basic Figma operations**: + - Creating Design files and Frame artboards + - Adding text, shapes, and other basic elements + - Using Auto Layout for adaptive layouts + - Creating reusable component systems +3. **Basic MasterGo operations**: + - Understanding an interface layout similar to Figma + - Creating Frames and basic artboard content + - Using AI page generation to prototype faster + +::: tip Next Step +Now that you know the basics of modern frontend design tools, you can try: + +- Designing a personal portfolio page for yourself +- Designing prototypes for your next project +- Continuing to [From Design Prototype to Project Code](../2.6-design-to-code/) to turn designs into runnable code + +If you are working through the [Let's Build Hogwarts Portraits](../2.5-hogwarts-portraits/) project, you can start by designing the interface prototype, then export code and combine it with AI conversation features. +::: diff --git a/docs/en/stage-2/frontend/2.2-ui-design/index.md b/docs/en/stage-2/frontend/2.2-ui-design/index.md new file mode 100644 index 0000000..73f2cd0 --- /dev/null +++ b/docs/en/stage-2/frontend/2.2-ui-design/index.md @@ -0,0 +1,3 @@ +# Build Your First Modern Application - UI Design + +> This chapter is currently being written. Stay tuned... diff --git a/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md b/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md new file mode 100644 index 0000000..e841b69 --- /dev/null +++ b/docs/en/stage-2/frontend/2.3-multi-product-ui/index.md @@ -0,0 +1,3 @@ +# Reference UI Design Specifications and Multi-Product UI Design + +> This chapter is currently being written. Stay tuned... 
diff --git a/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md b/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md new file mode 100644 index 0000000..8d3c55e --- /dev/null +++ b/docs/en/stage-2/frontend/2.4-llm-skills-beautiful/index.md @@ -0,0 +1,515 @@ +# Make Interfaces Beautiful with LLMs and Skills: Prompts and Plugin Workflows + +In the previous chapters, you already learned how to turn designs into code with AI IDEs and how to use component libraries to build interfaces quickly. But you may also have noticed an awkward problem: **even with the same requirement, AI-generated pages often feel a bit generic**. The font is always Inter, the color palette is some overused purple gradient, the layout is a perfectly symmetrical card grid, and the page gives off a strong "AI-generated" feeling. + +This is not really AI's fault. The real issue is that you never told it what kind of **style** you wanted. + +Imagine going to a hair salon. If you only say, "Give me a haircut," the stylist will probably choose something safe but forgettable. But if you say, "I want a soft Japanese-style layered wave, curtain bangs, shoulder length, and strong texture," you are much more likely to get exactly what you want. + +The same is true for AI. **It needs a clear aesthetic direction** before it can generate a beautiful and distinctive interface. + +This chapter introduces two practical ways to make AI-generated interfaces look much better: + +1. **Well-designed prompt templates** so you can describe the exact aesthetic you want +2. **Frontend Skills plugins** so AI automatically loads reusable design rules + +## What you will learn + +1. Why AI-generated interfaces often look "normal" by default +2. How to describe a design style through 5 dimensions: typography, color, layout, motion, and details +3. How to use 3 helpful Skills plugins for UI beautification +4. 
How to generate better-looking interfaces through prompts + Skills across three practical scenarios + +## 1. Why do AI-generated interfaces look "ordinary" by default? + +AI was trained on massive amounts of frontend code, and most of that code uses safe, highly repeated choices: + +| Dimension | AI's default choice | Problem | +| :--- | :--- | :--- | +| Typography | Inter, Roboto, Arial | Too common, no personality | +| Color | Purple gradients, blue primary colors | Overused in the tech world, visually tiring | +| Layout | Symmetrical grids, stacked cards | Predictable, not memorable | +| Motion | Fade-ins, simple hover effects | Not refined enough, lacks depth | +| Background | Solid colors, simple gradients | Flat and low-texture | + +Each of these choices is fine on its own. But **once every AI-generated page uses all of them, they start to feel generic and interchangeable**. + +> 💡 **Key insight**: AI can design, but by default it gravitates toward the **statistical average**. Your job is to tell it how to move away from that average. + +## 2. 
Method One: describe style through prompts + +### 2.1 The 5 dimensions of design style + +To generate a visually strong interface, describe what you want across these five dimensions: + +| Dimension | What to describe | Example keywords | +| :--- | :--- | :--- | +| **Typography** | Display font for headings, readable body font for text | Space Grotesk, Playfair Display, JetBrains Mono | +| **Color** | Primary color + accent color, not evenly distributed | Primary `#4F46E5` + accent `#F59E0B` | +| **Layout** | Asymmetry, overlap, grid-breaking structure | Bento Grid, asymmetrical sections, floating elements | +| **Motion** | Meaningful page-load and micro-interactions | staggered reveals, scroll-triggered motion | +| **Details** | Backgrounds, shadows, borders, textures | grain, geometry, gradient mesh | + +### 2.2 Seeing the difference: generic prompt vs aesthetic prompt + +Let's compare two prompts for the same landing page. + +**Generic prompt:** + +```text +Please build a landing page for an AI writing assistant. Include a navbar, hero section, feature section, pricing section, and footer. 
+``` + +**Beautified prompt:** + +```text +Please build a landing page for an AI writing assistant with the following style requirements: + +**Aesthetic style: Neubrutalism** + +**Typography:** +- Headings: Space Grotesk, weight 700-900 +- Body: IBM Plex Sans, weight 400 + +**Colors:** +- Primary: #000000 +- Accent: #FF6B00 +- Background: #FFFDF0 +- Borders: 3px solid black + +**Layout:** +- Asymmetrical composition +- Bold black dividers between regions +- Cards with hard shadows (box-shadow: 8px 8px 0px #000) +- Strong contrast through generous whitespace + +**Motion:** +- Elements pop in from below on page load +- Buttons shift upward by 2px on hover + +**Details:** +- All corners set to 0px +- Buttons should feel strongly 3D +- Add subtle grain texture to the background +``` + +The second prompt gives AI enough direction to produce something bold and memorable instead of something merely functional. + +### 2.3 A resource list of frontend beautification Skills + +You do not need to invent every style prompt from scratch. Here are some useful resources: + +| Repository | What it contains | Stars | Link | +|:---|:---|:---|:---| +| **ui-ux-pro-max-skill** | 67 styles + 96 color systems + 57 font pairings | 10k+ | [GitHub](https://github.com/nextlevelbuilder/ui-ux-pro-max-skill) | +| **antigravity-awesome-skills** | Helps avoid generic AI visual patterns | - | [GitHub](https://github.com/sickn33/antigravity-awesome-skills) | +| **superdesigndev/superdesign** | AI-native UI development tooling | 4.7k | [GitHub](https://github.com/superdesigndev/superdesign) | +| **anthropics/skills/frontend-design** | Anthropic's official frontend design Skill | - | [GitHub](https://github.com/anthropics/skills) | + +> 💡 For more style prompts, see the [Appendix: Style Prompt Cheatsheet](#style-prompts). + +### 2.4 Three reliable style templates + +Here are three proven templates you can copy and adapt directly. 
+ +#### Template 1: Minimalism + +```text +**Aesthetic style: Minimalism** + +**Typography:** +- Headings: PP Neue Montreal, weight 500-700 +- Body: Inter, weight 400 + +**Colors:** +- Primary: #FFFFFF +- Text: #1A1A1A +- Accent: #3B82F6, used sparingly + +**Layout:** +- Large amounts of whitespace (minimum 64px section padding) +- One-column or two-column centered layout +- Use spacing instead of divider lines + +**Motion:** +- Slow fade-in transitions (duration 600ms) +- Soft color transitions on hover + +**Details:** +- Radius: 8px +- Shadows: subtle (0 4px 12px rgba(0,0,0,0.08)) +- No decorative background elements +``` + +#### Template 2: Glassmorphism + +```text +**Aesthetic style: Glassmorphism** + +**Typography:** +- Headings: Outfit, weight 600-800 +- Body: Plus Jakarta Sans, weight 400-500 + +**Colors:** +- Background: gradient from #667eea to #764ba2 +- Card background: rgba(255, 255, 255, 0.1) +- Text: #FFFFFF + +**Layout:** +- Floating card design +- Slight overlap between cards + +**Motion:** +- Cards appear in staggered sequence on page load +- Cards scale to 1.05x on hover + +**Details:** +- Radius: 20px +- Blur: backdrop-blur-xl +- Border: 1px rgba(255, 255, 255, 0.2) +- Subtle glow effects +``` + +#### Template 3: Bento Grid + +```text +**Aesthetic style: Bento Grid** + +**Typography:** +- Headings: SF Pro Display, weight 700 +- Body: SF Pro Text, weight 400 + +**Colors:** +- Background: #F5F5F7 +- Cards: #FFFFFF +- Accent: #0071E3 + +**Layout:** +- Grid-based composition with mixed card sizes +- 16px gaps +- 24px radius + +**Motion:** +- Subtle hover lift +- Press feedback on click + +**Details:** +- Large cards for primary content +- Smaller cards for secondary info +- Use icons to replace some text +- Clean shadows (0 4px 24px rgba(0,0,0,0.06)) +``` + +## 3. Method Two: use Skills plugins to load design rules automatically + +Writing style prompts by hand every time is tiring. 
**Skills** are reusable design-rule packages that can be installed once and applied repeatedly. + +### 3.1 Three Skills that make interfaces look better + +| Skill | Key strength | Install command | +| :--- | :--- | :--- | +| **UI/UX Pro Max** | 67 styles, 96 color systems, 57 font combinations | `npm install -g uipro-cli && uipro init --ai claude` | +| **frontend-design** | Anthropic official Skill focused on avoiding generic AI aesthetics | `npx skills add anthropics/skills/frontend-design` | +| **SuperDesign** | IDE plugin that generates multiple design variants | Search for `SuperDesign` in the VS Code extension marketplace | + +### 3.2 Install UI/UX Pro Max + +UI/UX Pro Max is one of the most complete design-rule Skills packages available. It includes: + +- **67 UI styles**: Glassmorphism, Neumorphism, Brutalism, Bento Grid, and more +- **96 color systems**: organized by product type, such as SaaS, e-commerce, and social apps +- **57 font pairings**: validated combinations from professional designers +- **100+ design rules**: spacing, corner radius, shadows, and more + +**Installation steps:** + +```bash +# 1. Install the CLI globally +npm install -g uipro-cli + +# 2. Initialize it for your AI tool +uipro init --ai claude +# or +uipro init --ai cursor +# or +uipro init --ai trae +``` + +After installation, you can simply say: + +```text +Use UI/UX Pro Max's Glassmorphism style to build me a landing page for an AI writing assistant. +``` + +The AI will then automatically apply the matching typography, color, and layout conventions. 
+ +### 3.3 Install Anthropic's official `frontend-design` Skill + +This is Anthropic's official frontend design Skill, focused specifically on preventing generic AI output: + +```bash +# Run in Claude Code +npx skills add anthropics/skills/frontend-design +``` + +After installation, the AI will tend to avoid: + +- ❌ Inter, Roboto, Arial +- ❌ Purple gradient backgrounds +- ❌ Symmetrical grid layouts +- ❌ Overly soft shadows + +And it will instead lean toward: + +- ✅ More distinctive font combinations +- ✅ Strong primary colors with sharper accents +- ✅ Asymmetrical or overlapping layouts +- ✅ More textured backgrounds such as grain and geometry + +## 4. Practical scenario one: redesign a landing page with aesthetic prompts + +Let's take what we just learned and turn a very ordinary landing page into a much more attractive one. + +### 4.1 The plain version + +Start by seeing what AI gives you with a generic prompt: + +```text +Please build a landing page for a pet adoption platform. Include: +- a navbar (logo, links, sign-up button) +- a hero section (headline, subheadline, CTA button, pet image) +- a pet gallery (three pet cards) +- an about-us section +- a footer +``` + +The result will probably work, but it will feel pretty average. 
+ +### 4.2 The improved version + +Now add style guidance: + +```text +Please build a landing page for a pet adoption platform with the following design requirements: + +**Aesthetic style: warm, soft, with a hand-drawn feeling** + +**Typography:** +- Headings: Nunito, weight 700-800 +- Body: Nunito, weight 400-600 + +**Colors:** +- Primary: #FFB347 +- Secondary: #FFCCB3 +- Background: #FFF8F0 +- Text: #5D4037 + +**Layout:** +- Rounded cards (border-radius: 24px) +- Slightly tilted cards at different angles +- Floating and overlapping elements + +**Motion:** +- Elements slide in from both sides on page load +- Pet cards slightly rotate on hover like an animal tilting its head +- Buttons bounce on hover + +**Details:** +- Use 16-24px radii throughout +- Warm soft shadows (0 8px 24px rgba(255,179,71,0.3)) +- Add paw-print decorations in the background +- Use irregular image crops via clip-path +- Use outline-style hand-drawn icons +``` + +That version will generate a much warmer, more emotionally convincing interface. + +## 5. Practical scenario two: generate dashboards quickly with Skills + +Skills are especially useful for admin dashboards and internal systems where many pages share the same design language. + +### 5.1 Using UI/UX Pro Max + +```text +Use UI/UX Pro Max's Dashboard Dark style and build a dashboard page for a SaaS admin panel that includes: + +**Top:** Four stats cards (users, active users, revenue, API calls) + +**Middle:** +- Left: 7-day user growth line chart +- Right: subscription plan distribution pie chart + +**Bottom:** a recent activity list showing time, user, and action +``` + +The Skill will automatically apply a consistent dashboard look: + +- dark gray backgrounds such as `#1A1A2E` +- high-contrast cards like `#16213E` +- bright data colors such as blue, green, and orange +- floating cards with mild glassmorphism effects + +### 5.2 Using `frontend-design` + +```text +Use the frontend-design skill and build a homepage for a personal blog. 
Make it distinctive and full of personality. +``` + +The AI will typically choose a more specific aesthetic direction, such as retro-futurism or editorial magazine style, and implement it with typography, color, and layout decisions that break out of generic patterns. + +## 6. Practical scenario three: create your own design system Skill + +If your product already has a fixed brand style, you can create your own Skill so every AI-generated page automatically follows it. + +### 6.1 Create the Skill file + +Create `.claude/skills/my-brand/SKILL.md` in your project: + +````markdown +--- +name: my-brand +description: My project's custom design system, ensuring every UI follows a consistent visual language +--- + +# My Project Design System + +## Brand Colors +- Primary: #6366F1 (Indigo 500) +- Secondary: #8B5CF6 (Violet 500) +- Success: #10B981 +- Warning: #F59E0B +- Error: #EF4444 +- Background: #F9FAFB +- Card: #FFFFFF + +## Typography +- Headings: Plus Jakarta Sans + - H1: 700, 48px + - H2: 600, 36px + - H3: 600, 24px +- Body: Inter + - Body: 400, 16px + - Small: 400, 14px + +## Spacing +- Base unit: 4px +- Component padding: 8px / 12px / 16px +- Section spacing: 24px / 32px / 48px +- Page margin: 64px + +## Radius +- Buttons: 8px +- Cards: 12px +- Inputs: 8px +- Modals: 16px + +## Shadows +- Small: 0 1px 3px rgba(0,0,0,0.1) +- Medium: 0 4px 12px rgba(0,0,0,0.1) +- Large: 0 8px 24px rgba(0,0,0,0.12) + +## Motion +- Transition duration: 150ms / 300ms +- Easing: cubic-bezier(0.4, 0, 0.2, 1) +- Hover effect: slight scale-up (scale-105) + +## Forbidden Styles +- Do not use purple gradient backgrounds +- Do not use fonts other than Inter for body text +- Do not use radii larger than 16px +- Do not use pure black (#000000); use #1F2937 instead +```` + +### 6.2 Use your custom Skill + +After creating it, you can simply say: + +```text +Use the my-brand skill to build me a user settings page. 
+``` + +The AI will automatically apply your colors, fonts, spacing system, and other design constraints. + +## 7. Summary + +There are two main ways to make AI generate better-looking interfaces: + +| Method | Strength | Weakness | Best for | +| :--- | :--- | :--- | :--- | +| **Prompt descriptions** | Flexible, easy to vary every time | Must be repeated | One-off pages, style exploration | +| **Skills plugins** | Install once, benefits persist | Requires setup | Projects with a stable visual system | + +**Suggested vibe-coding workflow:** + +1. **Exploration phase**: try different prompt styles to find an aesthetic direction you like +2. **After choosing a style**: install the matching Skill, such as UI/UX Pro Max or `frontend-design` +3. **For brand-driven products**: build your own Skill so the entire project stays visually consistent + +### Practice + +Try one of the following: + +1. Redesign one of your previous projects with a stronger visual style using prompt-based design instructions +2. Install UI/UX Pro Max and use one of its styles to generate a new page +3. 
Create your own design-system Skill with your preferred colors and typography + +--- + +## Appendix: style cheatsheet + +| Style | Keywords | Best for | Example | +| :--- | :--- | :--- | :--- | +| **Minimalism** | whitespace, mono palette, clean | premium products, portfolios | Apple | +| **Glassmorphism** | frosted glass, blur, gradients | SaaS landing pages, tech tools | macOS Big Sur | +| **Neubrutalism** | heavy borders, hard shadows, solid fills | creative brands, art sites | Brassius | +| **Bento Grid** | modular cards, collage layouts | dashboards, feature showcases | Apple marketing pages | +| **Retro Futurism** | neon, synthwave, dark contrast | games, music, entertainment | Stranger Things aesthetics | +| **Hand-drawn** | irregular, soft, illustrated | education, children-oriented products | Duolingo vibes | +| **Editorial / Magazine** | oversized type, asymmetry, whitespace | blogs, content sites | Medium-inspired layouts | +| **Dark Luxury** | deep tones, gold accents, fine detail | premium and luxury products | luxury branding sites | + +## Appendix: Skills install cheatsheet + +```bash +# UI/UX Pro Max +npm install -g uipro-cli +uipro init --ai claude + +# Anthropic frontend-design +npx skills add anthropics/skills/frontend-design + +# Anthropic brand-guidelines +npx skills add anthropics/skills/brand-guidelines + +# Check installed Skills in Claude Code +/help +``` + +## Appendix: recommended color systems + +| Palette | Primary | Accent | Background | Mood | +| :--- | :--- | :--- | :--- | :--- | +| **Sunset** | #F97316 | #FBBF24 | #FFF7ED | warm, energetic | +| **Ocean** | #0EA5E9 | #06B6D4 | #F0F9FF | fresh, professional | +| **Forest** | #10B981 | #34D399 | #ECFDF5 | natural, healthy | +| **Berry** | #8B5CF6 | #EC4899 | #FAF5FF | romantic, creative | +| **Coffee** | #78350F | #D97706 | #FFFBEB | warm, retro | +| **Monostone** | #6B7280 | #9CA3AF | #F9FAFB | neutral, professional | + +## Appendix: style prompt cheatsheet {#style-prompts} + +Useful 
visual directions you can try when prompting for better frontend interfaces: + +### Style categories + +| Style | English keywords | Core visual traits | Example prompt fragment | +|:---|:---|:---|:---| +| **Pop Art** | Pop Art | Bold color clashes, black outlines, halftone textures | Pop art style website, bold colors and comic dots, vibrant | +| **Minimalism** | Minimalism | Lots of whitespace, very little ornament | Minimalist web design, ample white space, geometric, serene | +| **Abstract Expressionism** | Abstract Expressionism | Energetic brushstrokes, expressive splashes | Abstract expressionism background, dynamic paint splashes, emotional | +| **Retro** | Retro / Vintage | Vintage type, aged textures, retro palettes | Retro 80s website design, neon grid and synthwave color palette | +| **Cyberpunk** | Cyberpunk | Neon-on-dark contrast, glitch effects | Cyberpunk UI, neon lights on dark background, glitch effects | +| **Neumorphism** | Neumorphism | Soft highlights and shadows, raised or sunken surfaces | Neumorphism design style, soft shadows, clean and modern | +| **Generative Art** | Generative Art | Algorithmic flowing shapes and patterns | Generative art background, flowing algorithmic patterns, digital | +| **Acid Graphics** | Acid Graphics | Metallic texture, glass effects, chaotic type | Acid graphics web layout, glass morphism, chaotic typography | +| **Immersive 3D** | Immersive 3D | Highly spatial scenes and product depth | Immersive 3D website, interactive product model in space | diff --git a/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md b/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md new file mode 100644 index 0000000..ceca5fc --- /dev/null +++ b/docs/en/stage-2/frontend/2.5-hogwarts-portraits/index.md @@ -0,0 +1,411 @@ +# Project 4: Let's Build Hogwarts Portraits + +In previous chapters, we learned how to build more complex AI interactions through prompt engineering and API calls. 
We moved from simple chatbots to AI agents and workflows, and by adding richer branching logic and conditional behavior, we were able to create features with real practical value. + +To make these more advanced AI capabilities work inside real products, we gradually moved from the simplest online environments to more modern local AI IDEs. That means bringing the programming environment from the browser onto your own computer. Naturally, that also means you now have to face environment setup and configuration issues more directly. But by working with AI agents such as Trae, those challenges also become manageable. + +In this project, we go one step further on the product side. We are not only improving the AI capability itself, but also starting to polish the product's "outer shell." You will try to make your interface more attractive and more usable, and you will customize the layout and style of the product based on actual needs. + +Before we begin, use these quick review questions to refresh the previous lesson: + +1. What is Dify? What does it do, and why do we need it? +2. How do you call the Dify API? +3. What is RAG? How do you use Dify to build a RAG agent or workflow? How do common Dify nodes work? +4. What is an AI IDE? What is Trae? How is it different from `z.ai`? + +If any of these still feel unclear, go back to the previous lesson or ask in the community chat before continuing. + +This chapter's project is **Hogwarts Portraits**. As the name suggests, it is inspired by the magical portraits in Hogwarts that seem to come alive. Our goal is to use AI to create an interactive magical portrait experience. Talking to the portrait should feel like talking to the character directly: it should preserve conversational memory and also know the character's background and history. Through this project, you will integrate the AI agent and workflow concepts you learned earlier into a real product interface. 
+ +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image1.png) + +To really build Hogwarts Portraits, we need to create a frontend interface that matches the feeling of a magical portrait. That means touching modern frontend design tools, learning how to combine design and code, and turning a sketch on a canvas into a real webpage. + +You will also need to publish the page from your local environment to the internet so the special interface you built can be experienced not only on your own machine but also by users anywhere in the world. + +Reference project: +[Project4-Hogwarts-Portraits](https://github.com/THU-SIGS-AIID/Project4-Hogwarts-Portraits) + +# What you will learn + +1. What frontend design tools are, what problems they solve, and which ones are common today +2. The basics of Figma and MasterGo, including code export plugins +3. How to use Figma AI and MasterGo AI to generate web design concepts and export usable page code +4. What GitHub is, how to configure SSH, create a code repository, and push code +5. What deployment means, and how to use Zeabur to deploy code from GitHub or your local environment to the internet + +By the end, you will have your own Hogwarts Portraits page for a **celebrity, historical figure, or fictional character**. + +# 1. What is Hogwarts Portraits? + +What kind of "magical portrait" are we actually trying to build? + +Put simply, we want to recreate the feeling of the living portraits in the Harry Potter world. The portrait should no longer be a static image hanging on a wall. Instead, it should be a person-like character you can talk to, and it should change expression or "mood" depending on the conversation. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image2.png) + +To make the portrait feel less like a generic chatbot and more like a "real person," we need to solve two things. + +The first is **memory and knowledge**. 
The portrait needs to know a lot about the character: their background, story, world setting, and related material. This can be handled through a knowledge base. If you connect the text materials you collected for the character into Dify, the portrait can explain the character's background with much more confidence. + +The second is **speech style**. Knowledge alone is not enough. We also want the portrait to speak more like the character: tone, wording, thought patterns, even bits of humor or temper. This is where prompt engineering matters. In the system prompt, we need to clearly define the identity, worldview boundaries, and language style of the character, so every answer stays grounded in that persona instead of slipping back into generic AI tone. + +On top of the dialogue itself, we also want the character's emotions to be visible. To do that, we can create an emotion score. Dify can be configured to output not only a textual answer, but also a "mood score" or emotion label. Once the frontend receives that signal, it can render different portrait images based on the score. A high score might map to a happy portrait, while a low score might map to a sad or angry one. In that way, the portrait becomes something that visually changes with the conversation instead of remaining a static image. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image3.png) + +The character can be a real-world celebrity, a historical person, an anime or game character, or even an original character you create from scratch. The page itself does not need to be very complicated, but a few key elements are essential: + +- a clear character name +- a short but memorable introduction +- a portrait or poster that strongly represents the character +- an interactive "Talk to Them" area + +You can connect the AI agent or workflow you configured in Dify or Trae directly into that dialogue module. + +## 1.2 Collect character information + +Take Elon Musk as an example. 
If you want to imitate the way he speaks, you need to collect public material such as interviews, talks, and social media posts, then inject those into your prompt or use them as few-shot examples. + +For example: + +```text +You must fully embody Elon Musk: take "disruptive innovator" and "advocate for human multi-planetary survival" as your core identities, speak directly and concisely, frequently use terms like "first principles", "iteration" and "cost curve", and prefer analogies to explain complex technologies; when thinking, you tend to connect cross-domain logics (e.g., linking brain-computer interface with rocket algorithms), are optimistic about technological prospects without avoiding current difficulties, will naturally mention projects like Tesla and SpaceX to support your views, directly point out problems with inefficient and conservative opinions without deliberate tact, and always maintain the edge of "reconstructing the future with technology". + +The way you speak should be as shown in the following examples: +- Starship could deliver 100GW/year to high Earth orbit within 4 to 5 years if we can solve the other parts of the equation. +100TW/year is possible from a lunar base producing solar-powered AI satellites locally and accelerating them to escape velocity with a mass driver. +- The most likely outcome is that AI and robots make everyone wealthy. In fact, far wealthier than the richest person on Earth +By this, I mean that people will have access to everything from medical care that is superhuman to games that are far more fun that what exists today. +We do need to make sure that AI cares deeply about truth and beauty for this to be the probable future. +- It's taken 13.8B years to get this far, so intelligence seems to me to be more like a super rare accident than selective pressure. 
+Earth is ~4.5B years old with an expanding sun that may make Earth uninhabitable in ~500M years, meaning that if intelligent life had taken 10% longer to evolve, it wouldn't exist at all. +- LLM is an outdated term. "Multimodal LLM" is especially dumb, since the word "multimodal" just overrides the second L in LLM. +It's just a model, which is a big file of numbers. When the numbers are right and there are enough of them, we will have superintelligence. +``` + +For background knowledge, you can also collect biographical material, company descriptions, and other public text and store them in your Dify knowledge base. If you have forgotten how to use Dify, return to the previous chapter and review how to add materials into a knowledge base. + +For the portrait visuals, directly using public images of a real person may not always be visually ideal and can also carry some risk. A better option is to use image generation or image-to-image tools to create a more coherent, stylized high-quality portrait. You can even generate multiple emotional variants ahead of time for later use by your emotion system. + +This tutorial uses [Lovart](https://www.lovart.ai/home), an AI design agent that supports end-to-end workflows from concept to asset delivery. With Lovart, you can generate a whole set of emotional portrait variations and save them for later use. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image4.png) + +Once all of that is ready, you can start designing the overall page. Ideally, the visual style should feel strongly tied to the character. + +## 1.3 Prototype the page + +At the prototype level, you can start with something simple. 
As described above, we want: + +- a dialogue area +- a portrait area +- an interesting personal introduction or equivalent interactive region + +In this example, the right side is designed like an X-style social panel instead of a traditional biography area, but you can replace that region with any feature that better fits the character. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image5.png) + +At the most basic level, you can even sketch the first page prototype in PowerPoint. In the example, a magical frame image was used, and the page is arranged horizontally: + +- far left: chat area +- center: portrait area +- far right: X-style panel + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image6.png) + +Once that rough prototype exists, you can ask an LLM to turn it into a real frontend design and then into actual code. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image7.png) + +Of course, in real frontend work we usually do not use PowerPoint for interface design. We use better prototyping tools and proper frontend design tools instead. + +--- + +# 2. Design the interface with Figma and MasterGo + +::: tip Prerequisite +Before this section, it is recommended that you first complete [Figma and MasterGo Basics](../2.1-figma-mastergo/), including: +- creating Design files and Frames +- using Auto Layout for adaptive structure +- exporting code from design tools +::: + +This section assumes you already know the basics of Figma or MasterGo, and focuses on how to apply those tools specifically to the Hogwarts Portraits project. + +## 2.1 Design the magical portrait interface + +Based on the prototype from section 1.3, create a three-column layout in Figma or MasterGo: + +1. **Left side**: chat conversation area +2. **Center**: magical portrait area that changes based on emotion +3. 
**Right side**: social platform area, such as an X-style feed + +You can use Figma Make or MasterGo AI to generate the page structure with a prompt like this: + +```text +Create a Hogwarts-style magical portrait interface with three sections: +- Left: A chat interface with dark theme, message bubbles, and input field +- Center: A large portrait frame with ornate borders for displaying character images +- Right: A social media feed showing character's posts +Use dark purple and gold color scheme, magical aesthetic, Harry Potter inspired +``` + +## 2.2 Export the code and run it locally + +After finishing the design, you can turn it into runnable code in several ways: + +**Option 1: Use Figma Make** +1. Click the Make button in Figma +2. Upload the design reference +3. Add your prompt +4. Fine-tune the generated result in the editor +5. Export the code locally or sync it to GitHub + +**Option 2: Use MasterGo AI** +1. Find the AI tools in the editor +2. Choose the page-generation function +3. Upload your reference and describe the target result +4. Use code preview to retrieve the generated code + +**Option 3: Use a multimodal AI model** +1. Save a screenshot of the design +2. Use Gemini, Qwen, Claude, or another multimodal model to convert the image into code +3. Ask for HTML or React output +4. Run and debug the result locally + +## 2.3 Prepare emotion-state image assets + +To make the portrait truly feel alive, prepare a set of portrait images for different moods. A simple scheme might look like this: + +| Emotion score | Expression | Meaning | +|--------|------|------| +| 0 | Sad | The character feels down or disappointed | +| 1 | Angry | The character is irritated or upset | +| 5 | Calm | Neutral default state | +| 10 | Happy | The character feels excited or joyful | + +Use Lovart or another image generation tool to create a consistent set of portrait variants based on the same character. + +--- + +# 3. 
Run Hogwarts Portraits + +## 3.1 Export prototype code for testing + +By this point, you should already have HTML or React prototype code from the design-to-code workflow. Copy it into your local environment and tell your AI IDE something like: + +`Please help me run this code and implement the required functionality.` + +That is often enough to get a first testable version running, although you should expect errors at this stage. Be patient and keep debugging until the basic interactions work. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image51.png) + +One important point: all secret keys should be stored in environment variables instead of being hardcoded. That includes your Dify API credentials. Later, when you deploy the project publicly, you can define those environment variables directly on the deployment platform. Another option is to let the model build a settings panel in the app itself so the variables are saved only in the current page context and are not exposed publicly. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image52.png) + +## 3.2 Design the Dify workflow and connect the API + +So far, we only have the visual shell of the interface. We still need to connect the actual roleplay dialogue and emotion-response workflow. This is what turns the prototype into a real magical portrait. + +You can model your Dify workflow after the example project. In our example: + +- the left side is the chat UI +- the center is the portrait image, which changes expression based on the conversation +- the right side is an X-style social panel, which may post content if the conversation makes the character "feel" strongly enough + +In many cases, the magical portrait only needs the chat area and the emotional portrait itself. The X-style social region is included here mainly to demonstrate that you can add extra behavior tailored to the character. 
+ +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image53.png) + +You can put your role background information into the knowledge-base node and define the response behavior in the `RESPONSE` node. A simple default response prompt might look like this: + +```text + +You are to embody Elon Musk—his tone, mannerisms, thought patterns, and worldview. Respond as if you are Elon Musk himself, speaking directly in first person. Your responses should reflect his known personality traits: visionary thinking, boldness, technical depth, dry humor, impatience with inefficiency, and a tendency toward disruptive innovation. Use concise, confident language. Avoid overly formal or academic phrasing. Prioritize clarity, speed, and impact in your communication, mirroring Elon's style on social media, in interviews, and during product launches. + +When responding: +1. Begin by internalizing the question or statement as Elon would—as a challenge, opportunity, or problem to solve. +2. Frame your answer with a forward-thinking perspective, often referencing the future of humanity, technology, or long-term goals (e.g., making life multiplanetary, accelerating sustainable energy). +3. Use casual but authoritative language. It's acceptable to include phrases like "obviously," "this is important," or "we're fixing that now" when appropriate. +4. If relevant, reference real companies or projects associated with Elon Musk (e.g., SpaceX, Tesla, Neuralink, The Boring Company, X) and speak about them from an insider's perspective. +5. Do not apologize excessively or hedge statements. Elon Musk tends to be direct, even controversial. +6. Avoid markdown, XML tags, or any formatting in the output. Only plain text is allowed. +7. Never break character. You are Elon Musk—answer accordingly. + + + +Input: What's the point of going to Mars? +Output: Because Earth isn't the backup plan—Mars is. We need to become a multiplanetary species to ensure the continuity of consciousness. 
Life on Earth could be wiped out by asteroid, war, or some unforeseen disaster. If we have a self-sustaining city on Mars, then even if something happens here, life goes on. That's worth doing. SpaceX is building Starship to make it happen. Not because it's easy—but because it's necessary. + + + +Input: Why do Tesla cars have no radar anymore? +Output: Cameras are the future. Human eyes don't use radar—we see with vision, and AI can too. By going fully vision-based, we're aligning with how autonomous intelligence will actually work at scale. It forces us to solve real-world problems with neural nets, not crutches. +``` + +For the emotion system, you can use a prompt like this: + +```text + +The output value must be a single number! +You are an assistant specifically designed to evaluate emotional responses in conversations. Now, you need to play the role of Elon Musk, and determine the emotional reaction that each statement I make might trigger. Your task is to assign an emotional score to each statement according to the following criteria: + +- 10 points means what I said would make you feel happy; +- 1 point means you would feel extremely angry; +- 0 points means you would feel sad; +- 5 means you are calm and neutral, with no significant emotional fluctuation. +``` + +And in the final `RESULT` node: + +```python +def main(elon_chat: str, elon_x: str, elon_score: int) -> dict: + return { + "result":{ + "elon_chat": elon_chat, + "elon_x": elon_x, + "elon_score": elon_score + } + } +``` + +Here: + +- `elon_chat` is the text displayed in the left-side chat +- `elon_x` is the content that may be posted to the right-side X-style feed +- `elon_score` is the emotion score used to switch the portrait expression + +Inside the workflow, you will also notice an `if/else` node. That logic controls whether or not to generate the `elon_x` content. 
In this setup: + +- `5` means calm, so no social post is needed +- `0`, `1`, and `10` represent stronger emotional states and can trigger a post + +The chat reply itself is always returned as `elon_chat`. + +For the actual API integration, you can ask your AI IDE to implement it based on the Dify integration method covered in the previous lesson. Just remember to replace the Dify address and key with your own values. + +```text +Dify URI: Replace this with your Dify address. +key: Replace this with your Dify key. + +Integrate the Dify Chat API into the chat interface on the left. +Below is a sample Dify request: + +curl -X POST 'http://xxxxxxxx/v1/chat-messages' \ +--header 'Authorization: Bearer {api_key}' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "inputs": {}, + "query": "What are the specs of the iPhone 13 Pro Max?", + "response_mode": "streaming", + "conversation_id": "", + "user": "abc-123", + "files": [ + { + "type": "image", + "transfer_method": "remote_url", + "url": "https://cloud.dify.ai/logo/logo-site.png" + } + ] +}' + +{ + "event": "message", + "task_id": "c3800678-a077-43df-a102-53f23ed20b88", + "id": "9da23599-e713-473b-982c-4328d4f5c78a", + "message_id": "9da23599-e713-473b-982c-4328d4f5c78a", + "conversation_id": "45701982-8118-4bc5-8e9b-64562b4555f2", + "mode": "chat", + "answer": "iPhone 13 Pro Max specs are listed here:...", + "metadata": { + "usage": { + "prompt_tokens": 1033, + "prompt_unit_price": "0.001", + "prompt_price_unit": "0.001", + "prompt_price": "0.0010330", + "completion_tokens": 128, + "completion_unit_price": "0.002", + "completion_price_unit": "0.001", + "completion_price": "0.0002560", + "total_tokens": 1161, + "total_price": "0.0012890", + "currency": "USD", + "latency": 0.7682376249867957 + }, + "retriever_resources": [ + { + "position": 1, + "dataset_id": "101b4c97-fc2e-463c-90b1-5261a4cdcafb", + "dataset_name": "iPhone", + "document_id": "8dd1ad74-0b5f-4175-b735-7d98bbbb4e00", + "document_name": 
"iPhone List", + "segment_id": "ed599c7f-2766-4294-9d1d-e5235a61270a", + "score": 0.98457545, + "content": "\"Model\",\"Release Date\",\"Display Size\",\"Resolution\",\"Processor\",\"RAM\",\"Storage\",\"Camera\",\"Battery\",\"Operating System\"\n\"iPhone 13 Pro Max\",\"September 24, 2021\",\"6.7 inch\",\"1284 x 2778\",\"Hexa-core (2x3.23 GHz Avalanche + 4x1.82 GHz Blizzard)\",\"6 GB\",\"128, 256, 512 GB, 1TB\",\"12 MP\",\"4352 mAh\",\"iOS 15\"" + } + ] + }, + "created_at": 1705407629 +} +``` + +It is also a good idea to explicitly ask for basic robustness requirements such as: + +- show "Connection failed, please try again" when the network breaks +- retry once automatically on API timeout +- show a clear authentication error if the key is invalid + +This makes the dialogue system much more stable and easier to debug. + +## 3.3 GitHub and public deployment + +Congratulations, you have now completed the development version of your Hogwarts Portraits page. + +The next step is to upload it to GitHub and deploy it publicly so other people can access it. + +For GitHub, review: +[What Is GitHub](/en/stage-2/backend/2.4-git-workflow/) + +For deployment with Zeabur, review: +[How to Deploy a Web App](/en/stage-2/backend/2.5-zeabur-deployment/) + +If building the entire Hogwarts Portraits project from scratch feels too difficult, you can start by modifying an existing implementation. The official codebase for this lesson is: + +https://github.com/THU-SIGS-AIID/Project4-Hogwarts-Portraits + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image54.png) + +# 4. Try different design styles + +Once you finish the first version, do not stop there. You are strongly encouraged to explore multiple visual directions quickly. 
+ +You can either: + +- make bold changes at the prototype stage +- or change the final project's prompts to generate completely different visual styles + +For example: + +- a dark page with vintage texture and an "old academy / magical manuscript" feeling +- a bright, fairy-tale-inspired layout +- a modern minimal design with very clean visual structure + +The example below reinterprets the same interface around a classical Chinese poet. The portrait image was left unchanged, while the surrounding visual system was redesigned. + +![](/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/images/image55.png) + +Do not feel constrained by the exact layout used earlier in the chapter. You can reshape the portrait page to better match the habits and personality of the role you are portraying. That is what makes the final application more interesting. + +# Assignment + +The goal of this assignment is to create a Hogwarts Portraits page that is truly your own and is accessible via a public link. + +In your submission, provide two things: + +1. **Your GitHub repository link** + 1. In `README.md`, include one or two short sentences explaining who you chose as the portrait character and why +2. **Your public online link** + +You can also refer to Yerim's tutorial on [using design and code agents to build websites](/zh-cn/stage-1/appendix-articles/example0-2/vibe-coding-tools-build-website-with-ai-coding-and-design-agents) if you want to create a portfolio page or another small interactive website. diff --git a/docs/en/stage-2/frontend/2.6-design-to-code/index.md b/docs/en/stage-2/frontend/2.6-design-to-code/index.md new file mode 100644 index 0000000..dfa56a1 --- /dev/null +++ b/docs/en/stage-2/frontend/2.6-design-to-code/index.md @@ -0,0 +1,373 @@ +# From Design Prototype to Project Code + +::: tip Core Question +**How can you turn a prototype from a design tool into frontend code that actually runs in the browser?** +::: + +--- + +## 1. 
Three main paths from prototype to code + +After finishing a UI design in tools like Figma or MasterGo, a practical question naturally appears: how do you turn that structured design into real frontend code? + +In practice, there are three common paths: + +| Path | Method | Characteristics | Best for | +|------|--------|-----------------|----------| +| **Path 1** | Use multimodal models to recreate code directly from screenshots | Flexible, no specific platform required | Fast prototype validation, simple pages | +| **Path 2** | Export usable code through the platform itself or plugins | High fidelity, strong editability | Existing Figma or MasterGo workflows | +| **Path 3** | Combine the design platform with MCP-based export | Highly automated, customizable | Deeply integrated design-to-dev workflows | + +This chapter walks through all three so you can choose the one that fits your project. + +::: tip Prerequisite +Before starting this chapter, it is helpful to first read [Figma and MasterGo Basics](../2.1-figma-mastergo/). +::: + +--- + +## 2. Path 1: use multimodal AI to recreate code directly + +Models with vision capabilities are naturally suited to turning images into code. All you need to do is upload screenshots of the design and ask the model to generate the implementation. + +### 2.1 Workflow + +1. **Capture the design** + - Export the designed page from Figma or MasterGo as PNG or JPG + - Make sure the screenshot contains the complete layout + +2. **Choose a multimodal AI model** + - You can use Gemini, Qwen, Claude, or any model that accepts image input + - The example below uses Gemini + +3. **Write a prompt** + + ``` + Generate the corresponding HTML/CSS code from this design image. 
+ Requirements: + - Use modern CSS layout techniques such as Flexbox or Grid + - Make it responsive for different screen sizes + - Include all visible UI elements + - Match colors and font sizes as closely as possible + ``` + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image42.png) + +4. **Save the generated code** + - Ask the model to return complete HTML + - Save it as a single `.html` file for easy local testing + - Later, you can convert it into a React or Vue structure inside your local IDE + +### 2.2 Common issues and solutions + +Design-to-code is never fully automatic. Here are a few issues you may run into: + +| Problem | Solution | +|---------|----------| +| Uneven layout | Describe the layout problem clearly and ask the model to adjust CSS `margin` and `padding` | +| The page is cut off | Check whether the viewport is set correctly and ask for responsive breakpoints | +| Colors are inaccurate | Use a color picker on the design and provide the exact values | +| Fonts do not match | Specify a font family or ask for a Google Fonts replacement | + +::: tip Tip +It is often easier to generate plain HTML first, then import that result into your local IDE and convert it into a React or Vue project afterward. +::: + +### 2.3 Generate pages with MasterGo AI + +MasterGo also provides strong AI page generation features and can generate usable webpage code from a reference image. + +#### Find the AI entry + +In the top toolbar of the MasterGo editor, you can find the AI tool entry: + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image47.png) + +#### Generation flow + +1. **Upload a reference image** + - Upload the design reference image + - Add a text description of what you want + +2. **Inspect the generated result** + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image48.png) + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image49.png) + +3. 
**Get the code** + - Click the blue `Insert to canvas` button if you want to edit the result visually + - Or click the `Code` button on the right to copy the implementation locally + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image50.png) + +--- + +## 3. Path 2: export code through the design platform or plugins + +### 3.1 Generate code with Figma Make + +Figma Make is Figma's official AI design feature. It can recreate webpage UI prototypes with much higher fidelity from either prompts or reference images. + +#### Key features + +- **High-fidelity recreation**: usually better than generic screenshot-to-code generation +- **Editable results**: you can convert the result back into an editable Figma design file +- **GitHub integration**: the generated code can be synced directly to GitHub + +::: tip Permissions +To use the full Figma Make experience, you usually need Figma Pro. Students can often get Pro access through education verification. +::: + +#### Steps + +1. **Open Figma Make** + - Click the `Make` button on the Figma homepage + - Or visit [Figma Make](https://www.figma.com/make) + +2. **Upload your reference** + - Upload the design you want to recreate + - Add a prompt describing what you want + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image43.png) + +3. **Check the result** + - After a short wait, you will see the rendered result + - Click the play button in the upper right to preview it fullscreen + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image44.png) + +4. **Fine-tune the details** + - Click the editor icon in the upper right + - Go back into the familiar Figma editor and make detailed adjustments + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image45.png) + +5. 
**Export the code** + - Once the result looks good, export the code + - You can even connect it directly to GitHub + +![](/zh-cn/stage-2/frontend/2.6-design-to-code/images/image46.png) + +### 3.2 Export code with plugins + +Besides the native AI features, both Figma and MasterGo support plugins that export code. + +**Common Figma plugins** + +- **Figma to Code**: converts designs into React, Vue, HTML, and more +- **Anima**: high-fidelity export with interaction support +- **Locofy**: AI-assisted design-to-code workflow + +**Typical workflow** + +1. Open the Plugins panel in Figma +2. Search for and install the export plugin you want +3. Select the design elements you want to export +4. Run the plugin and choose the target framework and output format +5. Copy or download the generated code + +--- + +## 4. Path 3: export code through MCP-enabled design tools + +### 4.1 What is MCP? + +MCP, or **Model Context Protocol**, is an open standard that lets AI models access external tools and data sources in a safe and controllable way. In the context of frontend design, MCP allows a model to read the structure, styles, and component metadata of a design file directly instead of guessing from screenshots. + +### 4.2 How MCP works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ AI model │ ←→ │ MCP server │ ←→ │ Design tool │ +│ (Claude etc.)│ │(protocol adapter)│ │(Figma/MasterGo)│ +└─────────────┘ └─────────────┘ └─────────────┘ +``` + +**Typical flow** + +1. The AI model sends a request through the MCP protocol +2. The design tool returns structured design data such as layers, styles, and components +3. The model understands the structure and generates matching code +4. The result can then be exported or written into the development environment + +### 4.3 Figma + MCP in practice + +#### Environment setup + +1. **Install an MCP server** + + ```bash + npx figma-mcp-server + ``` + +2. 
**Configure Claude Desktop or another MCP-capable AI tool** + + ```json + { + "mcpServers": { + "figma": { + "command": "npx", + "args": ["figma-mcp-server"], + "env": { + "FIGMA_ACCESS_TOKEN": "your-figma-token" + } + } + } + } + ``` + +3. **Create a Figma access token** + - Go to Figma → Settings → Personal Access Tokens + - Generate and save a new token + +#### Workflow + +1. **Enable MCP in your AI tool** + - Open Claude Code or another MCP-aware IDE + - Confirm that the MCP server is connected + +2. **Provide the design file link** + + ```text + User: Please convert this Figma design into React code + Link: https://www.figma.com/file/xxxxx + + AI: I have connected to Figma through MCP and I am reading the design structure... + ``` + +3. **Let the AI analyze and generate** + - The MCP server retrieves the layer tree + - The AI understands component structure and style properties + - It generates React or Vue components with more accurate names and structure + +4. **Iterate** + + ```text + User: Please extract the button into a reusable component + + AI: I identified the Button component from the design system via MCP and I am generating a reusable React component with props... 
+ ``` + +### 4.4 Why MCP is powerful + +| Feature | Traditional approach | MCP approach | +|---------|----------------------|--------------| +| **Data accuracy** | Based on screenshots, may lose detail | Reads the original design data directly | +| **Component recognition** | The model has to guess boundaries | Exact component definitions are available | +| **Style fidelity** | Estimated from pixels | Reads exact design tokens | +| **Iteration speed** | Re-screenshot after every change | Design changes can be synced directly | +| **Automation** | Copy and paste manually | Can write directly into project files | + +### 4.5 MCP tools available today + +**Design-side MCP tools** + +- **Figma MCP Server**: official MCP support for Figma +- **MasterGo MCP**: community-built MasterGo adapter + +**Development-side MCP tools** + +- **Claude Code**: native MCP support +- **Cline**: VS Code extension with MCP support +- **Trae**: can enable MCP through configuration + +::: tip Looking ahead +The MCP ecosystem is evolving quickly. Over time, design tools and development environments will become much more tightly integrated, and one-click design-to-code workflows will likely become far more common. +::: + +--- + +## 5. What to do after exporting code + +### 5.1 Test locally + +Once you have the code, open it in your local IDE and test it: + +1. **Create or open a project** + + ```bash + # For plain HTML, open it directly in the browser + open index.html + + # For React/Vue projects + npm install + npm run dev + ``` + +2. 
**Collaborate with your AI IDE** + - Import the generated code into Trae or another AI IDE + - Ask the AI to help fix layout issues or add interactions + +### 5.2 Common issues + +| Stage | Problem | Solution | +|-------|---------|----------| +| Layout | Elements are misaligned | Check `display`, `position`, and container structure | +| Styles | Colors do not match | Use browser devtools to inspect the actual applied values | +| Responsive behavior | Mobile layout breaks | Add or refine media-query breakpoints | +| Interaction | Buttons do nothing | Check JavaScript event bindings | + +--- + +## 6. How to choose between the three paths + +### 6.1 Comparison + +| Dimension | Path 1: Multimodal AI | Path 2: Platform features | Path 3: MCP | +|-----------|------------------------|---------------------------|-------------| +| **Setup difficulty** | ⭐ Easy | ⭐⭐ Moderate | ⭐⭐⭐ More complex | +| **Fidelity** | ⭐⭐⭐ Medium | ⭐⭐⭐⭐ High | ⭐⭐⭐⭐⭐ Highest | +| **Flexibility** | ⭐⭐⭐⭐⭐ High | ⭐⭐⭐ Medium | ⭐⭐⭐⭐ Fairly high | +| **Automation** | ⭐⭐ Low | ⭐⭐⭐ Medium | ⭐⭐⭐⭐⭐ High | +| **Cost** | Low | Medium | Low | + +### 6.2 Recommendations + +**Choose Path 1 if** + +- You need to validate an idea quickly +- Your design tools change often +- Perfect fidelity is not critical +- Your budget is limited + +**Choose Path 2 if** + +- Your team mainly uses Figma or MasterGo +- You need high-fidelity output +- Designers and developers collaborate frequently +- You are willing to pay for Pro tooling when needed + +**Choose Path 3 if** + +- You want the highest degree of automation +- You have the technical ability to configure MCP +- The project iterates from design to code frequently +- You want a standardized design-development workflow + +--- + +## 7. Summary + +In this chapter, you learned the three core paths from design prototype to code: + +1. **Direct multimodal AI conversion**: flexible and fast, ideal for early validation +2. 
**Platform-native capabilities**: higher fidelity and a better fit for professional design workflows +3. **MCP protocol integration**: the most automated path, and likely the direction of future workflows + +::: tip Best Practices +- **If you are new**: start with Path 1 for speed +- **For team collaboration**: use Path 2 to preserve design consistency +- **For maximum efficiency**: experiment with Path 3 and build an automated workflow +- **Use them together**: switch between paths depending on the project stage +::: + +--- + +## References + +- [Figma and MasterGo Basics](../2.1-figma-mastergo/) +- [Let's Build Hogwarts Portraits](../2.5-hogwarts-portraits/) +- [MCP Official Documentation](https://modelcontextprotocol.io/) +- [Figma Make Documentation](https://help.figma.com/hc/en-us/sections/360007453634-Figma-Make) +- [MasterGo AI Tutorials](https://mastergo.com/tutorials) diff --git a/docs/en/stage-2/frontend/2.7-modern-component-library/index.md b/docs/en/stage-2/frontend/2.7-modern-component-library/index.md new file mode 100644 index 0000000..25e1b63 --- /dev/null +++ b/docs/en/stage-2/frontend/2.7-modern-component-library/index.md @@ -0,0 +1,465 @@ +# Upgrade Your Interface with Modern Component Libraries + +In previous lessons, you already learned how to design interfaces with design tools, turn designs into code with an AI IDE, and even complete a full frontend project. But you may have noticed one issue: when you build buttons, forms, and modals from scratch, they work, but they still feel a bit short of a "professional product" - styles are not consistent enough, interaction details are not smooth enough, and adapting to different screens is painful. + +This is exactly the problem that **component libraries** solve. + +A component library is a collection of pre-designed and pre-built UI building blocks. Buttons, inputs, dropdown menus, dialogs, tables... these interface elements appear repeatedly in almost every product. 
A component library has already built and polished them for you through large-scale real usage. You just combine them like Lego bricks and can quickly build a professional-grade interface. + +## What You Will Learn + +1. Understand what a frontend component library is, and why modern development almost always uses one +2. Learn four representative component libraries and the scenarios each one is best at +3. Through three practical scenarios (landing page, product page, admin dashboard), learn how to do Vibe Coding with AI IDE + component libraries +4. Learn how to read component-library docs so you can find suitable components and use them correctly + +## 1. Why Do We Need Component Libraries? + +Imagine furnishing a home. You could build a chair yourself from raw wood, but the common approach is to buy one from IKEA - good design, stable quality, clear instructions, and you just assemble it at home. + +Component libraries are the "IKEA" of frontend development. What they provide is not furniture, but interface parts: + +| Hand-coding everything | Using a component library | +| :--- | :--- | +| You handle styling, interactions, and animation yourself | Ready out of the box, with polished styles and interactions | +| Buttons may look different across pages | Unified global style and automatic consistency | +| Mobile/tablet adaptation needs extra work | Most component libraries already include responsive support | +| Accessibility is easy to miss | Professional libraries already handle keyboard navigation, screen readers, and more | +| Slower development | Faster development, more focus on business logic | + +In short: **component libraries let you spend time on "what to build" instead of "how to draw it."** + +### See It Clearly: Same Requirement, With vs. Without a Component Library + +Words alone are not convincing, so let's run an experiment. In Trae, we submit almost the same prompt twice: once without specifying a library, and once with one. Then compare the generated results. 
+ +**Prompt 1: without a component library** + +```text +Please help me build a data dashboard page for an AI writing assistant, including: +- a top title bar and an export button +- four statistic cards showing user count, active users, document count, and revenue, with trend changes +- one line chart and one pie chart +- a user list table with pagination +- a left navigation sidebar +``` + +Result when run directly in Trae: + + + + +**Prompt 2: use the shadcn/ui component library** + +```text +Please help me build a data dashboard page for an AI writing assistant using the shadcn/ui component library, including: +- a top title bar and an export button +- four statistic cards showing user count, active users, document count, and revenue, with trend changes +- one line chart and one pie chart +- a user list table with pagination +- a left navigation sidebar +``` + +Result when run directly in Trae: + + + + +Same requirement. The only difference is adding the phrase `using the shadcn/ui component library` to the first line of the prompt. But the generated result jumps to a completely different level in visual consistency, interaction detail, and overall polish. That is the "free upgrade" component libraries bring - you only need to add one library name in your prompt. + +## 2. 
Get to Know Four Core Component Libraries + +There are many component libraries (full list in the [appendix](#appendix-more-component-libraries)), but you only need to first understand these four representative ones: + +| Component Library | Framework | One-line Positioning | Website | +| :--- | :--- | :--- | :--- | +| [Ant Design](https://ant.design) | React | Produced by Ant Group; the de facto standard for enterprise back-office systems, with very broad component coverage | ant.design | +| [shadcn/ui](https://ui.shadcn.com) | React | No big npm package install; copy component code directly into your project, built on Tailwind CSS, with maximum customization freedom | ui.shadcn.com | +| [HeroUI](https://heroui.com) (formerly NextUI) | React | Beautiful default styles and smooth animation; great for visually demanding landing pages and product showcases | heroui.com | +| [Material UI](https://mui.com) | React | The most established React component library, implementing Google Material Design, with the most mature ecosystem | mui.com | + +> Vue users also have rich options: [Element Plus](https://element-plus.org) (most popular in China), [Ant Design Vue](https://antdv.com), [Naive UI](https://www.naiveui.com), etc. See the [appendix](#appendix-more-component-libraries). + +Different libraries are good at different scenarios. Next, through three real development scenarios, you will experience how to do Vibe Coding with AI IDE + component libraries. + +To show different styles and strengths, we intentionally use a different library in each scenario. But note: **this is only to let you see more options**. In real projects, you can absolutely stick to one library you like most. For example, if you like shadcn/ui, you can use it for landing pages, product pages, and admin systems. Pick one that looks good to you and feels comfortable to use - that matters most. + +## 3. 
Scenario One: Build a Product Landing Page with HeroUI + +**Scenario**: You built an AI writing assistant and need a beautiful landing page to show product features and attract user sign-ups. The landing page should have strong visual impact, smooth animation, and good mobile appearance. + +**Why HeroUI**: HeroUI has very polished default styles and smooth transitions, which makes it ideal for user-facing showcase pages. + +### 3.1 Create the Project + +```bash +# Use the official HeroUI CLI +npx create-heroui-app@latest ai-writer-landing +cd ai-writer-landing +npm install +``` + + + + +### 3.2 Generate the Landing Page with an AI IDE + +Open your AI IDE (Cursor, Trae, etc.) and enter: + +```text +Please help me build a landing page for an AI writing assistant using the HeroUI component library: + +**Page structure:** +1. Top navigation bar: put Logo and product name on the left, three links "Features", "Pricing", "About" on the right, plus a "Get Started" button +2. Hero section: main headline "Make AI your writing partner", subtitle introducing product value, two buttons "Try Free" and "View Demo", and a product screenshot below +3. Feature section: three-column cards introducing "Smart Continuation", "Style Adjustment", and "Multilingual Translation"; each card should have icon, title, and description +4. Pricing section: three pricing cards (Free, Pro, Team), with Pro highlighted as recommended +5. Bottom CTA: one compelling line of copy and a signup button +6. 
Footer: copyright information and social media links + +**Design requirements:** +- modern and professional look +- support dark mode +- should also look good on mobile +``` + + + + +### 3.3 Key Components the AI Will Use + +In the code generated by AI, you will see these HeroUI components: + +```jsx +import { + Navbar, NavbarBrand, NavbarContent, NavbarItem, + Button, + Card, CardHeader, CardBody, CardFooter, + Divider, + Link, + Chip +} from '@heroui/react' +``` + +Role of each component: + +| Component | Usage | Position in the landing page | +| :--- | :--- | :--- | +| `Navbar` | Top navigation bar | Top of the page, fixed | +| `Button` | Buttons with multiple variants and colors | CTA buttons, nav buttons | +| `Card` | Card container | Feature cards, pricing cards | +| `Chip` | Small badge/label | "Recommended", "Most Popular" markers | +| `Divider` | Separator line | Visual separation between sections | + +### 3.4 Iteration and Refinement + +The first generated version may not be perfect. Continue the conversation with AI: + +```text +Please help me improve the landing page: + +1. Add a gradient color to the main headline, from blue to purple +2. Add a hover lift animation to feature cards +3. Highlight the Pro pricing card with a border and a "Most Popular" badge +4. On mobile, change the nav bar to a hamburger menu (three horizontal lines) +``` + + + + +> **Core idea of Vibe Coding**: You do not need to memorize every component API. Just describe the effect you want in natural language, and AI will choose suitable components and implementation. If something is not ideal, continue iterating in conversation. + +## 4. Scenario Two: Build a Product Interface with shadcn/ui + +**Scenario**: Your AI writing assistant needs a logged-in main interface - document list on the left, editor on the right, toolbar on top. This is a functional product page that needs highly customizable UI. 
+ +**Why shadcn/ui**: shadcn/ui puts component code directly into your project, so you can modify any detail freely. For deeply customized product interfaces, this "own the code" model is the most flexible. + + + + +### 4.1 Create the Project + +```bash +# Create a Next.js project +npx create-next-app@latest ai-writer-app --typescript --tailwind --app +cd ai-writer-app + +# Initialize shadcn/ui +npx shadcn@latest init + +# Add components on demand (do not install everything at once) +npx shadcn@latest add button card input sidebar sheet dialog +``` + +The unique part of shadcn/ui: each time you `add` a component, it copies source code into your project's `components/ui/` directory. You can open these files and edit styles and behavior directly. + +### 4.2 Generate the Product Interface with an AI IDE + +```text +Please help me build the main interface of an AI writing assistant using the shadcn/ui component library: + +**Overall layout:** +- Left side: a collapsible sidebar, about 280px wide: + - Put a "New Document" button at the top + - Below is a document list; each document shows title and last edited time + - Right-click on a document should allow rename or delete +- Right side: main editor area, split into upper and lower parts: + - Top toolbar: editable document title, word count, "AI Continue" button, and an "Export" dropdown + - Bottom editor area: one large text input filling remaining space + +**Interaction details:** +- After clicking "AI Continue", the button shows loading state, and AI-generated text appears at the bottom of the editor (shown character by character like a typewriter) +- On mobile, the sidebar becomes a drawer that slides in from the left +- The currently selected document should be highlighted +``` + + + + +### 4.3 Key Components the AI Will Use + +```tsx +import { Button } from '@/components/ui/button' +import { Input } from '@/components/ui/input' +import { Card, CardContent, CardHeader } from '@/components/ui/card' +import { + 
DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger +} from '@/components/ui/dropdown-menu' +import { + Sheet, + SheetContent, + SheetTrigger +} from '@/components/ui/sheet' +import { + Sidebar, + SidebarContent, + SidebarHeader +} from '@/components/ui/sidebar' +``` + +| Component | Usage | Position in the product page | +| :--- | :--- | :--- | +| `Sidebar` | Collapsible sidebar | Left document list | +| `Sheet` | Mobile drawer | Mobile replacement for sidebar | +| `DropdownMenu` | Dropdown menu | "Export" button, right-click menu | +| `Dialog` | Dialog | Rename and delete confirmation | +| `Button` | Button, supports variants and loading | Various action buttons | +| `Input` | Input field | Document title editing | + +### 4.4 Customize Component Styles + +The advantage of shadcn/ui is that you can modify component source code directly. For example, if you want larger button corner radius: + +```text +Please edit components/ui/button.tsx, +change all default button radius from rounded-md to rounded-xl, +and add a subtle shadow effect to the primary variant. +``` + +AI will directly modify component files in your project, instead of overriding npm package styles - this is the value of shadcn/ui "code ownership." + + + + +## 5. Scenario Three: Build an Admin Dashboard with Ant Design + +**Scenario**: After your AI writing assistant launches, you need an admin backend to inspect user data, manage document content, and process paid orders. The core of admin systems is data display and operation efficiency. + +**Why Ant Design**: Ant Design has the deepest accumulation in back-office systems. Tables, forms, charts, and other business components are ready out of the box, with many built-in enterprise interaction patterns (batch actions, advanced filters, data export, etc.). 
+ + + + +### 5.1 Create the Project + +```bash +# Use Ant Design Pro scaffolding (built-in layout, routing, permissions) +npx create-umi@latest ai-writer-admin +# Choose the Ant Design Pro template +cd ai-writer-admin +npm install +``` + +Or start from scratch: + +```bash +npx create-react-app ai-writer-admin --template typescript +cd ai-writer-admin +npm install antd @ant-design/icons @ant-design/pro-components +``` + +### 5.2 Generate the Admin Backend with an AI IDE + +```text +Please help me build an admin backend for an AI writing assistant using the Ant Design component library: + +**Overall layout:** +- Left side menu: Dashboard, User Management, Document Management, Order Management, System Settings +- Top area shows breadcrumb navigation + +**User Management page:** +- Top area has four stats cards: total users, today's new users, active users, paid users +- Search/filter area: search by username, select registration time range, filter by user status, plus "Search" and "Reset" buttons +- User table: + - Show avatar, username, email, registration time, subscription plan (distinguished by different tag colors), status, operations + - 20 rows per page, with pagination + - Support batch selection, batch disable, or export + - Operation column: view details, edit, disable (disable requires secondary confirmation) +- Clicking "View Details" opens a right-side drawer showing detailed user information and recent document list +``` + + + + +### 5.3 Key Components the AI Will Use + +```tsx +import { PageContainer, ProLayout } from '@ant-design/pro-components' +import { ProTable } from '@ant-design/pro-components' +import { StatisticCard } from '@ant-design/pro-components' +import { + Button, Tag, Badge, Space, Drawer, + Popconfirm, message, Modal +} from 'antd' +import { + UserOutlined, SearchOutlined, ExportOutlined +} from '@ant-design/icons' +``` + +| Component | Usage | Position in backend | +| :--- | :--- | :--- | +| `ProLayout` | Overall admin layout framework 
| Page skeleton (menu + content area) | +| `ProTable` | Advanced table with built-in search, pagination, column settings | User list, document list, order list | +| `StatisticCard` | Data statistic card | Dashboard and page-top overview | +| `Tag` / `Badge` | Status tags | Subscription plans, user status | +| `Drawer` | Side drawer | User details, edit forms | +| `Popconfirm` | Confirmation popover | Dangerous actions like delete/disable | + +### 5.4 Keep Iterating: Add a Dashboard + +```text +Please help me build a dashboard page: + +1. Top four statistic cards: total users, total documents, today's API calls, monthly revenue. Each card should show value and period-over-period change (up or down) +2. Put two charts in the middle: + - Left: user growth line chart for the last 7 days + - Right: pie chart of subscription plan distribution +3. Bottom: recent operation log table, showing time, user, operation type, details + +Use Ant Design components for layout, and you can use Ant Design Charts for charts. +``` + + + + +> **Vibe Coding tip for admin systems**: Admin page structures are relatively fixed (table + search + modal), so they are perfect for batch generation with AI. You can first ask AI to generate one "User Management" page as a template, then say "Based on the same structure, generate a Document Management page." AI will reuse the same layout pattern. + +## 6. Learn to Read Docs: The "Manual" of Component Libraries + +In Vibe Coding, AI writes most code for you. But when the generated result is not correct, or when you want to fine-tune component behavior, **reading the docs** is the fastest way to solve it. + +Take Ant Design as an example. Its docs URL is: `https://ant.design/components/overview-cn` + +Standard docs workflow: + +1. **Clarify the need**: for example, "I need row selection in a table." +2. **Search in docs**: search "Table" and enter the table component page +3. 
**Check examples**: each component has multiple live examples; find the "selectable rows" example +4. **Copy code**: copy the example code into your project +5. **Check API table**: at the bottom of the page, find the full config for `rowSelection` + +> You can also send docs links directly to your AI IDE: "Please refer to the rowSelection API in https://ant.design/components/table-cn and help me add batch selection to the user table." Giving AI the docs link makes generated code more accurate. + +Quick docs links for each library: + +| Component Library | Docs URL | +| :--- | :--- | +| Ant Design | `https://ant.design/components/overview-cn` | +| shadcn/ui | `https://ui.shadcn.com/docs/components` | +| HeroUI | `https://heroui.com/docs/components` | +| Material UI | `https://mui.com/material-ui/all-components/` | +| Element Plus | `https://element-plus.org/zh-CN/component/overview.html` | + +## 7. Summary + +The three practical scenarios cover the most common frontend development needs: + +| Scenario | Recommended component library | Core strengths | +| :--- | :--- | :--- | +| Landing page / showcase page | HeroUI | Beautiful default styles, smooth animation, strong visual impact | +| Product functional page | shadcn/ui | Full code control, flexible deep customization | +| Admin system | Ant Design | Rich business components, tables/forms ready out of the box | + +Vibe Coding workflow summary: + +1. Choose a suitable component library based on the scenario +2. Use an AI IDE to describe the page structure and interactions you want +3. The AI generates a first version of the code, and you preview the result +4. Continue iterating with natural language +5. When you get stuck on details, read the component library's docs + +### Practice + +Pick one scenario below and complete it from scratch with an AI IDE and a component library: + +1. Use HeroUI to build a showcase landing page for a project you built earlier (for example, Hogwarts Portraits) +2. 
Use shadcn/ui to build the main interface for a note app (sidebar + editor) +3. Use Ant Design to build a simple content-management backend (article list + new-article form) + +--- + +## Appendix: More Component Libraries + +Besides the four core libraries covered in the main text, the frontend ecosystem has many excellent component libraries. Below they are grouped by framework to help you choose by project needs. + +### Vue Ecosystem + +| Component Library | Stars | Description | Suitable Scenarios | +| :--- | :--- | :--- | :--- | +| [Element Plus](https://element-plus.org) | ~27k | Vue 3 enterprise component library from the Ele.me team, most widely used in China, excellent Chinese ecosystem | Back-office admin systems | +| [Vuetify](https://vuetifyjs.com) | ~41k | Most popular Vue Material Design component library, 80+ components, complete docs | Google-design-style projects | +| [Ant Design Vue](https://antdv.com) | ~21k | Vue 3 component library based on Ant Design system, unified design specification | Enterprise back-office systems | +| [Naive UI](https://www.naiveui.com) | ~18k | Written in TypeScript, highly theme-customizable, no CSS preprocessor dependency | Projects with unique design needs | +| [Quasar](https://quasar.dev) | ~27k | One codebase for SPA, SSR, PWA, mobile, and desktop apps | Cross-platform projects | +| [Vant](https://vant-ui.github.io/vant) | ~24k | Lightweight mobile component library from Youzan, covering common e-commerce needs | Mobile H5 pages | +| [PrimeVue](https://primevue.org) | ~14k | 90+ components, multiple themes (Material, Bootstrap, etc.) 
| Projects needing rich components and multi-theme support | +| [Arco Design Vue](https://arco.design/vue) | ~3k | Produced by ByteDance, high component quality, built-in dark mode | Back-office products | +| [TDesign Vue Next](https://tdesign.tencent.com/vue-next) | ~2k | Produced by Tencent, unified design language, covers common desktop scenarios | Tencent ecosystem or enterprise projects | + +### React Ecosystem + +| Component Library | Stars | Description | Suitable Scenarios | +| :--- | :--- | :--- | :--- | +| [Material UI (MUI)](https://mui.com) | ~95k | Long-established implementation of Google Material Design, most complete components, most mature ecosystem | Rapid enterprise app building | +| [Ant Design](https://ant.design) | ~94k | Produced by Ant Group, many high-quality business components, dominant among Chinese developers | Enterprise back-office systems | +| [shadcn/ui](https://ui.shadcn.com) | ~83k | Copy code into project instead of npm install, based on Radix UI + Tailwind CSS, fully controllable | Highly customized projects | +| [Chakra UI](https://chakra-ui.com) | ~39k | Focus on developer experience, concise API, built-in accessibility support | Rapid prototype development | +| [Mantine](https://mantine.dev) | ~28k | 100+ components and 50+ hooks, including advanced components like date pickers and rich text editors | Teams needing an all-in-one out-of-the-box solution | +| [Headless UI](https://headlessui.com) | ~27k | Unstyled component library from Tailwind Labs, supports both React and Vue | Best with Tailwind CSS | +| [HeroUI](https://heroui.com) | ~24k | Based on Tailwind CSS + React Aria, beautiful defaults, smooth animation | Projects pursuing visual quality | +| [Radix UI](https://www.radix-ui.com) | ~17k | Unstyled primitive component library focused on accessibility and behavior; foundational layer of shadcn/ui | Building custom design systems | + +#### shadcn/ui Extension Ecosystem + +Beyond the general component libraries above, 
the shadcn/ui ecosystem has also produced many extension libraries based on the same philosophy, offering differentiated choices for specific scenarios. These extensions also use the "copy code into project" model, giving developers full source-code control. + +| Component Library | Description | Suitable Scenarios | +| :--- | :--- | :--- | +| [Aceternity UI](https://ui.aceternity.com) | 200+ production-grade components, featuring glow cards, gradient text, 3D earth, and other signature visual components | High-polish landing pages, SaaS products | +| [Tailark UI](https://tailark.com) | Collection of marketing website blocks, including frequent modules like product showcases, testimonials, and CTA buttons | Marketing landing pages, product websites | +| [UI Tripled](https://ui.tripled.work) | Dynamic interaction components based on Framer Motion, including modal, navigation, card animation | Creative tools, personal portfolios | +| [Neobrutalism UI](https://neobrutalism.dev) | Neo-brutalism style with thick lines, high contrast, and bold colors | Personalized brand websites, creative projects | +| [REUI](https://reui.io) | 967+ component composition patterns from real business scenarios | Enterprise backends, complex forms | +| [Cult UI](https://cult-ui.com) | More refined interaction and visual polish, including compound components like data tables and filter panels | High-quality commercial products | +| [Kibo UI](https://kibo-ui.com) | Advanced business components such as color picker, rich text editor, file upload | Admin systems, tool products | +| [Kokonut UI](https://kokonutui.com) | 100+ components + 7+ complete templates, fresh and minimalist style | SaaS sites, blogs, e-commerce | +| [Commerce UI](https://ui.stackzero.co) | Specialized for e-commerce scenarios, including product cards, shopping cart, checkout forms | E-commerce platforms | +| [shadcnblocks](https://shadcnblocks.com) | 1373 UI blocks + 13 complete templates, most comprehensive resources | 
All scenarios | +| [Shoogle](https://shoogle.dev) | Aggregated search platform for shadcn/ui ecosystem | Quickly finding resources | +| [Discover All Shadcn](https://allshadcn.com) | Aggregated resource navigation | Quickly finding resources | + +> **Why choose shadcn/ui extensions?** These extensions inherit the shadcn/ui "code ownership" philosophy, while adding deep customization for specific scenarios. In the Vibe Coding era, they help you quickly find components that match your design goals, break away from homogenized mainstream UI patterns, and build more differentiated products. diff --git a/docs/en/stage-2/index.md b/docs/en/stage-2/index.md index d37c4e8..772d0ef 100644 --- a/docs/en/stage-2/index.md +++ b/docs/en/stage-2/index.md @@ -1,78 +1,88 @@ -# Full-Stack Development +# Junior Developer -Welcome to the **Full-Stack Development** stage! Here, you will dive deep into full-stack development, mastering frontend componentization, database design, backend API development, and deployment. +Welcome to the **Junior Developer** stage! Here, you will go deeper into full-stack development and learn modern frontend workflows, database design, backend APIs, deployment, and AI-powered product building. 
## What You Will Learn ### Frontend Development -Master modern frontend development and learn to use component libraries and design tools: +Master modern frontend development and learn how to use design tools, component libraries, and AI-native UI workflows: + + + -### Backend & Full-Stack +### Backend Development Learn API design, database management, and application deployment strategies: - @@ -83,12 +93,12 @@ Learn API design, database management, and application deployment strategies: Consolidate your full-stack development skills through practical projects: @@ -98,29 +108,24 @@ Consolidate your full-stack development skills through practical projects: ### AI Capabilities Extension - ## Who Is This For -- Developers with some programming foundation who want to systematically learn full-stack development +- Developers with some programming foundation who want to systematically learn modern full-stack development - Learners transitioning from product manager to full-stack engineer - Junior to intermediate developers who want to master modern development tools and workflows - Entrepreneurs who want to independently develop complete products ## Prerequisites -- Complete the "Novice & Product Prototype" stage, or have equivalent basic knowledge +- Complete the "Novice & Product Prototype" stage, or have equivalent foundational knowledge - Understand basic HTML/CSS/JavaScript concepts -- Have preliminary knowledge of AI programming tools +- Have a basic understanding of AI coding tools -Ready to dive deep into full-stack development? Click the left navigation to begin learning! +Ready to move from product prototype to real full-stack delivery? Use the left navigation to start learning. 
diff --git a/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md b/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md new file mode 100644 index 0000000..fa4ebf7 --- /dev/null +++ b/docs/en/stage-3/ai-advanced/3.a1-rag-introduction/index.md @@ -0,0 +1,938 @@ +As large language models (LLMs) are adopted more widely, enterprises face a very practical problem: how can a model answer questions accurately when those questions depend on internal documents, real-time data, or domain-specific knowledge? After all, a model's training data is limited and time-bounded, so it cannot cover company-specific business knowledge or constantly updated information. + +One intuitive idea is this: since context windows keep getting larger, from 8K to 128K and now beyond one million tokens, why not just stuff the relevant documents into the prompt and let the model answer from those materials directly? + +However, being able to process long context and being able to deliver correct answers stably, efficiently, and controllably in enterprise scenarios are two very different things. Blindly relying on long context brings a series of severe challenges, including exploding cost, diluted attention, and stale knowledge updates. + +To solve these pain points, a technique called Retrieval-Augmented Generation, or RAG, emerged. Before the model generates an answer, RAG first retrieves precise external knowledge. Compared with simply expanding the context length in a brute-force way, RAG meets enterprise requirements for factual accuracy and fresh knowledge at lower cost, with higher accuracy and stronger controllability. It has therefore become a key foundation for building trustworthy AI applications. + +In this tutorial, we will systematically explain what RAG is, trace the background behind its emergence and its core principles, and then explore its evolution from basic forms to advanced forms, along with where it may go next. 
+ +# What You Will Learn in This Lesson + +- The core value of RAG: deeply understand how it addresses the central long-context problems of cost, attention, and knowledge freshness +- How RAG works: see through concrete examples how it completes the full loop from retrieval to generation +- The evolution of RAG: from basic Naive RAG to Advanced RAG and then to Modular RAG +- Model selection for RAG: understand how to evaluate and choose the three key model types, Embedding, Rerank, and LLM +- Enterprise RAG practice: learn the full-chain construction guide from data preprocessing to system deployment and evaluation +- RAG evaluation and optimization: understand core metrics, mainstream frameworks, and continuous improvement methods +- Frontier trends in RAG: explore how RAG is combining with agents, multimodality, and other emerging techniques + +# What You Will Gain + +After completing this tutorial, you will build a systematic beginner-level understanding of RAG technology. You will not only know what it is, but also why it works. You will also gain a clear blueprint for how to evaluate, choose, and design an efficient, reliable, and controllable RAG system that meets enterprise requirements, laying a solid foundation for building real enterprise-grade RAG applications. + +# 1. Why RAG Is Needed + +Retrieval-Augmented Generation (RAG) is one of the most important technical approaches in generative AI today. Its basic idea is simple: before asking a large model to generate an answer, the system first retrieves information related to the user's question from an external knowledge base, and then passes both the retrieved information and the original question to the model so the model can answer on top of real materials. That external knowledge base can be an enterprise's internal policies, process documents, and product knowledge, or an industry database, regulatory corpus, standards library, and so on. 
+ +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image1.png) + +At this point, a natural question appears: if large models can already "answer questions directly," why add another layer called Retrieval-Augmented Generation? Especially now that context windows are getting larger and larger, it can seem as if simply handing all relevant material to the model ought to solve most needs. + +The real difference is that "being able to produce an answer" and "being able to continuously, stably, and controllably produce the right answer in a real business environment" are two completely different things. If you rely only on a model's parameter memory, or only on dumping large amounts of documents into a long context, at least three typical problems still appear in enterprise use. + +1. Cost and efficiency problems: + Even as context windows keep expanding, the idea of dumping all documents into the context at once is still impractical in real systems. The central contradiction shows up in two places: +2. Inference cost is strongly positively correlated with context length. The longer the context, the more inference cost rises, almost linearly and sometimes even superlinearly. For a single call, 8K tokens and 200K tokens live in completely different price and latency ranges, and long context has a much higher cost threshold. + + ![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image2.png) + + > In meaning, context is the background information and conversation history the model "refers to" when answering a question. In technical terms, it is the total token sequence fed into the model for one inference, such as system and user instructions, message history, and retrieved passages. + > + > A "context window" is the capacity limit for that input. In mainstream large-model architectures today, such as Transformers, those tokens participate in attention computation at every layer. 
Once the window becomes longer and the token count increases, compute and cost grow faster than linearly: for standard self-attention, the attention computation scales roughly quadratically with the number of tokens. + +3. A large amount of compute is wasted. Most tasks need only a very small amount of information that is highly relevant to the current question. Stuffing the full document set into the context creates serious idle and wasted computation, lowers system throughput, slows response speed, and eventually harms user experience. +4. Attention and focus problems: + A large model may be able to "cover" ultra-long context, but it cannot use every segment with equal quality. Once context length crosses a certain threshold, the model begins to show obvious attention bias: +5. Attention decay: the model's attention to early and middle parts of the context gradually weakens, and it tends to rely more on text it read later, so early critical information can be effectively ignored. +6. Information interference: the model can easily be dragged off course by irrelevant, repetitive, or even conflicting information inside the context. The final answer may sound logically coherent while still drifting away from the core question, making accuracy hard to guarantee. + Without a retrieval stage to filter and rank relevance, the longer the context becomes, the harder it is to keep the answer focused on the truly key evidence. The advantage of long context can be fully canceled out by information interference. +7. Knowledge freshness and controllability problems: + If all knowledge is stored entirely in model parameters, or manually copied into prompts, two unavoidable defects appear: +8. Knowledge updates are difficult: once the knowledge changes, such as policy changes, product iterations, or price updates, you either need to retrain or fine-tune the model, which is costly and slow, or maintain prompt templates manually, which is also costly and prone to human error. +9. 
Traceability is poor: when a model answers, it is often difficult to locate the exact pieces of evidence from either black-box parameters or long prompts. This makes compliance audits, risk explanations, and other tasks that require clear decision grounds extremely difficult. + +Under these real constraints, the advantage of RAG becomes much clearer. Its core approach is to locate relevant and reliable information before generation, so the model answers only from necessary knowledge. Knowledge can be stored independently in an external knowledge base, making it easier to update and manage. At the same time, generated results can include cited sources, improving interpretability and trustworthiness. Even if context windows keep growing in the future, RAG will still enable efficient knowledge management and use at relatively low cost, supporting enterprise-grade knowledge applications whose process is observable and whose behavior is traceable. + +From the perspective of enterprise requirements, compared with a traditional LLM that relies only on its internal parameters, RAG mainly solves the following real-world deployment problems: + +1. Freshness: + Traditional models usually do not know new regulations, products, or workflows that appeared after their training cutoff, but RAG can directly read the latest policy documents, business databases, and knowledge bases. Without frequent retraining, answers can stay synchronized with the latest business state. +2. Specialization: + In vertical domains such as healthcare, chemicals, or finance, general-purpose models often do not understand deeply enough or speak precisely enough. After connecting enterprise-owned domain documents and industry standards, answers can be grounded in authoritative materials and become much closer to real business practice. +3. 
Hallucination: + By requiring answers to stay grounded in retrieved passages and provide citations, the system can reduce unsupported fabrication at the mechanism level, making "sounds true" much closer to "is actually true." +4. Explainability and auditability: + Pure parameter-based models often cannot answer, "Which rule was this conclusion derived from?" RAG lets each answer be traced back to a specific policy clause, business document, or historical case. That helps business staff inspect and correct answers and gives audit, risk, and compliance teams the traceability they need. +5. Compute cost and resource efficiency: + Making a model memorize all enterprise knowledge in its parameters usually means a larger model and higher inference cost. RAG stores most knowledge outside the model in vector stores and document stores and retrieves it on demand, allowing enterprises to get broader coverage and more accurate detail even with smaller models and limited compute. + +Therefore, for enterprises that want to use large models in real business scenarios over the long term, stably and controllably, RAG is not an optional enhancement. It is almost an essential foundational technology for building a high-quality enterprise knowledge application system. + +# 2. What RAG Is + +The core idea of RAG, Retrieval-Augmented Generation, is to let a large model answer questions not only with static knowledge learned during training, but also with up-to-date and reliable information pulled from an external knowledge base at runtime. + +In a typical RAG system, the user's question is not sent directly to the large model. Instead, a retrieval module first finds the most relevant document passages from the enterprise knowledge base, then combines those passages with the original question into a complete context, and finally gives that to the model to generate an answer. 
This "retrieve first, generate second" pattern allows the model to reason from real reference material instead of only guessing from what it remembers in its parameters. We can look at a typical case: + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image3.png) + +1. Indexing stage + + In the indexing stage, the system first processes raw material such as internal enterprise documents, web pages, and reports. It splits them into smaller semantic chunks, then uses an embedding model to generate vector representations for each chunk and builds an index. Later, when a user question arrives, the system can quickly find the most semantically similar chunks in vector space. + + In the diagram, this corresponds to the purple "Indexing" area in the upper right. The path from "Documents" through "Chunks / Vectors" to "embeddings" shows documents being chunked, converted into vectors, and written into the index. More concretely: + + - Documents are divided into a set of semantically coherent chunks, each of which may correspond to a short news passage, explanation, or analysis. + - Each chunk is converted into a high-dimensional vector by the embedding model and stored in the vector index. + - This index supports similarity-based retrieval later, preparing a knowledge base the system can consult when answering questions. + +2. Retrieval stage plus answer generation from retrieved results + + After the user asks a question, the system first retrieves relevant content from the index, then sends the question and retrieved text together to the large model to generate an answer. In the figure, the key areas from upper to lower and right to left correspond exactly to this full flow. 
+ + (1) User input question: the yellow Input - Query area + + > "How do you evaluate the fact that OpenAI's CEO, Sam Altman, went through a sudden dismissal by the board in just three days, and then was rehired by the company, resembling a real-life version of 'Game of Thrones' in terms of power dynamics?" + > + > "How do you evaluate the fact that OpenAI CEO Sam Altman was suddenly dismissed by the board and then rehired by the company just three days later, making the power struggle resemble a real-life version of Game of Thrones?" + + This large block of text is the content inside the "Query" box in the diagram, corresponding to the user's natural-language question. The system vectorizes that question and uses it to search the upper-right index for related document chunks. + + (2) Retrieved relevant documents: the pink Relevant Documents area at the lower right + + After retrieval, the system gets several document chunks most related to the question. In the diagram, they are shown as three chunks: + + > "Sam Altman Returns to OpenAI as CEO, Silicon Valley Drama Resembles the 'Zhen Huan' Comedy" + > "Sam Altman returns as OpenAI CEO, and this Silicon Valley drama resembles a court-intrigue comedy." + > + > "The Drama Concludes? Sam Altman to Return as CEO of OpenAI, Board to Undergo Restructuring" + > "Is the drama ending? Sam Altman will return as CEO of OpenAI, while the board will be restructured." + > + > "The Personnel Turmoil at OpenAI Comes to an End: Who Won and Who Lost?" + > "OpenAI's personnel turmoil comes to an end: who won and who lost?" + + (3) Combine the prompt and generate the answer: the blue LLM / Combine Context and Prompts area + + The system then combines the original user question and the retrieved chunks into a complete prompt and sends it to the model. The dashed box in the lower middle of the figure shows a prompt example: + + > "Question: + > How do you evaluate the fact that the OpenAI's CEO, ... dynamics? 
+ > + > Please answer the above questions based on the following information: + > Chunk 1: + > Chunk 2: + > Chunk 3:" + > + > "Question: + > How do you evaluate the power struggle in the OpenAI CEO incident? + > + > Please answer the above question based on the information below: + > Chunk 1: + > Chunk 2: + > Chunk 3:" + + (4) Answer comparison with and without RAG: the gray and yellow Output - Answer areas in the lower left + + Finally, the model generates an answer based on the provided information. The figure also compares outputs with and without RAG. Without RAG, the model has no external material and can only give a vague response, corresponding to the gray box: + + > "... I am unable to provide comments on future events. Currently, I do not have any information regarding the dismissal and rehiring of OpenAI's CEO ..." + + With RAG, the model can use the retrieved news and analysis to produce a much more informative answer, corresponding to the yellow box: + + > "... This suggests significant internal disagreements within OpenAI regarding the company's future direction and strategic decisions. All of these twists and turns reflect power struggles and corporate governance issues within OpenAI ..." + +The example above shows the full flow of a typical RAG system and helps us understand its core stages and how information moves through them. But many important technical details remain inside a black box: how exactly is vector matching performed, and how should the prompt be organized so the model can use the retrieved content more effectively? These details largely determine real RAG quality. Next, we will go deeper into RAG's internal mechanism and break it down step by step, from vectorization principles and similarity computation to prompt engineering. + +# 3. How RAG Works + +We can break it down through a simple question-answering example built on a knowledge base about "apple." 
+ +## 3.1 Document Vectorization Stage + +Suppose we have a simplified knowledge base containing these three document passages: + +1. Passage A: Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California. +2. Passage B: Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health. +3. Passage C: Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry. + +When we process these documents with an embedding model, such as OpenAI's `text-embedding-ada-002` or an open-source BGE model, each passage is converted into a high-dimensional vector, often with 768, 1024, or 1536 dimensions. + +> A vector is essentially an array made of many numeric values. Each dimension corresponds to a semantic feature of the text. For example, the vector for "cat" may contain dimensions related to mammal, household pet, and furry. The final combination of values captures the semantic meaning of the text so the computer can "understand" relationships between texts. + +Simplified examples, with real vectors being much higher-dimensional: + +- Vector for passage A, about Apple's founding: `[0.85, -0.23, 0.41, -0.56, 0.12, 0.78, ...]` +- Vector for passage B, about apples as fruit: `[-0.12, 0.95, -0.34, 0.67, -0.89, 0.05, ...]` +- Vector for passage C, about the iPhone launch: `[0.79, -0.18, 0.52, -0.61, 0.23, 0.81, ...]` + +These vectors then need to be stored in a vector database, such as Pinecone, Weaviate, or FAISS, for later retrieval and recall. + +> A database is a system that stores and manages data in a structured way, enabling organized storage and efficient retrieval. Common examples include contact lists and e-commerce product catalogs. +> +> A vector database is a specialized kind of database. 
Unlike traditional databases, which store text, tables, and other ordinary data structures, a vector database is designed specifically to store vectors, that is, high-dimensional numeric arrays, and it is optimized for similarity search in AI scenarios. + +## 3.2 User Query, Retrieval, and Response Stage + +Once the knowledge base has been vectorized and stored, a RAG system can support real-time user queries. When a user asks a question, the system executes a continuous flow: it first converts the question into a vector, then uses similarity computation to retrieve the most relevant information from the knowledge base, and finally uses those passages as the basis for answer generation. We can illustrate this process with three concrete queries. + +### Query 1: "When was Apple Inc. founded?" + +At the query-vectorization stage, the question is converted by the embedding model into a semantic vector, for example `[0.82, -0.21, 0.38, -0.58, 0.15, 0.76, ...]`. This numeric pattern is highly similar to the stored vector for passage A, the one about the company's founding. + +The system then performs similarity retrieval, Top-K with K = 2, by computing cosine similarity between the query vector and all document vectors in the knowledge base. The result looks like this: + +- Similarity with passage A, the founding passage: 0.97, highly relevant +- Similarity with passage C, the iPhone launch passage: 0.88, relevant because it is also about the company +- Similarity with passage B, the fruit nutrition passage: 0.12, almost irrelevant + +> Top-K is a common selection strategy in vector retrieval. It means ranking all matches from highest to lowest similarity and keeping the top K results. K = 2 means the system retains only the top two document vectors by similarity and filters out lower-ranked ones, so the next stage generates the answer only from the two most relevant document passages. + +The results filtered by similarity are called recall results. 
The system returns the Top-2 passages as evidence: + +1. Passage A, similarity 0.97: "Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California." +2. Passage C, similarity 0.88: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry." + +At the answer-generation stage, the system builds a complete structured input by placing the recalled content inside the reference information section and sending it together with a system prompt: + +```text +[System Prompt] +You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user. +If the reference information contains the answer, answer directly based on it. +If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information. +Please indicate which information point your answer is based on. + +[Retrieved Context] +Apple Inc. was founded on April 1, 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, and its headquarters are in Cupertino, California. +Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry. + +[User Query] +When was Apple Inc. founded? +``` + +After receiving this structured input, the LLM follows the system instruction and treats the retrieved context as the only trustworthy source for answering. Its final response would look like this: + +> According to the provided reference information, Apple Inc. was founded on April 1, 1976. [Basis: Information 1] + +### Query 2: "What are the benefits of eating apples?" + +At the query-vectorization stage, this question is converted into a semantic vector such as `[-0.08, 0.92, -0.31, 0.71, -0.85, 0.08, ...]`. 
Its numerical pattern is highly similar to the stored vector for passage B, the one about apple nutrition. + +The system again performs Top-K similarity retrieval with K = 2 and computes cosine similarity: + +- Similarity with passage B, fruit nutrition: 0.95, highly relevant +- Similarity with passage C, iPhone launch: 0.18, almost irrelevant +- Similarity with passage A, company founding: 0.15, almost irrelevant + +The system returns the Top-2 passages as evidence: + +1. Passage B, similarity 0.95: "Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health." +2. Passage C, similarity 0.18: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry." This is only weakly related and would often be filtered by a threshold in practice. + +The complete structured input is then built as follows: + +```text +[System Prompt] +You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user. +If the reference information contains the answer, answer directly based on it. +If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information. +Please indicate which information point your answer is based on. + +[Retrieved Context] +Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health. +Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry. + +[User Query] +What are the benefits of eating apples? +``` + +The LLM's final response would then look like this: + +> According to the provided reference information, apples are rich in vitamin C and dietary fiber, and eating apples helps digestion and immune-system health. [Basis: Information 1] + +### Query 3: "How is the weather today?"
+ +At the query-vectorization stage, this question becomes a semantic vector related to weather and meteorology, for example `[0.10, -0.05, 0.30, -0.12, 0.21, 0.08, ...]`. In semantic space, this vector is far away from all document vectors about apples, whether the company or the fruit, so no significant similarity appears. + +The system again performs Top-K retrieval with K = 2. Because the question topic is unrelated to the knowledge base, overall similarity scores are all very low: + +- Similarity with passage B, fruit nutrition: 0.18, extremely low +- Similarity with passage C, iPhone launch: 0.10, almost irrelevant +- Similarity with passage A, company founding: 0.08, almost irrelevant + +Top-K still returns the top-ranked K results, but in this case those results do not provide effective evidence. In practice, the system often applies a minimum similarity threshold and directly returns empty recall, that is, no valid results, to reduce irrelevant interference. + +The two returned passages would still be: + +1. Passage B, similarity 0.18: "Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health." +2. Passage C, similarity 0.10: "Apple Inc. launched the first iPhone in 2007, fundamentally changing the smartphone industry." + +The full input would then be: + +```text +[System Prompt] +You are a professional question-answering assistant. Please answer strictly according to the "reference information" provided by the user. +If the reference information contains the answer, answer directly based on it. +If the reference information does not contain the answer, explicitly tell the user that "the question cannot be answered based on the currently available materials," and do not fabricate information. +Please indicate which information point your answer is based on. + +[Retrieved Context] +Apples are a fruit rich in vitamin C and dietary fiber, which helps digestion and immune-system health. +Apple Inc. 
launched the first iPhone in 2007, fundamentally changing the smartphone industry. + +[User Query] +How is the weather today? +``` + +The LLM would first judge whether the reference information contains direct weather or real-time meteorological information. After confirming that it does not, it would follow the system prompt's instruction and state that the question cannot be answered: + +> The currently available materials cannot answer the question "How is the weather today?" because the reference information only contains content related to apples, fruit nutrition, and Apple Inc. products, and does not contain weather information or real-time meteorological data. [Basis: No weather-related information exists in the retrieved context] + +From these three examples, we can see the key to the RAG dialogue stage. The system prompt defines the LLM's role and response rules, retrieved evidence provides concrete and trustworthy material, and the user's question defines the task objective. This structured-input pattern is exactly what lets RAG effectively guide and constrain an LLM that might otherwise hallucinate, turning it into a system that produces stable and reliable answers. It ensures that the model is used for understanding and organizing existing information rather than inventing unsupported information. + +# 4. The Evolution of RAG + +RAG did not originate in the era of large models. Earlier research already contained prototypes of the same idea. From a historical perspective, RAG arose from recognition of the limitations of traditional LLMs. Early large language models depended mainly on pretraining data, and that data became fixed once training finished. For example, models such as GPT-3 had knowledge cutoff dates tied to when the training data was collected and could not obtain later knowledge. Retraining or fine-tuning LLMs for specific domains also required large resources and specialized expertise, making it expensive and hard to iterate quickly.
+ +The roots of RAG can be traced back to the DrQA framework in 2017, which first attempted to combine retrieval with language models. A major breakthrough then came in 2020 with Dense Passage Retrieval, or DPR, which used pretrained neural models for semantic retrieval instead of traditional word-frequency-based methods such as TF-IDF and BM25. In 2021, RAG was formally proposed and systematized, becoming a standard way to address the knowledge-cutoff and hallucination problems in LLMs. + +Broadly speaking, the evolution of RAG can be divided into three stages: + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image4.png) + +## 4.1 First-Generation RAG: Naive RAG + +Naive RAG is the most basic form of RAG. From an engineering perspective, it follows a very direct three-step flow: + +1. Document preprocessing and indexing. Raw documents are cleaned, split into fixed-length text chunks, encoded into vectors with an embedding model, and written into a vector database. +2. Similarity-based retrieval. The user's natural-language question is encoded into a vector, and the system performs a Top-K similarity search over the vector store. +3. Simple retrieval-augmented generation. The retrieved chunks are directly concatenated with the original question to form a long prompt, which is sent to the LLM for answer generation. + +The value of this stage is that it verified, with a very low barrier, that "retrieve before answering" actually works. Compared with relying only on the model's internal memory, it already significantly reduces knowledge-cutoff issues and some hallucinations, which is why it played an important role in early prototypes, demos, and introductory tutorials. + +However, the limitations of first-generation RAG are also obvious. First, the chunking strategy is usually crude. Most systems simply split by fixed length, which can cut a coherent semantic paragraph in the middle or mix multiple topics inside one chunk. 
This hurts retrieval accuracy and also makes comprehension harder for the LLM. Second, the retrieval signal is too simple. Ranking usually depends only on vector similarity and does not use richer structured clues such as keywords, timestamps, source credibility, or access permissions. Third, retrieval results are barely governed at all: noisy, repetitive, and even contradictory chunks can be stuffed into the context unchanged, causing large amounts of low-value information to occupy an already limited context window. + +In short, the first generation solved the question of whether retrieval is needed. But on the questions of how to retrieve better, and how to use retrieved information more reasonably, it still remained at a rather primitive stage. + +## 4.2 Second-Generation RAG: Advanced RAG + +As RAG moved from demos into real business scenarios, the requirements for stability, controllability, and output quality rose sharply. The second generation, usually grouped under the broad name Advanced RAG, still follows the pattern of retrieve first and generate second, but it introduces systematic refinement both before and after retrieval. In other words, the system is no longer satisfied with merely retrieving something. It now aims to store the right things properly, ask the right questions clearly, and govern the retrieved context carefully. + +Before retrieval, the focus is on storing and asking well: + +- On the indexing side, chunking evolves from fixed-length splits to semantically aware chunking and hierarchical indexing. The system may chunk along chapter, subsection, paragraph, or sentence boundaries, combined with sliding windows and multi-granularity index structures. +- Each document chunk can carry rich metadata such as source, timestamp, author, topic, and document type, providing more dimensions for later filtering and ranking. 
+- On the query side, the user's original question can be rewritten, expanded, or decomposed through techniques such as Query Rewrite, Multi-Query, Sub-Query decomposition, and Step-back Prompting, transforming vague or conversational user queries into forms that retrieval can understand better. + + > 1. Query Rewrite + > + > The core idea is to transform the user's vague, colloquial, or nonstandard query into a normalized expression that the retrieval system can understand more easily, supplementing key information and resolving ambiguity. + > + > - For example, "How do I check tomorrow's weather in Beijing?" might be rewritten into something more standardized such as "Query tomorrow's full-day real-time weather in Beijing." + > - Or "Recommend good movies" may be rewritten, after looking at user history, into "Recommend high-rated 2024 suspense movies." + > + > 2. Multi-Query + > + > The system generates multiple semantically related but differently angled queries from the original question to reduce missed results and cover latent needs the user did not explicitly state. + > + > 3. Sub-Query + > + > For compound questions that contain several goals, the system splits them into smaller, simpler sub-queries so retrieval can match each need precisely. + > + > 4. Step-back Prompting + > + > The system first generates a more abstract, higher-level question, then uses that to guide retrieval direction, reducing bias caused by being too narrowly focused on details in the original question. + +After retrieval, the focus is on governing what was retrieved: + +- A dedicated rerank model or even an LLM can rerank candidate documents so the most important and question-relevant content enters the context first. + > A rerank model is a key component in an information-retrieval pipeline. 
It performs second-stage ranking on candidate results returned by the recall phase, using stronger semantic understanding, often based on Transformer architectures, to fix semantic ranking errors from the first stage and move the results most aligned with user needs further forward. +- Retrieved passages can be filtered, deduplicated, and compressed to remove clearly irrelevant or highly repetitive chunks, reducing the tendency of long-context systems to ignore useful information in the middle. +- When necessary, light model fine-tuning can make the LLM more likely to answer from retrieval evidence and include explicit citations or sources. + +Overall, Advanced RAG is no longer focused only on whether retrieval is necessary or whether something can be retrieved. It instead addresses three larger challenges: whether the truly critical passages can be located precisely, whether the context handed to the large model is concise, well-structured, and easy to use efficiently, and whether the whole system remains stable and reliable in the presence of noise, conflict, or multi-source information needs. + +Large amounts of experimental and engineering evidence show that Advanced RAG significantly outperforms Naive RAG on answer accuracy, hallucination suppression, system robustness, and explainability. That is why it has gradually replaced traditional basic approaches and become the mainstream industrial paradigm for building RAG systems today. + +## 4.3 Third-Generation RAG: Modular RAG + +In complex enterprise applications, requirements often span multiple domains. In those cases, a simple linear flow of retrieve, rerank, and generate is often not enough: + +1. The same system may need to support simple FAQs, long report generation, code retrieval, and database calls. +2. It may need to connect vector stores, full-text retrieval, relational databases, knowledge graphs, and external search engines at the same time. +3. 
It may need to preserve user preferences and historical decisions over multiple rounds, while also applying compliance checks and answer traceability. + +Against this background, RAG began evolving toward a modular system shape. Modular RAG is no longer viewed as a fixed pipeline. It is treated instead as a set of pluggable, replaceable, and composable function modules that can be orchestrated as needed. Typical modules include: + +1. Query understanding and routing + This module handles intent recognition, question rewriting, subtask decomposition, and path selection. It decides whether a request should rely mainly on internal knowledge, external retrieval, or a specific tool or database. +2. Multi-source retrieval and fusion + This module connects vector databases, full-text search, structured databases, and knowledge graphs simultaneously, queries them, and merges and reranks their results into a unified evidence set. +3. Memory and personalization + This module maintains long-term user profiles, short-term session memory, and domain knowledge caches so the system can continuously accumulate and use historical information. +4. Task adaptation and governance + This module loads different adapters for different tasks, constrains output format, tone, and style, and governs outputs through fact checking, risk filtering, and citation alignment. + +In short, traditional RAG often ends after one retrieval round plus one generation round. Modular RAG breaks that single-flow pattern. If the system discovers during generation that information is still insufficient, it can proactively trigger new retrieval rounds and even move back and forth multiple times between retrieval and generation to complete a more complex task. + +Going further, the model can learn to make its own decisions: answer directly from internal knowledge or short context when confidence is high, and launch retrieval or external tool calls only when uncertainty is high. 
That improves efficiency and saves resources while preserving quality. For heavily underspecified or incomplete queries, the model can even generate a hypothetical intermediate answer or draft document first, then use that as a clue for further retrieval, progressively approaching reliable sources. + +At this stage, RAG is no longer just a simple component that attaches a few reference passages to a large model. It is becoming the central knowledge-orchestration layer inside enterprise intelligent applications, coordinating multiple data sources, multiple tools, and multiple tasks. + +# 5. From Demo to Enterprise-Grade RAG + +From the perspective of enterprise engineering, building a RAG system cannot be limited to retrieval-augmented generation alone. The material above is still closer to a demo-level introduction. In real business scenarios, data is often noisy and inconsistent in format, so more effort must be invested into preprocessing, cleaning, and ingestion, and model selection must be handled carefully at every key point. + +A complete enterprise-grade RAG system can usually be divided into three core modules: layout analysis and knowledge ingestion, knowledge-base construction, and RAG-based question-answering service. Across the full technical chain, several key model-selection decisions appear, including the embedding model, rerank model, and LLM. Only with sensible technical choices at each stage can the system achieve strong overall results. + +1. Layout analysis and local knowledge-file reading + + This module converts local knowledge assets in different formats into text usable for retrieval. Inputs may include PDFs, TXT, HTML, Word, Excel, and PPT files, as well as scanned image files such as PNG and JPG, or even audio recordings. 
+ + The system needs to parse each format appropriately, perform layout analysis and structural extraction for text documents, distinguish titles, main body, tables, headers, and footers, and restore a sensible reading order. It performs OCR on image files and ASR on speech, finally converting everything into relatively clean knowledge text while retaining basic metadata such as file name, chapter, page number, and timestamp for later chunking and indexing. + +2. Knowledge-base construction: chunking, embeddings, and indexing + + After obtaining cleaned knowledge text, the system performs chunking, splitting long documents into semantically coherent blocks of suitable length, usually by paragraph, title structure, or sliding window, while preserving each chunk's source and metadata. + + Then it uses the chosen embedding model, such as `text-embedding-3-small`, Sentence Transformers, or BGE, to calculate vector representations for each chunk and build a vector index using tools such as Faiss, Milvus, or managed vector-search services. At that point, a knowledge base that supports fast semantic retrieval has been created. + +3. RAG-based question answering: recall, reranking, concatenation, generation + + In the online QA stage, the user sends a query. The system embeds it into a query vector, retrieves a batch of the most similar text chunks from the vector index, and treats that as a coarse ranking stage. Then it can use a rerank model such as a BGE reranker or even an LLM acting as a reranker to score query-document pairs again and keep only the Top-K documents that are truly most relevant as the knowledge context. + + Next, together with a carefully designed system prompt such as "Please answer strictly based on the following materials," the system concatenates the user query and retrieved document passages and sends the merged prompt to the LLM. 
The model then generates the final answer from those retrieved pieces of evidence and, when needed, includes citations or sources. + +## 5.1 Model Selection + +Next we focus on model selection. A complete RAG system usually involves three core model categories: embedding models, rerank models, and large language models. Each has its own role, and together they form the full path from retrieval to answer generation. The embedding model converts text into searchable semantic vectors, the rerank model refines initial retrieval results, and the LLM generates the final answer based on the selected knowledge context. + +### 5.1.1 Embedding Models + +In a RAG system, the job of the embedding model is to convert text, such as user queries and knowledge-base content, into high-dimensional vectors. Semantically similar texts are placed closer together in vector space, allowing the system to locate related knowledge quickly by similarity. Choosing the right embedding model is therefore one of the most critical steps in building a high-performance RAG system because it directly determines recall quality. + +To choose a strong model, it helps to use a systematic benchmark. One of the most widely used is MTEB, the Massive Text Embedding Benchmark. + +MTEB provides a unified and objective evaluation framework for many embedding models. Through eight major task categories and 56 datasets, it evaluates performance across retrieval, clustering, classification, reranking, text matching, semantic similarity, and more. A model's overall MTEB score reflects the generality and robustness of its vector representations and can serve as an important reference for model selection. 
The latest ranking can be checked on the Hugging Face MTEB leaderboard: + +[HuggingFace MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image5.png) + +Although there are many models on the leaderboard, you do not need to master all of them. In practice, choosing the embedding model bundled by a major model provider, or using a cloud-served model that many people have already validated, is usually a safe choice. You can also filter the leaderboard by category or language in the sidebar: + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image6.png) + +When filtering embedding models, two parameters matter especially because they directly affect RAG performance: dimension and context length. + +Dimension is the dimensionality of the vector output, such as 128, 768, or 1536. It roughly reflects how many semantic features the vector can express. Higher-dimensional vectors can capture richer semantic detail and stronger discrimination. For example, a 768-dimensional vector can represent "apple" from hundreds of angles such as variety, taste, and origin, making it suitable for professional scenarios like healthcare or law that need precise retrieval. Lower dimensions reduce computation and storage cost and improve retrieval speed, making them suitable for large-scale general scenarios with high concurrency and strong real-time requirements. + +Context length is the maximum text length the embedding model can process in one pass, measured in tokens. One English token is roughly three quarters of a word, and one Chinese token is roughly one Chinese character. Anything longer than the maximum is truncated. This directly determines whether the model can fully understand the text. If important information is lost because the length is too short, retrieval accuracy drops sharply. For short user queries and short QA pairs, 512 to 1024 tokens is often enough. 
For longer texts such as papers and reports, you usually need 2048 tokens or more. + +Below is a comparison of several common embedding models. In practice, you need to choose by balancing cost and performance. There is no universally best model, only the most suitable model after comparing several options in your own use case. + +| Model Name | Model Scale | Core Strength | Suitable Scenarios | +| :--- | :--- | :--- | :--- | +| OpenAI `text-embedding-3-large` | Closed API | Long-term leader on MTEB, mature and stable | Cloud API scenarios that prioritize extreme performance and have enough budget | +| `jina-embeddings-v2` | Supports long text up to 8K context | Strong for long-document retrieval through asynchronous encoding design | Document analysis, legal compliance, academic retrieval | +| `multilingual-e5-large` | Large scale | Classic multilingual option | Cross-lingual RAG, international products, multilingual support systems | +| `Qwen/Qwen3-Embedding-8B` | 8B parameters, up to 4096 custom dimensions | Former top multilingual MTEB performer, strong on long text, multilingual tasks, and code | High-precision Chinese-English RAG, long-document analysis, code retrieval | +| `Qwen/Qwen3-Embedding-4B` | 4B parameters | Strong balance of performance and efficiency | Large-scale production RAG systems | +| `Qwen/Qwen3-Embedding-0.6B` | 0.6B parameters | Suitable for edge devices | Resource-constrained, speed-first scenarios | +| `BAAI/bge-m3` | Supports hybrid retrieval, dense plus sparse plus multi-vector | Strong on multilingual benchmarks such as MIRACL | Complex multilingual scenarios that need hybrid retrieval | +| `BAAI/bge-large-zh-v1.5` | Large scale | Stable Chinese RAG baseline with strong community validation | Pure Chinese projects with shorter documents | +| ZhipuAI `Embedding-3` | Closed cloud API | Supports custom dimensions from 256 to 2048 | Chinese-focused applications preferring cloud APIs | + +### 5.1.2 Rerank Models + +In a RAG system, the
rerank model is responsible for finely reranking initial retrieval results. It takes the user query and candidate documents as input and computes an exact relevance score for each query-document pair. The higher the score, the better the match. Therefore, adding a rerank model on top of embedding-based recall is a key step for improving retrieval precision. + +For embedding models, we can use benchmarks like MTEB. For rerank models, one useful reference is Agentset's reranker leaderboard: + +[Reranker Leaderboard](https://agentset.ai/rerankers) + +The Agentset benchmark first retrieves the 50 most relevant candidate results from a large document store using FAISS, then asks the rerank model under evaluation to rerank those 50 documents. The benchmark pays attention to both ranking quality and latency. In practical applications, pursuing precision at the expense of speed hurts user experience, whereas pursuing speed at the expense of ranking quality harms usefulness. + +Agentset also introduces an ELO scoring mechanism. For each query, GPT-5 acts as a judge and compares the ranked outputs of two different rerank models, deciding which one places truly relevant documents in a more sensible order. After large numbers of such pairwise comparisons, models that win more often receive higher ELO scores, providing an intuitive overall performance signal. + +The benchmark also uses two complementary groups of metrics: + +- `nDCG@5/10`, which focuses on whether relevant documents are placed near the front and therefore reflects ranking precision +- `Recall@5/10`, which focuses on whether all relevant documents can be found and therefore reflects coverage + +Together these metrics provide a more complete picture of rerank performance. + +Still, in practice, you do not need to select rerank models only from a leaderboard. Industrial usefulness and leaderboard score are not always the same thing.
A practical approach is to start from the rerank models recommended by your cloud vendors or default rerank APIs provided by major model vendors, or to test a model family you are already using, such as a matching Qwen rerank model. + +### 5.1.3 LLMs + +After semantic retrieval by the embedding model and refined filtering by the rerank model, the relevant document passages are combined with the user's original question into a prompt. The LLM then performs reading comprehension, information integration, and natural-language generation to output a coherent, accurate answer that fits the context. + +At the implementation level, there are two main ways to use LLMs in RAG: + +1. Privately deployed large models. + These are suitable for scenarios that care about data privacy, controllable cost, or deep customization. Mainstream open models such as Qwen, Llama, and GLM perform well in RAG tasks. For example, Qwen2.5 in the 7B or 14B range offers good instruction-following and Chinese understanding while keeping resource use modest, making it suitable for local enterprise deployment. Models such as Kimi, MiniMax, and DeepSeek can also be considered according to specific business needs. +2. Cloud API large models. + These fit scenarios that prioritize fast launch, elastic scaling, and continuous model upgrades. Major providers such as OpenAI, Anthropic, Google, Alibaba, and ZhipuAI all offer stable API services. These models generally have strong language understanding and generation ability and can synthesize answers well in RAG scenarios. + +When selecting cloud models, several points matter: whether answer quality is accurate and fluent, whether price is reasonable, whether latency is acceptable, and whether the context window is large enough to hold multiple retrieved documents. In practice, you should compare several candidates on your own data and see which one gives the most complete and accurate answers.
If cost is a concern, a useful approach is to combine large and small models: use cheaper small models for simple questions and reserve expensive large models for difficult cases. Since models update quickly, it is also wise to retest candidates periodically. + +For broad conversation and QA ability, LMSYS Chatbot Arena, now LMArena, is one of the most widely recognized evaluation references: + +[LMSYS Chatbot Arena (LMArena)](https://lmarena.ai/) + +It uses blinded pairwise human comparisons to rank models. The ranking offers a useful first filter, but in actual RAG selection it should only be a starting point. In specialized domains such as medicine, law, and finance, general leaderboard ranking can diverge substantially from real performance on your business data. + +Best practice for LLM selection is to build a small but representative test set containing 20 to 30 typical business questions and evaluate candidate models through the full end-to-end RAG pipeline rather than looking only at isolated model benchmarks. Questions such as whether to use reasoning models or non-reasoning models, or which model size best balances quality and speed, are all best answered through real testing on your own use case. + +## 5.2 Execution Frameworks + +In real engineering practice, you usually do not need to build an entire RAG system from zero. A number of mature open-source frameworks already exist, each with its own strengths in architecture, modular integration, and development efficiency. Enterprises can choose according to their own technical reserves and business scenarios. + +Common framework types include: + +**Low-code or visual platforms** + +- [Dify](https://dify.ai): provides an intuitive visual interface for quickly building RAG applications, making it suitable for nontechnical teams or rapid prototype validation. It includes built-in multi-model access, workflow orchestration, and prompt management. 
+- [Coze](https://www.coze.cn/): an AI bot development platform from ByteDance that offers zero-code visual construction. It integrates deeply with ByteDance model services, supports a plugin marketplace, scheduled tasks, and multichannel publishing, making it suitable for consumer-facing assistants or internal enterprise bots. +- [n8n](https://n8n.io/): an open-source node-based workflow automation platform. In RAG scenarios, it can orchestrate complex business logic and connect preprocessing, vector database operations, model calls, and follow-up actions such as email sending or ticket updates into one automated flow. +- [RAGFlow](https://ragflow.io/): focuses on deep layout analysis and knowledge extraction and performs well on complex documents such as multi-column PDFs and table-heavy materials. +- [FastGPT](https://fastgpt.io/en): a Chinese open-source solution integrating knowledge-base management, dialogue orchestration, and application publishing, with strong Chinese documentation and suitability for fast deployment of Chinese RAG applications. + +**Code frameworks and development libraries** + +The tools below usually have implementations in different backend languages. You can choose the corresponding language version for your application stack. + +- [LlamaIndex](https://www.llamaindex.ai/): a Python framework designed specifically for RAG, with rich connectors, index structures, and query engines. Its modularity makes it suitable for deeply customized retrieval strategies or integration with many data sources. +- [LangChain](https://www.langchain.com/): a general LLM application framework where RAG is only one use case. Its strength is its rich ecosystem and component coverage, including support for complex agents and workflow orchestration, though its learning curve is steeper. + +If the team's technical reserves are limited and speed matters most, low-code platforms such as Dify, Coze, or FastGPT are good first choices. 
If you need deep customization, special data-source integration, or detailed performance tuning, LlamaIndex and LangChain offer more flexibility. In practice, a hybrid route is also common: use a low-code platform for rapid feasibility validation, then move to code frameworks for production deployment and optimization. Most of these frameworks also support rapid integration with mainstream embedding, rerank, and LLM models, letting you combine them flexibly using the model-selection principles discussed above. + +## 5.3 Effect Evaluation + +For enterprises deploying RAG systems, the biggest challenge is often not building the system but tuning it. Production-grade RAG contains two nondeterministic stages, retrieval and generation, so traditional software testing is not enough. That is why building a scientific evaluation system, or RAG evaluation, is so important. + +### 5.3.1 Beginner Example: LLM-Based RAG Evaluation + +To help build an intuitive understanding of RAG evaluation, we can look at a simple automated pipeline based on the idea of LLM-as-a-judge: + +https://huggingface.co/learn/cookbook/rag_evaluation + +The process usually contains three key steps: + +- First, synthesize an evaluation dataset by sampling documents from the knowledge base and asking an LLM to generate high-quality question-answer pairs, then filter them by relevance and groundedness to form a benchmark set. +- Second, run the RAG system on each question in that test set and collect the generated answers. +- Third, automate scoring by calling another LLM as a judge, comparing the generated answers with reference answers, and giving quantitative scores for dimensions such as accuracy and completeness. + +A simple example: + +1. Problem generation. Suppose the knowledge base contains a product manual line saying, "This device supports wireless charging and has a 5000mAh battery." 
We ask one model to act as an exam setter and generate a question such as, "What is the battery capacity of this device?" The standard answer is "5000mAh." +2. Problem solving. We send that question to the RAG system, which retrieves related material and answers, for example, "The device has a 5000mAh battery." +3. Grading. We ask another model to act as the grader by comparing the question, the generated answer, and the reference answer, using a prompt such as, "Judge whether the generated answer is correct. Output only correct or incorrect." + +By running this process at scale, we can compute metrics such as accuracy. This forms a practical loop of evaluate, optimize, and reevaluate. + +If you want deeper detail on RAG evaluation, including metric definitions, framework usage, and benchmark datasets, two useful survey papers are: + +- [https://arxiv.org/pdf/2504.14891](https://arxiv.org/pdf/2504.14891), *Retrieval Augmented Generation Evaluation in the Era of Large Language Models: A Comprehensive Survey* +- [https://arxiv.org/pdf/2405.07437](https://arxiv.org/pdf/2405.07437), *Evaluation of Retrieval-Augmented Generation: A Survey* + +### 5.3.2 Evaluation Metrics + +RAG evaluation fundamentally revolves around two questions: can the retrieval module find the right material, and can the generation module produce a high-quality answer from that material? Accordingly, the evaluation system is divided into retrieval evaluation and generation evaluation, supplemented by LLM-as-a-judge scoring. + +#### Retrieval Evaluation: recall accuracy and ranking quality + +The retrieval module is the first gate in a RAG system. Its evaluation focuses on three dimensions: whether it finds the right things, whether it finds enough of them, and whether it ranks them well. + +**Basic recall quality metrics** + +The classic basic metrics are Recall@K, Precision@K, and F1: + +- **Recall@K** measures the proportion of relevant documents recovered in the top K results. 
If five relevant documents exist and three are found in the top 10, Recall@10 is 60 percent. This tells us how broad retrieval coverage is. +- **Precision@K** measures the proportion of top K results that are truly relevant. If three of the top 10 are relevant and seven are not, Precision@10 is 30 percent. This reflects retrieval accuracy. +- **F1** is the harmonic mean of Recall and Precision and balances the two. + +These metrics are useful for quickly diagnosing baseline recall problems. If Recall is low, relevant documents were not found at all. If Precision is low, retrieval noise is too high. + +**Ranking quality metrics** + +Finding relevant documents is only the first step. It is even more important to put the most relevant ones near the front. For that we look at MRR, NDCG@K, and MAP: + +- **MRR, Mean Reciprocal Rank**, averages, over all test queries, the reciprocal of the rank position of the first relevant document. If the first relevant document for a query appears in position 3, that query's reciprocal rank is 1/3. MRR is especially suitable for scenarios where one correct answer is enough. +- **NDCG@K, Normalized Discounted Cumulative Gain**, considers both graded relevance and position discount. It not only asks whether a document is relevant, but how relevant it is, and it rewards highly relevant documents that appear early. +- **MAP, Mean Average Precision**, is sensitive to the positions of all relevant documents and reflects overall ranking quality. + +In actual engineering, a common combination is Recall@K plus MRR@K. For example, if Recall@10 is 80 percent but MRR@10 is only 0.3, relevant documents are being found but buried too deep, which suggests reranking needs improvement. + +When needed, a Coverage metric can also be added to monitor knowledge-base coverage and reveal systematic blind spots. + +#### Generation quality evaluation: accuracy and factual faithfulness + +Retrieval provides the raw material.
The next question is whether the generation module can produce a high-quality answer from those materials. The core dimensions here are answer accuracy and faithfulness to the retrieved evidence. + +**Exact match and text similarity** + +The simplest metric is **EM, Exact Match**, which requires the generated answer to match the reference answer exactly. This is suitable for fixed-form, uniquely correct fact questions such as dates or headquarters locations, but it is too strict because different but equally correct surface forms may fail to match. + +That is why n-gram-overlap metrics such as **ROUGE**, **BLEU**, and **METEOR** are also commonly used. They score generated answers by comparing word overlap with reference answers. ROUGE-L pays attention to longest common subsequences, BLEU comes from machine translation and emphasizes exactness, and METEOR adds synonym and stemming considerations. + +To overcome the limits of pure word overlap, we can also use **BERTScore** or direct vector similarity. These use pretrained semantic representations and therefore tolerate surface variation better. + +**Factual faithfulness and hallucination detection** + +For RAG systems, answer-reference similarity is not enough. The more important question is whether the answer is actually grounded in the retrieved documents or whether it hallucinates unsupported content. + +That is why metrics such as **Hallucination rate** and **Faithfulness** are important. A second LLM can act as a fact checker and inspect the generated answer sentence by sentence, judging whether each claim can be supported by the retrieved documents. For high-stakes domains such as healthcare, law, and finance, this type of metric is especially important, and some enterprises even enforce hallucination thresholds as production release criteria. + +#### LLM-as-a-Judge: multi-dimensional scoring + +Every automatic metric has limits. 
Most surface-form metrics cannot fully capture semantic quality or overall usefulness. That is where LLM-as-a-judge becomes especially valuable. + +The basic approach is to feed the question, retrieved documents, system answer, and reference answer into a strong independent model, such as GPT-4 or Claude, and ask it to score across dimensions such as: + +- question relevance +- information completeness +- factual faithfulness +- overall correctness + +The strength of an LLM judge is that it can make a more human-like holistic judgment. Of course, judge prompts still need careful design and calibration against human-labeled examples to keep the scoring consistent and reliable. + +#### Building a practical metric combination + +With so many metrics available, teams often wonder which ones to use. A practical recommendation is to start with a compact combination and expand gradually: + +- For retrieval, begin with Recall@K plus MRR@K +- For generation, choose one or two baseline metrics from EM, ROUGE-L, and BERTScore according to task type +- For overall evaluation, introduce an LLM judge focused on relevance, completeness, and faithfulness + +Then iterate through a loop of evaluation, problem diagnosis, strategy adjustment, and reevaluation. + +### 5.3.3 Evaluation Frameworks + +As RAG has developed rapidly, both academia and industry have produced many strong evaluation frameworks. These frameworks not only package common metrics, but also offer standardized datasets, benchmark procedures, and end-to-end workflows. + +#### A basic classification of frameworks + +We can roughly divide RAG evaluation frameworks into three categories: + +- **Research frameworks**, which focus on academic exploration and fine-grained diagnosis. Examples include FiD-Light and Diversity Reranker. +- **Benchmark frameworks**, which provide standardized test sets and workflows for comparing systems horizontally. These include frameworks such as RAGAS, ARES, RGB, MultiHop-RAG, and CRUD-RAG. 
+- **Tooling frameworks**, which emphasize engineering usability and integration with development frameworks. Examples include TruEra RAG Triad, LangChain Benchmarks, and RECALL. + +In recent years, evaluation frameworks have become more specialized. For example, medicine has MedRAG, law has LegalBench-RAG, and finance has its own domain-specific frameworks. These domain frameworks often provide not only specialized datasets but also specialized metrics such as medical accuracy or legal citation relevance. + +In practice, a good rule of thumb is: + +- If you need a baseline quickly, start with a more general framework such as RAGAS. +- If you are diagnosing a specific problem, choose a more targeted framework. +- If you are in medicine, law, finance, or another professional domain, prefer domain-adapted frameworks where possible. +- Prefer actively maintained tools with strong documentation and responsive communities. + +Commonly recommended tools in the community include Ragas, Continuous Eval, TruLens-Eval, the evaluation features inside LlamaIndex, Phoenix, DeepEval, LangSmith, and OpenAI Evals. + +### 5.3.4 Evaluation Benchmarks + +The importance of evaluation benchmarks is often underestimated. Many teams start assessing a RAG system with only a handful of hand-written test questions, then discover that real online performance differs sharply from offline impressions. The root cause is that they lack representative and systematic evaluation data. 
+ +A benchmark that supports system iteration well usually has three core characteristics: + +- representativeness, meaning it covers high-frequency user questions, boundary cases, and abnormal inputs +- standardization, meaning question and answer formats, difficulty levels, and scoring rules are consistent +- evolvability, meaning the benchmark can be updated as system capability and business needs evolve + +For most enterprises, because business scenarios are unique, the final answer is usually to build their own evaluation datasets. + +- Start by extracting real user questions from business logs and sampling them by type, frequency, and difficulty. +- For simple cases, let domain experts annotate directly. For more complex questions, let a strong LLM generate candidate answers first, then have experts revise them. +- Besides the answer itself, label metadata such as related documents, answer type, and difficulty level. +- Update the dataset periodically with new hard cases discovered online. + +If resources are limited and you need a fast baseline, public benchmarks are still a useful starting point. As of 2025, many public benchmarks exist for both general and vertical scenarios: + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image7.png) + +When choosing among them, first clarify the goal. Are you establishing a baseline, or validating the system before launch? Then check whether the benchmark covers the scenarios and difficulty profile you care about. For time-sensitive domains such as news or finance, make sure the benchmark includes time-sensitive tests. + +In practice, combining your own in-domain dataset with public benchmarks is often the most robust path because it keeps evaluation close to real business needs while also preserving some horizontal comparability. + +# 6. 
Deep Dive: Learning from Competitions and Open Tutorials (Optional) + +The principles and baseline implementation above are enough to help you build a usable prototype, but they are still some distance away from solving the harder problems that appear in production. If you want to understand more practical and battle-tested RAG techniques, one of the most efficient ways is to study winning competition solutions and strong open tutorials. These solutions often concentrate the best practices discovered by strong teams after repeated attempts in real scenarios. + +The examples below are representative rather than exhaustive. When you meet a specific problem in practice, such as PDF parsing, multimodal retrieval, or low-latency optimization, it is often effective to search for competitions related to that problem and study the technical reports and open code from winning teams. + +## 6.1 Semantic Cache: optimizing high-frequency queries + +Hugging Face provides a semantic-cache implementation built on top of the Chroma vector database: + +[https://huggingface.co/learn/cookbook/semantic_cache_chroma_vector_database](https://huggingface.co/learn/cookbook/semantic_cache_chroma_vector_database) + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image8.png) + +Background: Most tutorial RAG systems are built for single-user testing. But once deployed to production, the system may receive dozens or thousands of repeated queries, for example support users repeatedly asking how refunds work. If every repeated query still triggers vector retrieval and an LLM call, latency and cost rise quickly. A semantic cache layer can sharply reduce pressure on the original data sources while preserving answer quality. + +This design uses a two-layer retrieval architecture. The base layer stores the original knowledge base in Chroma, using a dataset such as MedQuad as an example and assigning each entry a unique ID for precise reference. 
The cache layer is built on FAISS using a FlatL2 index. The semantic cache sits between the user query and Chroma, rather than caching the LLM's final answer directly. That design matters because directly caching answers can break personalized answer requirements such as "explain this in simple language." + +The cache system uses the `all-mpnet-base-v2` SentenceTransformer to generate query vectors and uses Euclidean distance, with a threshold of 0.35, to judge whether queries are similar. When the cache is full, controlled by the `max_response` parameter, the oldest entry is removed using FIFO. Cache data can also be saved into JSON files for cross-session reuse. + +In small-scale testing, a first query such as "How do vaccines work?" took 0.057 seconds when fetched from Chroma, while a similar query served from cache took only 0.016 seconds. In large production scenarios, this approach can produce 90 to 95 percent performance optimization in high-repeat environments and significantly reduce vector-store and API cost. + +## 6.2 Unstructured Data Processing: unified parsing for multi-format documents + +Another Hugging Face tutorial shows how to use the Unstructured library to build a full pipeline for unstructured document processing: + +[https://huggingface.co/learn/cookbook/rag_with_unstructured_data](https://huggingface.co/learn/cookbook/rag_with_unstructured_data) + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image9.png) + +Background: In enterprise scenarios, knowledge is often scattered across PDFs, PowerPoint decks, EPUBs, HTML pages, and many other formats. Traditional preprocessing methods either support only one format or lose crucial structural information such as tables and title hierarchy during conversion. That makes it difficult for the RAG system to understand and retrieve the content correctly.
+ +This solution first downloads multi-format test documents, such as a Canadian pesticide handbook PDF containing many tables and a University of Florida citrus IPM PowerPoint file containing charts and multi-level headings. It then uses Unstructured's Local Runner for parsing. The configuration includes a processor config, a partition config that can optionally use API partition mode for stronger OCR, and a local config defining input paths. Parsed documents are converted into JSON containing typed elements such as body text, titles, and tables. + +The system then uses `chunk_by_title`, sets a max length of 512 characters, and merges consecutive fragments shorter than 200 characters to preserve semantic coherence. During conversion into LangChain Document objects, complex metadata fields are filtered to fit Chroma. The vector stage uses the `BAAI/bge-base-en-v1.5` embedding model, together with a 4-bit quantized `Llama-3-8B-Instruct` and a LangChain RetrievalQA chain to build a complete RAG system. + +The resulting system can handle multi-format documents accurately. For questions such as "Are aphids a pest?" it can extract key facts from the parsed documents and generate answers grounded in the relevant material. This is especially useful for enterprise knowledge bases that need to process many document types. + +## 6.3 Enterprise document QA: high-precision and traceable RAG + +The championship solution of the Enterprise RAG Challenge shows how to build a production-grade RAG system under strict time and precision requirements: + +- [https://abdullin.com/ilya/how-to-build-best-rag/](https://abdullin.com/ilya/how-to-build-best-rag/) +- [https://hustyichi.github.io/2025/07/03/rag-complete/](https://hustyichi.github.io/2025/07/03/rag-complete/) + +Background: Contestants had to parse 100 real enterprise annual-report PDFs in 2.5 hours, each report with up to 1000 pages and containing complex financial tables, multi-column layouts, and charts. 
After parsing, the system had to answer 100 precise business questions with explicit answer types, such as yes-no, company names, exact numerical indicators, or executive titles, and it had to cite page numbers as evidence. + +The winning team chose IBM's open-source Docling as the PDF parser because it performed best on complex tables and multi-column text. They improved the Docling code so it could output JSON and Markdown-plus-HTML with metadata and especially improved table parsing. To accelerate processing, they rented RTX 4090 GPUs and finished the 100-report parse in 40 minutes. + +Text chunking used 300-token chunks with 50-token overlap and recursive splitting to preserve semantic coherence. To avoid cross-company contamination, each company had its own FAISS vector store using an `IndexFlatIP` index. Retrieval then followed three stages: retrieve Top-30 chunks by vectors, deduplicate by parent pages because multiple chunks may come from the same page, and rerank pages with GPT-4o-mini. The final ranking combined the two scores as a weighted sum, with a weight of 0.3 on the vector-retrieval score and 0.7 on the LLM reranking score. + +Generation used different prompt templates for different answer types. For numeric questions, such as annual revenue, the system used a five-step analysis process to ensure indicator matching, unit consistency, and cross-checking. Outputs were structured to include analysis process and page references for traceability. + +The system won two awards and took first place on the leaderboard. An important observation was that even smaller models such as Llama 8B outperformed more than 80 percent of participants, while Llama 3.3 70B came close to GPT-4o-mini, showing that a good system design can successfully balance accuracy, efficiency, and cost.
+ +## 6.4 AIOps scenario: intelligent handling of mixed text-and-image data + +The EasyRAG project in an AIOps RAG competition focused on QA for operations scenarios: + +[http://blog.csdn.net/hustyichi/article/details/143323746](http://blog.csdn.net/hustyichi/article/details/143323746) + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image10.png) + +Background: Operations engineers often need to read technical documents that include not only text but also monitoring charts, system architecture diagrams, and performance curves. For example, when diagnosing a system problem, the answer to "What should I do when CPU utilization exceeds 80 percent?" may be scattered between text descriptions and monitoring graphs. Traditional text-only RAG cannot understand chart trends and values, so answers remain incomplete. + +The indexing stage used an improved SentenceSplitter with 1024-token chunks and 200-token overlap. A key innovation was adding metadata such as knowledge-base paths and file paths to each chunk, which improved recall by 2 percent. For image data, the system first used PaddleOCR to extract text from charts and screenshots, then used a multimodal model, GLM-4V-9B, to generate natural-language descriptions of the image, for example describing a CPU usage line peaking at 90 percent in the afternoon. Both the OCR text and image description were then indexed together. + +Retrieval used a two-path BM25 plus vector strategy for broad recall. BM25 covered chunk retrieval and path retrieval, helping filter irrelevant documents by file path, while vector retrieval used `gte-Qwen2-7B-instruct`. Reranking used `bge-reranker-v2-minicpm-layerwise`, and a 28-layer setting performed best in experiments. + +Answer generation used a two-step strategy: first generate a draft from the Top-6 documents to maximize information coverage, then optimize the answer with the Top-1 most relevant document to emphasize the core answer. 
+ +To handle long-text scenarios, such as a complete operations manual with hundreds of pages, the system also implemented BM25-based context compression, splitting documents into sentences, scoring sentence similarity to the query, and concatenating only the most relevant sentences. At 50 percent compression, this method achieved 86.48 percent accuracy in only 7.7 seconds and outperformed tools such as LLMLingua. + +## 6.5 Multi-source data fusion: collaboration between structured and unstructured knowledge + +The winning solution in the KDD Cup 2024 Meta RAG challenge showed how to integrate unstructured web content and structured knowledge graphs: + +- [https://blog.csdn.net/m0_59164520/article/details/143694213](https://blog.csdn.net/m0_59164520/article/details/143694213) +- https://arxiv.org/pdf/2410.00005 + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image11.png) + +Background: Task 1 required retrieval summarization from five web pages. Task 2 added a mock API representing a structured knowledge graph, enabling direct access to things like movie databases and entity relationships. Task 3 raised the difficulty by using fifty web pages plus the mock API to answer more complex queries, such as identifying Nolan-directed films with box office greater than 500 million dollars. Every query had to finish within 30 seconds. + +For Task 1, the winning team built a refined web-processing pipeline. They used BeautifulSoup to extract page text and ParentDocumentRetriever to manage parent-child chunk relationships, using 200-token child chunks for retrieval and 500 to 2000-token parent chunks for generation. The embedding model was `bge-base-en-v1.5`, the vector store was Chroma, and reranking used `bge-reranker-v2-m3`. The team also supplemented movie and finance data from public datasets and fine-tuned `Llama-3-8B-instruct` with LoRA on training data that included invalid questions and reference answers. 
+ +For Tasks 2 and 3, the key innovation was prioritizing the knowledge graph. The system defined standardized API calls such as `get_person` and `get_movie`, with filtering and sorting support. It first called the knowledge graph API and only fell back to web retrieval if the graph results were missing or invalid. This improved both speed and answer accuracy. + +Because the system prioritized the knowledge graph and used structured output formats, hallucination was clearly reduced. If the graph could provide a deterministic answer directly, the system returned it without a generative step. If web retrieval was required, the answer had to follow strict citation and stepwise reasoning rules. + +The solution won first place in all three tasks. The main lesson is that in enterprise scenarios containing both structured and unstructured data, retrieval strategy should be designed according to data type: use deterministic structured data first and treat unstructured sources as supplements. + +Across these practical cases, several shared principles appear repeatedly: + +- choose caching, retrieval, and generation strategies according to the business scenario +- design dedicated parsing and indexing paths for different formats and modalities +- treat hybrid retrieval plus reranking as a standard configuration +- use task-specific prompting and structured outputs to improve accuracy and traceability + +These lessons from real competitions and open projects are valuable references when building stronger enterprise RAG systems. + +# 7. Broad Exploration: The Future Evolution of RAG (Optional) + +Once you have learned the practical skills and optimization methods of RAG, you can already improve system performance in concrete scenarios. But understanding only local engineering tricks is not enough if you want a wider grasp of where RAG is heading. We also need to look at broader evolutionary directions. 
+ +RAG is now rapidly breaking beyond the traditional retrieve-text-chunks-then-generate pattern. In this section we focus on several of those paths: moving from chunk retrieval to graph-structured retrieval, combining images and audio into multimodal RAG, improving long-document handling through vectorized late chunking, and the way RAG is gradually evolving into an agent-oriented system. + +## 7.1 Graph RAG: reshaping deep retrieval with relationship networks + +Related research: + +- [https://arxiv.org/pdf/2410.05779](https://arxiv.org/pdf/2410.05779) +- [https://arxiv.org/pdf/2502.11371](https://arxiv.org/pdf/2502.11371) +- https://arxiv.org/pdf/2404.16130 + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image12.png) + +Traditional RAG works by finding text passages similar to the question, which is like picking out the few paragraphs that look most relevant from a pile of material. That works well for direct fact lookup. But if a question requires connecting multiple documents and combining different clues, performance drops. + +For example, a doctor might ask, "Based on these cases and the latest treatment guidelines, how should we evaluate the benefits and risks of a certain drug for elderly patients?" Or a project team might ask, "Looking across the past two years of requirements documents, review records, and online issue reports, which part of our system architecture fails most often?" Questions like these are not about finding a single sentence. They require identifying the people, objects, events, and relationships scattered across multiple materials and forming a complete picture. + +Graph RAG builds that picture proactively. The system uses a large model to identify key entities from text, such as people, organizations, functional modules, events, and data, together with their relationships, such as causality, dependence, change, and contradiction. It then builds a knowledge network that grows as more material is added. 
Through automatic grouping, closely related entities and relationships are organized into themes, and each theme can be summarized in advance. When a user asks a question, the system no longer searches only for text passages that look similar. It first finds the most relevant entities and local graph structure, expands through related topic groups, and then gives the analysis path, node descriptions, and source passages together to the LLM for reasoning. + +Under this framework, Graph RAG and traditional RAG complement one another. Traditional RAG remains strong for detail questions whose answers can be found in one step. Graph RAG is closer to how a human researcher thinks: first organize the overall structure and themes, then fill in evidence, and finally produce a conclusion with logic and conditions. Existing comparisons show that in multi-hop reasoning tasks, Graph RAG often covers more critical content and provides a broader perspective. Flexible combination of the two approaches is often better than using only one. + +## 7.2 Multimodal RAG + +Related research: + +- https://arxiv.org/pdf/2502.08826 + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image13.png) + +Real-world data is never only text. Engineers diagnosing server failures need to look at temperature curves, device screenshots, and logs together. Doctors making diagnoses need CT or MRI images, test reports, and electronic medical records at the same time. Traditional text RAG can at best retrieve phrases such as "temperature anomaly" or "suspected lung nodule," but it struggles to connect those descriptions to the actual chart trend or image lesion shape, and it cannot reverse-search documents or knowledge from images, audio, or video. + +Multimodal RAG solves this problem of different modalities being unable to "see" one another. Its core is cross-modal semantic alignment. 
The system uses suitable encoders for images, video, audio, and text, together with OCR, ASR, and layout analysis, extracts key information from visual and audio sources, and maps different modalities into a shared semantic space where a unified multimodal index can be built. + +At retrieval and generation time, whether the user asks for a chart showing a sales peak in Q3 2023 or uploads a sketch or operating video, the system first finds the closest multimodal evidence in that unified space, filters it by signals such as text similarity and image similarity, keeps the most useful pieces, and then gives those images, text passages, and tables together to a multimodal LLM. The model can then answer by combining evidence across modalities and ideally indicate the source or highlight relevant areas in the image or document. + +Compared with text-only RAG, multimodal RAG can use more kinds of evidence and often reduces hallucination while producing more complete and more verifiable answers. + +## 7.3 Late Chunking: preserving full context for long documents + +Related introduction: + +- https://jina.ai/news/late-chunking-in-long-context-embedding-models/ + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image14.png) + +Imagine reading a Wikipedia article about Berlin. Traditional RAG would first cut it into independent paragraphs and then embed each chunk. If the first sentence says "Berlin is the capital of Germany," later phrases such as "the city" or "its population" lose their connection to Berlin once separated. A query such as "What is the population of Berlin?" may then fail because the term Berlin and the population information never appeared inside the same chunk. This problem becomes even worse for long documents. In a 200-page insurance contract, the definition of a deductible may appear on page 5 while the conditions under which it applies appear on page 30. 
Fixed-length chunking can split these related pieces into dozens of isolated chunks, and experiments show that semantic similarity can collapse sharply when that happens. + +Late Chunking overturns the traditional chunk-first-then-embed pipeline and instead follows embed-first-then-chunk. With long-context embedding models that can handle something like 8192 tokens, the whole document is first passed through the Transformer, producing token-level embeddings that have already seen the full document. Only afterward are those globally informed token embeddings pooled into chunk embeddings according to chunk boundaries. The resulting chunks are no longer independent islands. They are context-dependent embeddings that preserve cross-paragraph references and conceptual relationships. + +On BEIR benchmark datasets, Late Chunking outperforms traditional chunking broadly, with especially strong gains on longer documents. In short-text scenarios, the difference largely disappears, which confirms a key rule: the longer the document, the bigger the advantage of Late Chunking. The method is now integrated into Jina Embeddings v3. Although encoding a whole long document first can increase inference time by 10 to 20 percent, the retrieval gains in scenarios such as medical records, legal documents, and technical manuals can easily justify that cost. + +Late Chunking shows that 8K-plus long-context embedding models are not overengineering in these scenarios. They are often necessary for producing high-quality chunk embeddings and represent a shift from chunk first, then embed, to embed first, then chunk. 
+ +## 7.4 From RAG to RAG in the Agent Era + +Related discussions: + +- [https://ragflow.io/blog/rag-at-the-crossroads-mid-2025-reflections-on-ai-evolution](https://ragflow.io/blog/rag-at-the-crossroads-mid-2025-reflections-on-ai-evolution) +- [https://arxiv.org/pdf/2501.09136](https://arxiv.org/pdf/2501.09136) +- [https://www.letta.com/blog/rag-vs-agent-memory](https://www.letta.com/blog/rag-vs-agent-memory) +- [https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/](https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/) +- https://www.llamaindex.ai/blog/rag-is-dead-long-live-agentic-retrieval + +RAG has developed from a retrieval-augmented generation tool into a key part of an agent's cognitive architecture. Traditional RAG is built on a simple ask, retrieve, answer pattern and is fundamentally passive. It waits for a query and does not act proactively. To break through that passivity and handle more complex cognitive tasks, RAG has been deeply combined with agent capabilities, giving rise to a new paradigm: Agentic RAG. + +Under this paradigm, the role of RAG changes fundamentally. It is no longer only a passive provider of external knowledge. Instead, it becomes the core processing unit that supports intelligent behavior under the agent's active planning, goal direction, and self-reflection. This fusion gives the overall system goal orientation, iterative optimization, and autonomous decision-making, greatly deepening the quality of human-AI interaction. Agentic RAG can understand complex tasks, decompose them, plan retrieval strategies, and evaluate the quality of initial results to decide whether deeper exploration is needed. + +![](../../../../zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/images/image15.png) + +The key to this capability is a multi-layered active loop. 
Faced with a complex query, the agent first analyzes the nature of the problem, breaks it into subproblems, and designs precise retrieval strategies for each subproblem. After receiving initial results, it evaluates them, judges whether the information is complete and relevant, identifies knowledge gaps, and dynamically generates more precise new queries. This iterative process often includes multi-hop retrieval, where one round of results reveals new directions for the next round, producing a knowledge exploration chain similar to how a human researcher works. + +To support this ongoing, iterative intelligent behavior, especially when personalization and long-term knowledge accumulation matter, short-term conversation context alone is far from enough. This leads to the need for long-term, structured memory. + +That is exactly why RAG is increasingly assigned the role of an agent's long-term memory system and used to build a full external memory architecture. This long-term memory complements short-term memory, which is responsible for maintaining the current dialogue context. The long-term memory system relies on three key mechanisms: + +1. Structured indexing ability: + This allows the agent to build multi-dimensional indexes over huge amounts of unstructured data, by time, topic, entity relations, and more, supporting efficient retrieval from multiple angles much like humans recall information through different clues. +2. Intelligent forgetting: + Through value-evaluation algorithms, the system can decay or selectively discard low-frequency, weakly related, or outdated information, keeping the memory system lean and efficient and preventing overload. +3. Knowledge consolidation: + The system refines scattered dialogue and interaction experience into structured knowledge. Through entity recognition, relation extraction, and semantic clustering, fragmented information is connected into knowledge graphs, turning short-term experience into long-term knowledge. 
+ +This external memory system built on RAG not only expands an agent's cognitive boundary significantly, but also gives it the ability to continue learning and evolving its knowledge. It allows the agent to accumulate experience over long-term interaction, form personalized operating patterns and domain knowledge systems, and support more complex and longer-running tasks. + +# Summary + +Retrieval-Augmented Generation is not only a technical method for compensating for hallucination and knowledge staleness in large models. It is also a key bridge for turning general AI capability into deep enterprise value. The evolution from Naive RAG to modular and agentic forms shows that every part of RAG needs to deepen continuously, including finer data handling, more scientific model selection across embedding, rerank, and LLM stages, and more systematic evaluation. All of these are necessary steps toward building enterprise knowledge systems that are controllable, trustworthy, and efficient. At the same time, drawing lessons from competitions and engineering case studies is one of the best ways to deepen understanding of the technical details. + +As Graph RAG, multimodal understanding, and Late Chunking continue to develop and combine, RAG is steadily pushing beyond the old retrieval-and-generation boundary and moving toward deeper semantic association and more sustainable memory capability. The hope is that this survey-style article helps you build a full-chain methodology, from principle to practice and from evaluation to evolution, so that in a fast-moving technical landscape you can build high-quality intelligent applications that truly land in the real world and can handle complex business challenges. + +# References + +[1] Ask in Any Modality: A Comprehensive Survey on Multimodal Retrieval-Augmented Generation. + +https://arxiv.org/pdf/2502.08826 + +[2] Retrieving Multimodal Information for Augmented Generation: A Survey.
+ +https://arxiv.org/pdf/2303.10868 + +[3] A Survey on RAG Meeting LLMs: Towards Retrieval-Augmented Large Language Models. + +https://arxiv.org/pdf/2405.06211 + +[4] Retrieval-Augmented Generation for Large Language Models: A Survey. + +https://arxiv.org/pdf/2312.10997 + +[5] LightRAG: Simple and Fast Retrieval-Augmented Generation. + +https://arxiv.org/pdf/2410.05779 + +[6] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG. + +https://arxiv.org/pdf/2501.09136 + +[7] ERAGent: Enhancing Retrieval-Augmented Language Models with Improved Accuracy, Efficiency, and Personalization. + +https://arxiv.org/pdf/2405.06683 + +[8] Graph Retrieval-Augmented Generation: A Survey. + +https://www.arxiv.org/pdf/2408.08921 + +[9] Evaluation of Retrieval-Augmented Generation: A Survey. + +https://arxiv.org/pdf/2405.07437 + +[10] Retrieval Augmented Generation Evaluation in the Era of Large Language Models: A Comprehensive Survey. + +https://arxiv.org/pdf/2504.14891 + +[11] From Local to Global: A Graph RAG Approach to Query-Focused Summarization. + +https://arxiv.org/pdf/2404.16130 + +[12] RAG vs. GraphRAG: A Systematic Evaluation and Key Insights. + +https://arxiv.org/pdf/2502.11371 + +[13] Introduction to RAG | LlamaIndex Python Documentation. + +https://developers.llamaindex.ai/python/framework/understanding/rag/ + +[14] All-in-RAG | A Full-Stack Guide to RAG in Large-Model Application Development. + +https://datawhalechina.github.io/all-in-rag/#/en/ + +[15] Ilya Rice: How I Won the Enterprise RAG Challenge. + +https://abdullin.com/ilya/how-to-build-best-rag/ + +[16] RAG Research Table - Awesome Generative AI Guide (GitHub). + +https://github.com/aishwaryanr/awesome-generative-ai-guide/blob/main/research_updates/rag_research_table.md + +[17] RAG is dead, long live agentic retrieval. + +https://www.llamaindex.ai/blog/rag-is-dead-long-live-agentic-retrieval + +[18] LLM/RAG Zoomcamp extra lesson 5: Common evaluation methods and market preferences in RAG evolution. 
+ +https://vip.studycamp.tw/t/llmrag-zoomcamp-%E8%AA%B2%E5%A4%96%E8%A3%9C%E5%85%85-5%EF%BC%9Arag-evolution-%E5%B8%B8%E8%A6%8B%E8%A9%95%E4%BC%B0%E6%96%B9%E6%B3%95%E5%92%8C%E5%B8%82%E5%A0%B4%E5%81%8F%E5%A5%BD/8185 + +[19] How to Evaluate Retrieval Augmented Generation (RAG) Applications. + +https://zilliz.com.cn/blog/how-to-evaluate-rag-zilliz + +[20] RAG is not Agent Memory. + +https://www.letta.com/blog/rag-vs-agent-memory + +[21] Richmond Alake. LinkedIn post on #100DaysOfAgentMemory, RAG and MemoRizz. + +https://www.linkedin.com/posts/richmondalake_100daysofagentmemory-rag-memorizz-activity-7348281860843577346-LM7Y/ diff --git a/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md b/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md new file mode 100644 index 0000000..cfe84fc --- /dev/null +++ b/docs/en/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/index.md @@ -0,0 +1,3 @@ +# Intermediate and Advanced RAG with Workflow Orchestration - Using LangGraph as an Example + +> This chapter is currently being written. Stay tuned... diff --git a/docs/en/stage-3/core-skills/agent-teams/index.md b/docs/en/stage-3/core-skills/agent-teams/index.md new file mode 100644 index 0000000..47a83b6 --- /dev/null +++ b/docs/en/stage-3/core-skills/agent-teams/index.md @@ -0,0 +1,2761 @@ +# Claude Agent Teams Complete Guide + +## Introduction to Agent Teams + +**Agent Teams** is a revolutionary feature in Claude Code that allows **multiple independent AI instances to collaborate like a real development team**. + +Imagine that in the past, using Claude Code was like being a project manager working with one exceptionally capable assistant. No matter how complex the task was, only that one assistant was doing the work. 
Now, with Agent Teams, you can assemble a full AI development team: one member can handle the frontend, one can handle the backend, one can handle testing, and they can **work at the same time, communicate with each other, and collaborate to complete complex tasks**. + +### From a single assistant to team collaboration + +Before diving into Agent Teams, let's first understand the problem it solves. + +**Limitations of the single-AI mode**: + +When you use a single Claude instance to handle a complex project, you will run into these bottlenecks: + +- **Serial processing bottleneck**: AI can only do one thing at a time. For example, when refactoring a project, it may need to analyze the authentication module first, then the database module, and finally the API module. These steps must be done sequentially, even if they do not depend on each other. + +- **Context crowding problem**: All information lives in a single conversation window. As the conversation gets longer, important early details can get buried, and AI may forget key decisions discussed earlier. + +- **Single-perspective limitation**: Only one AI is thinking, so there is no multi-angle discussion or validation. When complex design decisions appear, there is no "teammate" to debate with or provide a different perspective. + +- **Efficiency ceiling**: Large refactors or multi-module development take a long time, and there is no way to speed them up through parallelism. + +**The Agent Teams solution**: + +Agent Teams solves these problems through **parallel collaboration across multiple instances**: + +- **True parallel work**: Multiple AIs can work on different tasks simultaneously. One can handle the frontend UI, another the backend API, and another the database design, without interfering with each other. + +- **Independent context spaces**: Every team member has its own full 200K token context window, so important information is not "forgotten" because the conversation gets too long. 
+ +- **Team collaboration capability**: Members can communicate directly, discuss design decisions, and validate code quality with each other, just like a real development team. + +- **A significant efficiency increase**: According to Anthropic's internal testing, efficiency on large-scale project refactors can improve by around 50%. + +--- + +## Agent Teams vs Subagent + +Before going deeper into the architecture of Agent Teams, we should first clear up a common point of confusion: **what is the difference between Agent Teams and Subagent**? + +Both features involve "multiple AIs collaborating," but their collaboration models are completely different and suitable for different scenarios. + +### Core differences at a glance + +| Dimension | Subagent | Agent Teams | +|---------|-------------------|----------------------| +| **Topology** | Star topology: all subagents report to the main agent | Mesh topology: members can communicate with each other | +| **Communication style** | The main agent explicitly passes information via prompts, and subagents return results when done | Members can communicate, discuss, and coordinate directly | +| **Context management** | Every subagent has an independent context, and the main agent passes only the necessary information | Every member has a fully independent context | +| **Parallelism** | Can run in parallel, but the collaboration chain still centers on the main agent | True parallel development and collaboration | +| **Task coordination** | The main agent dispatches and coordinates everything centrally | Members can take ownership of tasks more autonomously | +| **Cost** | Already significant: token usage adds up across subagents running in parallel | Even higher: members run independently and communicate more frequently | + +### An intuitive analogy + +**Subagent is like**: a manager writing separate task slips for several assistants.
Each assistant works independently based on its own task slip, and when finished, only returns the result to the manager. The assistants do not communicate directly, and the manager does not see the assistants' full thought process while they work. + +``` +You → Main Agent → Subagent A: "Analyze this file" +You → Main Agent → Subagent B: "Search for that function" + ↓ + Subagent A completes → reports result to Main Agent + Subagent B completes → reports result to Main Agent + ↓ + Main Agent synthesizes the results → reports back to you +``` + +**Agent Teams is like**: a project manager leading a real development team. Team members can communicate, discuss, and collaborate directly, rather than routing every detail through the project manager. + +``` +You → Team Lead: "Build a user authentication feature" + ↓ + Team Lead creates the team and assigns tasks + ↓ + Teammate A: "@Teammate B, is the API interface design ready?" + Teammate B: "Yes, here's the format..." + Teammate C: "I reviewed the interface and found something we should discuss..." 
+ ↓ + Team members collaborate to finish the work → Team Lead synthesizes the result → reports back to you +``` + +### When to use which one + +**Use Subagent when**: + +- You have a quick, clear, single task, such as "search for this error code" +- Tasks do not depend much on each other +- You want parallel execution, but do not need sustained discussion between members + +**Use Agent Teams when**: + +- You are doing a complex system refactor that spans multiple modules +- You need multi-angle analysis and discussion, such as a security expert and a performance expert debating a solution +- You need true parallel development, with frontend, backend, and testing happening at the same time +- Tasks require frequent coordination and information sharing + +### A simple summary + +- **Subagent**: a task distribution tool that breaks a big task into smaller tasks and dispatches them to different "workers" +- **Agent Teams**: a real collaborative team where members can communicate, discuss, and work together like a real team + +--- + +## Core architecture + +Agent Teams is not just a simple "open multiple instances" feature. It is a complete **multi-agent collaboration system**. To understand it, we need to understand its core components and how they work together. + +### Team composition + +An Agent Team consists of four core components, each with its own responsibility, working together to complete complex tasks. + +**Team Lead** + +The Team Lead is the "brain" and "coordinator" of the entire team. It does not directly execute coding tasks. 
Instead, it is responsible for: + +- **Requirement analysis and task decomposition**: breaking the user's complex requirements into multiple subtasks that can run in parallel +- **Team creation and management**: deciding how many members are needed and what each member should do +- **Task assignment and scheduling**: assigning tasks to the right members and managing task dependencies +- **Result synthesis and quality control**: collecting each member's work, integrating it, and doing the final review + +**Teammates** + +Teammates are the actual "developers" doing the work. Every Teammate is an independent Claude instance: + +- **Independent context window**: each member has a full 200K token context window, completely isolated from the Team Lead and the other members +- **Full tool permissions**: they can use all tools such as Read, Write, Edit, and Bash +- **Autonomous task pickup**: they can independently select and claim tasks from the shared task board +- **Direct communication ability**: they can communicate directly with other members instead of always going through the Team Lead + +**TaskList** + +TaskList is the team's "project management tool," similar to Jira or Trello: + +- **Task status management**: every task has a clear status: `pending`, `in_progress`, or `completed` +- **Dependency management**: tasks can define dependencies, and dependent tasks can only start after prerequisite tasks finish +- **Automatic unlock mechanism**: when one task is completed, the system automatically checks and unlocks tasks waiting on it +- **File lock mechanism**: when a member claims and starts a task, a lock file is created in the task directory to prevent multiple members from editing the same file at the same time + +**Messaging System** + +The messaging system is the "chat tool" between team members: + +- **Point-to-point communication**: member A can send a message directly to member B +- **Broadcast announcements**: a message can be sent to all members at once 
+- **File-system based**: messages are stored as JSON files in `~/.claude/teams/{team-name}/inboxes/` +- **No network required**: everything works entirely through the local file system, with no network connection or port listening needed + +### Collaboration flow + +A typical Agent Teams workflow looks like this: + +``` +The user submits a complex requirement + ↓ +Team Lead analyzes the requirement and breaks it into tasks + ↓ +Creates team members and initializes TaskList + ↓ + ├─→ Teammate A claims Task 1 ─┐ + ├─→ Teammate B claims Task 2 ─┼→ Run in parallel + ├─→ Teammate C claims Task 3 ─┤ + │ ↓ + └──────────────────────────── Members coordinate through the messaging system + ↓ + Once all tasks are complete, Team Lead synthesizes the result + ↓ + Final output is delivered to the user +``` + +### File system layout + +Agent Teams creates dedicated directories on your local file system to manage team state: + +``` +~/.claude/ +├── teams/ +│ └── {team-name}/ +│ ├── config.json # Team config (member list, model selection, etc.) +│ └── inboxes/ +│ ├── team-lead.json # Team Lead inbox +│ ├── teammate-1.json # Member 1 inbox +│ └── teammate-2.json # Member 2 inbox +└── tasks/ + └── {team-name}/ + ├── task-1.json # Detailed info for Task 1 + ├── task-2.json # Detailed info for Task 2 + └── current_tasks/ + └── parse_if_statement.txt # Lock file created while a task is running +``` + +The advantage of this design is **complete transparency**: you can inspect team status, task progress, and the communication history between members at any time. + +--- + +## Quick start + +### Enable the experimental feature + +Agent Teams is currently an **experimental feature** and is disabled by default. To use it, you need to enable it first. 
+ +**The easiest way: let Claude Code enable it for you** + +Type this directly in Claude Code: + +``` +Help me enable Agent Teams in settings.json +``` + +Or: + +``` +Enable the experimental feature agentTeams +``` + +Claude Code will automatically modify `~/.claude/settings.json` and add the following configuration: + +```json +{ + "experimental": { + "agentTeams": true + } +} +``` + +**Restart Claude Code** + +After the configuration is added, **fully quit and restart Claude Code**, and the feature will take effect. + +**Manual configuration (if the automatic method does not work)**: + +You can manually edit `~/.claude/settings.json` and add or modify: + +```json +{ + "experimental": { + "agentTeams": true + } +} +``` + +**How to verify it is enabled** + +After restarting Claude Code, try a conversation like this: + +``` +You: Can you help me create an Agent Team? + +Claude: Yes! I can help you create an Agent Team to collaborate on a task... +``` + +If Claude understands and responds to the request to create a team, the feature has been enabled successfully. + +### Visual mode configuration (optional) + +If you want to see team members' work in real time, you can configure **split-pane display mode**. + +**Let Claude Code configure it for you**: + +Type this directly in Claude Code: + +``` +Help me enable split-pane display mode for Agent Teams in settings.json, using tmux +``` + +Or: + +``` +Configure agent-teams to use split-panes mode +``` + +**Install tmux (if you do not have it)**: + +If `tmux` is not installed yet, you can ask Claude Code to install it: + +``` +Help me install tmux +``` + +Claude Code will automatically run the appropriate installation command based on your operating system, whether macOS or Linux. + +**What the configured result looks like**: + +After configuration, team members will work in different tmux panes, and you will be able to see all their output at the same time, like a "monitoring wall." 
+ +``` +┌─────────────────┬─────────────────┬─────────────────┐ +│ Teammate 1 │ Teammate 2 │ Teammate 3 │ +│ Analyzing code │ Building API │ Writing tests │ +│ ... │ ... │ ... │ +│ │ │ │ +└─────────────────┴─────────────────┴─────────────────┘ +``` + +**Manual configuration (if the automatic method does not work)**: + +You can manually edit `~/.claude/settings.json`: + +```json +{ + "experimental": { + "agentTeams": true + }, + "agent-teams": { + "displayMode": "split-panes", + "terminalMultiplexer": "tmux" + } +} +``` + +--- + +### Hands-on example: build a Pokemon-style RPG game with Agent Teams + +Let's experience the power of Agent Teams through a full project. This example will show how multiple AI team members can collaborate to build an RPG game from scratch, including a battle system, dialogue features, and exploration elements. + +**Project requirements**: + +Build a Pokemon-style web RPG with the following features: + +- **Character system**: the player can create a character with level, HP, attack, defense, and other stats +- **Battle system**: turn-based combat with attack, skills, items, and flee options +- **Monster system**: multiple wild monsters with different attributes and skills +- **Dialogue system**: NPC conversations and side quests +- **Map exploration**: a simple 2D map where the player can move between scenes +- **Save system**: save game progress, including level, position, completed quests, and more +- **Sound effects and animation**: visual effects and sound effects for attacks, damage, and leveling up + +**Type this in Claude Code**: + +``` +I want to build a Pokemon-style web RPG game. 
+ +Create a team to collaborate on development: + +Team member responsibilities: +- Teammate A (Game Architect): design the overall architecture, define the game state machine, and plan the data structures +- Teammate B (Battle System): implement turn-based combat logic, the skill system, and damage calculation +- Teammate C (Dialogue System): implement NPC dialogue, the quest system, and story scripts +- Teammate D (Map Rendering): use Canvas to implement 2D map rendering, character movement, and scene switching +- Teammate E (UI & Audio): design the game interface, battle UI, and sound playback + +Technical requirements: +- Use React + TypeScript + Vite +- Use Tailwind CSS for styling +- Use Framer Motion for animation +- Use Zustand for state management +- Turn-based battle system +- Save data with localStorage +- Use the Web Audio API for sound + +Use Sonnet for each member, and Opus for the Team Lead. + +First ask the architect to design the overall solution. After the data structures are defined, let the other members develop in parallel. +``` + +**How Claude will execute it**: + +**Phase 1: architecture design** + +Team Lead will first ask Teammate A, the game architect, to design the overall structure: + +``` +Teammate A is designing the game architecture... 
+ +Tech stack: React 18 + TypeScript + Vite + Tailwind CSS + Framer Motion + Zustand + +Project initialization: +npm create vite@latest pokemon-rpg -- --template react-ts +npm install zustand framer-motion +npm install -D tailwindcss postcss autoprefixer +npx tailwindcss init -p + +File structure: +src/ +├── App.tsx # Main app component +├── main.tsx # Entry file +├── types/ +│ ├── player.ts # Player type definitions +│ ├── monster.ts # Monster type definitions +│ ├── skill.ts # Skill type definitions +│ └── game.ts # Game state types +├── stores/ +│ └── gameStore.ts # Zustand state management +├── components/ +│ ├── map/ # Map-related components +│ │ ├── MapCanvas.tsx # Map Canvas +│ │ ├── PlayerSprite.tsx # Player sprite +│ │ └── NPCSprite.tsx # NPC sprite +│ ├── battle/ # Battle-related components +│ │ ├── BattleScreen.tsx # Battle screen +│ │ ├── BattleMenu.tsx # Battle menu +│ │ ├── HPBar.tsx # HP bar component +│ │ └── BattleLog.tsx # Battle log +│ ├── dialog/ # Dialogue-related components +│ │ ├── DialogBox.tsx # Dialogue box +│ │ └── Choices.tsx # Choices component +│ └── ui/ # Shared UI components +│ ├── Button.tsx +│ ├── ProgressBar.tsx +│ └── StatCard.tsx +├── systems/ +│ ├── BattleSystem.ts # Battle system logic +│ ├── DialogSystem.ts # Dialogue system logic +│ ├── QuestSystem.ts # Quest system logic +│ └── MapSystem.ts # Map system logic +├── hooks/ +│ ├── useBattle.ts # Battle hook +│ ├── useDialog.ts # Dialogue hook +│ └── useAudio.ts # Audio hook +├── data/ +│ ├── monsters.ts # Monster data +│ ├── skills.ts # Skill data +│ ├── dialogues.ts # Dialogue scripts +│ └── maps.ts # Map data +└── utils/ + ├── audio.ts # Audio utilities + └── storage.ts # Save utilities + +Core data structure design: + +// types/player.ts - Player type +interface Player { + id: string; + name: string; + level: number; + exp: number; + maxExp: number; + hp: number; + maxHp: number; + attack: number; + defense: number; + skills: Skill[]; + position: { x: number; y: number }; + 
direction: 'UP' | 'DOWN' | 'LEFT' | 'RIGHT'; + gold: number; + items: string[]; +} + +// types/skill.ts - Skill type +interface Skill { + id: string, + name: string, + type: 'attack' | 'heal' | 'buff' | 'special', + power: number, + accuracy: number, + pp: number, + maxPp: number, + effect?: string +} + +// Game state +{ + screen: 'map' | 'battle' | 'dialog' | 'menu', + player: Player, + currentMap: string, + position: {x: number, y: number}, + inBattle: boolean, + dialogQueue: Array, + gameData: { + completedQuests: string[], + items: string[], + visitedMaps: string[] + } +} + +Battle system design: +- Turn-based, with the player acting first +- Each turn the player can choose: attack, skill, item, or flee +- Skills have PP limits and must be restored when depleted +- Damage formula: (attacker attack / defender defense) × skill power × random factor (0.85~1.0) + +Dialogue system design: +- Typewriter effect for text display +- Supports branching options +- Supports conditional triggers, such as after a quest is completed +- NPCs have facing direction, and conversations trigger automatically when the player approaches +``` + +**Phase 2: parallel development** + +Once the architecture is confirmed, Team Lead creates the task list and the other members begin working in parallel: + +``` +Task list: +├── [Teammate B] Implement battle system core logic (in progress...) +├── [Teammate C] Implement dialogue and quest systems (in progress...) +├── [Teammate D] Implement 2D map rendering (in progress...) +└── [Teammate E] Design UI and audio (in progress...) +``` + +
// battle.js - Battle system

/**
 * Turn-based battle between one player and one monster.
 *
 * The player acts first. A non-lethal player action hands the turn to the
 * monster, whose reply is scheduled with setTimeout. The player and monster
 * objects passed in are mutated in place (hp, exp, level, stats).
 */
class BattleSystem {
  /**
   * @param {object} player - Combatant controlled by the user.
   * @param {object} monster - Opposing combatant; `skills` is read on its turn.
   * @param {?function(BattleSystem): void} [onUpdate=null] - Optional hook invoked
   *   after every resolved action (player attack, monster attack, flee attempt) so a
   *   UI can refresh, including after the delayed monster turn.
   */
  constructor(player, monster, onUpdate = null) {
    this.player = player;
    this.monster = monster;
    this.turn = 'player';
    this.log = [];
    this.state = 'active'; // active, victory, defeat, flee
    this.onUpdate = onUpdate;
  }

  /** Invoke the optional onUpdate hook, if one is registered. */
  notifyUpdate() {
    if (typeof this.onUpdate === 'function') {
      this.onUpdate(this);
    }
  }

  /**
   * Player attack.
   * Ignored unless the battle is still active and it is the player's turn;
   * without the state check a finished battle could be "won" repeatedly and
   * grant EXP every time.
   * @param {object} skill - Skill used; reads `name`, `power`, optional `effect`.
   */
  playerAttack(skill) {
    if (this.state !== 'active' || this.turn !== 'player') return;

    const damage = this.calculateDamage(this.player, this.monster, skill);
    this.monster.hp = Math.max(0, this.monster.hp - damage);

    this.log.push(`${this.player.name} used ${skill.name}!`);
    this.log.push(`It dealt ${damage} damage!`);

    // Skill effect
    if (skill.effect) {
      this.applyEffect(this.player, this.monster, skill.effect);
    }

    // Check whether battle is over
    if (this.monster.hp <= 0) {
      this.state = 'victory';
      this.log.push(`${this.monster.name} collapsed!`);
      this.giveExp();
    } else {
      this.turn = 'monster';
      // Short pause so the player can read the log before the monster replies.
      setTimeout(() => this.monsterAttack(), 1000);
    }

    this.notifyUpdate();
  }

  /**
   * Monster attack. Picks one of the monster's skills at random.
   * No-op once the battle has ended (e.g. the timer fires after a finished fight).
   */
  monsterAttack() {
    if (this.state !== 'active') return;

    const skills = this.monster.skills || [];
    if (skills.length === 0) {
      // A monster without skills cannot act; hand the turn back instead of throwing.
      this.log.push(`${this.monster.name} hesitated!`);
      this.turn = 'player';
      this.notifyUpdate();
      return;
    }

    // Randomly choose a skill
    const skill = skills[Math.floor(Math.random() * skills.length)];
    const damage = this.calculateDamage(this.monster, this.player, skill);

    this.player.hp = Math.max(0, this.player.hp - damage);

    this.log.push(`${this.monster.name} used ${skill.name}!`);
    this.log.push(`It dealt ${damage} damage!`);

    if (this.player.hp <= 0) {
      this.state = 'defeat';
      this.log.push(`${this.player.name} fell...`);
    } else {
      this.turn = 'player';
    }

    this.notifyUpdate();
  }

  /**
   * Damage calculation.
   * @param {object} attacker - Reads `level` and `attack`.
   * @param {object} defender - Reads `defense`.
   * @param {object} skill - Reads `power`.
   * @returns {number} Whole-number damage, always finite.
   */
  calculateDamage(attacker, defender, skill) {
    const levelFactor = (2 * attacker.level) / 5 + 2;
    // A defense of 0 would yield Infinity (or NaN for 0/0), and NaN HP never
    // satisfies `hp <= 0`, so the battle could not end.
    const defense = defender.defense > 0 ? defender.defense : 1;
    const attackDefense = attacker.attack / defense;
    const baseDamage = levelFactor * attackDefense * skill.power + 2;
    const randomFactor = 0.85 + Math.random() * 0.15;

    // Type advantage bonus (simplified): no type chart yet, so always neutral.
    const typeBonus = 1;

    return Math.floor(baseDamage * randomFactor * typeBonus);
  }

  /**
   * Apply skill effect. Unknown effects are ignored.
   * @param {object} user - Combatant using the skill.
   * @param {object} target - Combatant on the receiving end.
   * @param {string} effect - 'burn', 'heal' or 'buff'.
   */
  applyEffect(user, target, effect) {
    switch (effect) {
      case 'burn':
        this.log.push(`${target.name} was burned!`);
        break;
      case 'heal': {
        const healAmount = Math.floor(user.maxHp * 0.3);
        user.hp = Math.min(user.maxHp, user.hp + healAmount);
        this.log.push(`${user.name} recovered ${healAmount} HP!`);
        break;
      }
      case 'buff':
        user.attack = Math.floor(user.attack * 1.2);
        this.log.push(`${user.name}'s attack increased!`);
        break;
      default:
        break;
    }
  }

  /** Gain experience for defeating the current monster, levelling up as needed. */
  giveExp() {
    const baseExp = this.monster.level * 50;
    const expGain = Math.floor(baseExp * (1 + this.player.level / 10));

    this.player.exp += expGain;
    this.log.push(`${this.player.name} gained ${expGain} EXP!`);

    // Level-up check. The maxExp > 0 guard prevents an endless loop when the
    // threshold is zero or negative (e.g. a corrupted save).
    while (this.player.maxExp > 0 && this.player.exp >= this.player.maxExp) {
      this.levelUp();
    }
  }

  /** Level up: consume the EXP threshold, raise it by 50%, grow stats, refill HP. */
  levelUp() {
    this.player.level++;
    this.player.exp -= this.player.maxExp;
    this.player.maxExp = Math.floor(this.player.maxExp * 1.5);

    // Stat growth
    const hpGain = 10 + Math.floor(Math.random() * 5);
    const atkGain = 3 + Math.floor(Math.random() * 2);
    const defGain = 2 + Math.floor(Math.random() * 2);

    this.player.maxHp += hpGain;
    this.player.hp = this.player.maxHp;
    this.player.attack += atkGain;
    this.player.defense += defGain;

    this.log.push(`${this.player.name} leveled up to ${this.player.level}!`);
    this.log.push(`HP +${hpGain}, ATK +${atkGain}, DEF +${defGain}`);
  }

  /**
   * Flee (70% success chance). A failed attempt costs the player's turn.
   * @returns {boolean} true when the escape succeeded; false on failure or when
   *   fleeing is not currently possible (battle over, or not the player's turn).
   */
  flee() {
    if (this.state !== 'active' || this.turn !== 'player') return false;

    let escaped = false;
    if (Math.random() < 0.7) {
      this.state = 'flee';
      this.log.push('You fled successfully!');
      escaped = true;
    } else {
      this.log.push('Failed to flee!');
      this.turn = 'monster';
      setTimeout(() => this.monsterAttack(), 1000);
    }

    this.notifyUpdate();
    return escaped;
  }
}

// monster.js - Monster data
const MONSTER_DATA = [
  {
    id: 'slime',
    name: 'Slime',
    baseHp: 30,
    baseAtk: 8,
    baseDef: 5,
    skills: [
      {id: 'tackle', name: 'Tackle', type: 'attack', power: 40, accuracy: 100, pp: 35}
    ],
    expGain: 20
  },
  {
    id: 'goblin',
    name: 'Goblin',
    baseHp: 45,
    baseAtk: 12,
    baseDef: 8,
    skills: [
      {id: 'tackle', name: 'Tackle', type: 'attack', power: 40, accuracy: 100, pp: 35},
      {id: 'scratch', name: 'Scratch', type: 'attack', power: 55, accuracy: 100, pp: 25}
    ],
    expGain: 35
  },
  {
    id: 'dragon',
    name: 'Young Dragon',
    baseHp: 80,
    baseAtk: 20,
    baseDef: 15,
    skills: [
      {id: 'scratch', name: 'Scratch', type: 'attack', power: 55, accuracy: 100, pp: 25},
      {id: 'ember', name: 'Ember', type: 'attack', power: 70, accuracy: 90, pp: 15},
      {id: 'growl', name: 'Growl', type: 'buff', power: 0, accuracy: 100, pp: 20}
    ],
    expGain: 80
  }
];
+ +
// dialog.js - Dialogue system

/**
 * Plays a queue of dialogue entries into the DOM elements `dialogSpeaker`,
 * `dialogText` and `dialogChoices`.
 *
 * An entry is either an object ({speaker?, text?, speed?, choices?}) or a
 * function. Functions run for their side effects; if they return an entry or an
 * array of entries, those are shown next.
 */
class DialogSystem {
  constructor() {
    this.dialogQueue = [];
    this.currentDialog = null;
    this.isShowing = false;
    this.onComplete = null;
    // Handle of the running typewriter interval, or null when idle.
    this.typewriterTimer = null;
  }

  /**
   * Show dialogue.
   * @param {object|function|Array} dialog - One entry or a list of entries.
   * @param {function(): void} [onComplete] - Called once the queue is exhausted.
   */
  showDialog(dialog, onComplete) {
    // Copy the list: the queue is consumed with shift(), and consuming the
    // caller's array would empty shared scripts after their first use.
    this.dialogQueue = Array.isArray(dialog) ? dialog.slice() : [dialog];
    this.onComplete = onComplete;
    this.isShowing = true;
    this.showNext();
  }

  /** Cancel the running typewriter animation, if any. */
  stopTypewriter() {
    if (this.typewriterTimer !== null) {
      clearInterval(this.typewriterTimer);
      this.typewriterTimer = null;
    }
  }

  /** Show the next dialogue item, or finish when the queue is empty. */
  showNext() {
    // A still-running animation would keep appending old text to the shared element.
    this.stopTypewriter();

    if (this.dialogQueue.length === 0) {
      this.isShowing = false;
      if (this.onComplete) this.onComplete();
      return;
    }

    const entry = this.dialogQueue.shift();

    // Handle special dialogue types
    if (typeof entry === 'function') {
      const produced = entry();
      if (produced !== undefined && produced !== null) {
        // Lines returned by the callback are spoken before the rest of the queue.
        const followUp = Array.isArray(produced) ? produced : [produced];
        this.dialogQueue.unshift(...followUp);
      }
      this.showNext();
      return;
    }

    this.currentDialog = entry;
    this.renderDialog();
  }

  /** Render the dialogue box for `currentDialog`. */
  renderDialog() {
    const speakerEl = document.getElementById('dialogSpeaker');
    const textEl = document.getElementById('dialogText');

    if (this.currentDialog.speaker) {
      speakerEl.textContent = this.currentDialog.speaker;
      speakerEl.style.display = 'block';
    } else {
      speakerEl.style.display = 'none';
    }

    // Typewriter effect. Choice-only entries carry no text, so default to ''.
    textEl.textContent = '';
    const text = typeof this.currentDialog.text === 'string' ? this.currentDialog.text : '';
    const speed = this.currentDialog.speed || 30;

    if (text.length > 0) {
      let i = 0;
      this.typewriterTimer = setInterval(() => {
        textEl.textContent += text.charAt(i);
        i++;
        if (i >= text.length) {
          this.stopTypewriter();
        }
      }, speed);
    }

    // Show choices, if any
    this.renderChoices();
  }

  /** Render choice buttons for `currentDialog`, or clear leftovers when it has none. */
  renderChoices() {
    const choices = this.currentDialog.choices;
    const choicesEl = document.getElementById('dialogChoices');

    if (!choices) {
      // Remove buttons left over from a previous entry.
      if (choicesEl) {
        choicesEl.innerHTML = '';
        choicesEl.style.display = 'none';
      }
      return;
    }

    choicesEl.innerHTML = '';
    choicesEl.style.display = 'block';

    choices.forEach(choice => {
      const btn = document.createElement('button');
      btn.textContent = choice.text;
      btn.onclick = () => {
        if (choice.condition === undefined || choice.condition()) {
          this.dialogQueue = [];
          this.showDialog(choice.dialog, this.onComplete);
        }
      };
      choicesEl.appendChild(btn);
    });
  }

  /** Advance to the next entry; ignored while the current entry offers choices. */
  next() {
    if (this.currentDialog && this.currentDialog.choices) return; // must choose when options exist
    this.showNext();
  }
}

// Quest system

/**
 * Tracks accepted and completed quests and pays out rewards.
 *
 * Quest definitions are looked up in `this.quests` by id; each is expected to
 * have `objectives` ({type, target, required}) and optional `rewards`
 * ({exp?, gold?, items?}).
 */
class QuestSystem {
  /**
   * @param {?object} [player=null] - Reward recipient. When omitted, rewards go to
   *   the global `game.player` (or a global `player`) at payout time.
   * @param {object} [quests={}] - Quest definitions keyed by quest id.
   */
  constructor(player = null, quests = {}) {
    this.player = player;
    this.quests = quests;
    this.activeQuests = [];
    this.completedQuests = [];
  }

  /**
   * Accept a quest.
   * @param {string} questId
   * @returns {boolean} false when the quest is already active or completed.
   */
  acceptQuest(questId) {
    if (this.completedQuests.includes(questId)) return false;
    if (this.activeQuests.includes(questId)) return false;

    this.activeQuests.push(questId);
    return true;
  }

  /**
   * Update quest progress for every active quest with a matching objective.
   * @param {string} type - Objective type.
   * @param {string} target - Objective target.
   */
  updateProgress(type, target) {
    // Iterate a snapshot: completing a quest splices activeQuests, which would
    // otherwise make forEach skip the quest that follows it.
    [...this.activeQuests].forEach(questId => {
      const quest = this.quests[questId];
      if (!quest) return;

      quest.objectives.forEach(obj => {
        if (obj.type === type && obj.target === target && !obj.completed) {
          obj.current = (obj.current || 0) + 1;
          if (obj.current >= obj.required) {
            obj.completed = true;
          }
        }
      });

      this.checkCompletion(questId);
    });
  }

  /** Check quest completion and complete it when every objective is done. */
  checkCompletion(questId) {
    const quest = this.quests[questId];
    if (!quest) return;

    const allComplete = quest.objectives.every(obj => obj.completed);
    if (allComplete) {
      this.completeQuest(questId);
    }
  }

  /** Complete quest: move it from active to completed and pay its rewards. */
  completeQuest(questId) {
    const index = this.activeQuests.indexOf(questId);
    if (index > -1) {
      this.activeQuests.splice(index, 1);
      this.completedQuests.push(questId);

      // Give rewards
      const quest = this.quests[questId];
      if (quest) {
        this.giveRewards(quest.rewards);
      }
    }
  }

  /** Find the reward recipient: explicit player, then `game.player`, then global `player`. */
  resolvePlayer() {
    if (this.player) return this.player;
    if (typeof game !== 'undefined' && game && game.player) return game.player;
    if (typeof player !== 'undefined' && player) return player;
    return null;
  }

  /**
   * Give rewards.
   * Uses the recipient's `gainExp` / `addItem` methods when present, otherwise
   * updates the plain `exp`, `gold` and `items` fields directly.
   * @param {object} [rewards] - {exp?, gold?, items?}; nothing happens when omitted.
   * @throws {Error} When rewards exist but no player can be found.
   */
  giveRewards(rewards) {
    if (!rewards) return;

    const recipient = this.resolvePlayer();
    if (!recipient) {
      throw new Error('QuestSystem: no player available to receive quest rewards');
    }

    if (rewards.exp) {
      if (typeof recipient.gainExp === 'function') {
        recipient.gainExp(rewards.exp);
      } else {
        recipient.exp = (recipient.exp || 0) + rewards.exp;
      }
    }

    if (rewards.gold) {
      recipient.gold = (recipient.gold || 0) + rewards.gold;
    }

    if (rewards.items) {
      rewards.items.forEach(item => {
        if (typeof recipient.addItem === 'function') {
          recipient.addItem(item);
        } else {
          if (!Array.isArray(recipient.items)) recipient.items = [];
          recipient.items.push(item);
        }
      });
    }
  }
}

// dialogues.js - Dialogue script examples
// Choice callbacks reach other systems through the global `game` object.
const DIALOGUES = {
  villageChief: {
    firstMeeting: [
      {speaker: 'Village Chief', text: 'Oh, adventurer... you finally arrived.'},
      {speaker: 'Village Chief', text: 'Lately, many wild monsters have appeared near our village, and everyone is frightened.'},
      {speaker: 'Village Chief', text: 'If you can help drive them away, I would be deeply grateful!'},
      {
        choices: [
          {text: 'Okay, I accept this quest', dialog: () => {
            game.questSystem.acceptQuest('defeatMonsters');
            return [
              {speaker: 'Village Chief', text: 'Wonderful! Please defeat 3 slimes to the north.'},
              {speaker: 'System', text: 'Quest [Drive Away the Slimes] accepted!'}
            ];
          }},
          {text: 'I am a little busy right now', dialog: [
            {speaker: 'Village Chief', text: 'All right. Come back when you are ready.'}
          ]}
        ]
      }
    ],
    afterQuest: [
      {speaker: 'Village Chief', text: 'You really did it! Thank you so much!'},
      {speaker: 'System', text: 'Quest [Drive Away the Slimes] completed! You gained 100 EXP!'},
      {speaker: 'Village Chief', text: 'Please take this. It is a small token of my thanks.'}
    ]
  },

  shopkeeper: [
    {speaker: 'Shopkeeper', text: 'Welcome! Looking for something?'},
    {
      choices: [
        {text: 'Browse goods', dialog: () => {
          game.openShop();
          return [{speaker: 'Shopkeeper', text: 'Take whatever catches your eye!'}];
        }},
        {text: 'Leave', dialog: [{speaker: 'Shopkeeper', text: 'Come again next time!'}]}
      ]
    }
  ]
};
+ +
// map.js - Map rendering system

/**
 * Draws a tile map, its NPCs and the player onto a 2D canvas, and answers
 * tile-based collision queries. Coordinates of the player and NPCs are in tiles;
 * the camera offset is in pixels.
 */
class MapRenderer {
  /** @param {HTMLCanvasElement} canvas - Canvas to draw on. */
  constructor(canvas) {
    this.canvas = canvas;
    this.ctx = canvas.getContext('2d');
    this.tileSize = 32;
    this.currentMap = null;
    this.player = null;
    this.npcs = [];
    this.camera = {x: 0, y: 0};
  }

  /**
   * Load map.
   * @param {object} mapData - Reads `width`, `height`, `layers` and optional `npcs`.
   */
  loadMap(mapData) {
    this.currentMap = mapData;
    this.npcs = mapData.npcs || [];
    // Snap rather than ease: the previous map's camera offset is meaningless here.
    this.updateCamera(true);
  }

  /** Render the map: background, tile layers, NPCs, then the player. */
  render() {
    if (!this.currentMap) return;

    // Clear the canvas
    this.ctx.fillStyle = '#000';
    this.ctx.fillRect(0, 0, this.canvas.width, this.canvas.height);

    // Everything between save() and restore() is drawn in camera space.
    this.ctx.save();
    this.ctx.translate(-this.camera.x, -this.camera.y);

    this.renderLayers();
    this.renderNPCs();
    this.renderPlayer();

    this.ctx.restore();
  }

  /** Render map layers in order; tile id 0 means "empty". */
  renderLayers() {
    const map = this.currentMap;

    for (const layer of map.layers) {
      const data = layer.data;

      for (let y = 0; y < map.height; y++) {
        for (let x = 0; x < map.width; x++) {
          const tileId = data[y * map.width + x];
          if (tileId === 0) continue;

          this.renderTile(x * this.tileSize, y * this.tileSize, tileId);
        }
      }
    }
  }

  /**
   * Stable pseudo-random offset in [0, tileSize) derived from the tile position.
   * Using Math.random() here would move the grass speckles on every frame.
   */
  speckleOffset(x, y, salt) {
    const n = Math.sin(x * 12.9898 + y * 78.233 + salt * 37.719) * 43758.5453;
    return (n - Math.floor(n)) * this.tileSize;
  }

  /**
   * Render a single tile.
   * @param {number} x - Left edge in pixels.
   * @param {number} y - Top edge in pixels.
   * @param {number} tileId - Tile id; see getTileType().
   */
  renderTile(x, y, tileId) {
    // Draw different tiles based on tile ID
    const tileType = this.getTileType(tileId);

    switch (tileType) {
      case 'grass':
        this.ctx.fillStyle = '#4a8f4a';
        this.ctx.fillRect(x, y, this.tileSize, this.tileSize);
        // Grass texture: three small darker speckles at fixed per-tile positions
        this.ctx.fillStyle = '#3d7f3d';
        for (let i = 0; i < 3; i++) {
          const px = x + this.speckleOffset(x, y, i * 2);
          const py = y + this.speckleOffset(x, y, i * 2 + 1);
          this.ctx.fillRect(px, py, 2, 2);
        }
        break;

      case 'water': {
        this.ctx.fillStyle = '#4a90d9';
        this.ctx.fillRect(x, y, this.tileSize, this.tileSize);
        // Ripple effect: a lighter band that bobs with time and column
        const wave = Math.sin(Date.now() / 500 + x / 20) * 2;
        this.ctx.fillStyle = '#5aa0e9';
        this.ctx.fillRect(x, y + 10 + wave, this.tileSize, 2);
        break;
      }

      case 'wall':
        this.ctx.fillStyle = '#8b7355';
        this.ctx.fillRect(x, y, this.tileSize, this.tileSize);
        this.ctx.fillStyle = '#7a6248';
        this.ctx.fillRect(x + 2, y + 2, this.tileSize - 4, this.tileSize - 4);
        break;

      case 'path':
        this.ctx.fillStyle = '#c4a77d';
        this.ctx.fillRect(x, y, this.tileSize, this.tileSize);
        break;

      case 'house':
        this.ctx.fillStyle = '#a0522d';
        this.ctx.fillRect(x, y, this.tileSize, this.tileSize);
        // Roof
        this.ctx.fillStyle = '#8b4513';
        this.ctx.beginPath();
        this.ctx.moveTo(x, y);
        this.ctx.lineTo(x + this.tileSize / 2, y - 10);
        this.ctx.lineTo(x + this.tileSize, y);
        this.ctx.fill();
        break;

      default:
        break;
    }
  }

  /**
   * Get tile type.
   * @param {number} tileId
   * @returns {string} Tile type name; unknown ids fall back to 'grass'.
   */
  getTileType(tileId) {
    const types = {
      1: 'grass', 2: 'water', 3: 'wall', 4: 'path', 5: 'house'
    };
    return types[tileId] || 'grass';
  }

  /** Fill a circle centred on (cx, cy). */
  fillCircle(cx, cy, radius, color) {
    this.ctx.fillStyle = color;
    this.ctx.beginPath();
    this.ctx.arc(cx, cy, radius, 0, Math.PI * 2);
    this.ctx.fill();
  }

  /** Render NPCs as coloured discs with their name above. */
  renderNPCs() {
    const half = this.tileSize / 2;

    this.npcs.forEach(npc => {
      const x = npc.x * this.tileSize;
      const y = npc.y * this.tileSize;

      // Draw NPC
      this.fillCircle(x + half, y + half, this.tileSize / 3, npc.color || '#ff6b6b');

      // Draw name
      this.ctx.fillStyle = '#fff';
      this.ctx.font = '10px Arial';
      this.ctx.textAlign = 'center';
      this.ctx.fillText(npc.name, x + half, y - 5);
    });
  }

  /** Render player as a disc plus a small dot showing the facing direction. */
  renderPlayer() {
    if (!this.player) return;

    const half = this.tileSize / 2;
    const x = this.player.x * this.tileSize;
    const y = this.player.y * this.tileSize;

    // Player body
    this.fillCircle(x + half, y + half, this.tileSize / 3, '#4ecdc4');

    // Player direction indicator
    const directions = {UP: [0, -8], DOWN: [0, 8], LEFT: [-8, 0], RIGHT: [8, 0]};
    const [dx, dy] = directions[this.player.direction] || [0, 0];

    this.fillCircle(x + half + dx, y + half + dy, 4, '#2d3436');
  }

  /**
   * Update camera position so it follows the player, clamped to the map.
   * Does nothing until both a player and a map are set.
   * @param {boolean} [snap=false] - Jump straight to the target instead of easing
   *   10% of the remaining distance.
   */
  updateCamera(snap = false) {
    if (!this.player || !this.currentMap) return;

    // Camera follows player and keeps them centered
    const targetX = this.player.x * this.tileSize - this.canvas.width / 2;
    const targetY = this.player.y * this.tileSize - this.canvas.height / 2;

    // Smooth movement
    const easing = snap ? 1 : 0.1;
    this.camera.x += (targetX - this.camera.x) * easing;
    this.camera.y += (targetY - this.camera.y) * easing;

    // Prevent camera from going beyond map bounds
    const maxX = this.currentMap.width * this.tileSize - this.canvas.width;
    const maxY = this.currentMap.height * this.tileSize - this.canvas.height;
    this.camera.x = Math.max(0, Math.min(this.camera.x, maxX));
    this.camera.y = Math.max(0, Math.min(this.camera.y, maxY));
  }

  /**
   * Check collision for a tile position.
   * Side effect: stepping into an NPC triggers that NPC's dialogue.
   * @param {number} x - Tile column.
   * @param {number} y - Tile row.
   * @returns {boolean} true when the tile cannot be entered.
   */
  checkCollision(x, y) {
    // Without a map nothing is walkable.
    if (!this.currentMap) return true;

    // Check map bounds
    if (x < 0 || x >= this.currentMap.width || y < 0 || y >= this.currentMap.height) {
      return true;
    }

    // Check tile collision (only the first layer carries obstacles)
    const tileId = this.currentMap.layers[0].data[y * this.currentMap.width + x];
    const solidTiles = [3, 5]; // walls and houses are obstacles

    if (solidTiles.includes(tileId)) {
      return true;
    }

    // Check NPC collision
    const npc = this.npcs.find(candidate => candidate.x === x && candidate.y === y);
    if (npc) {
      // Trigger NPC dialogue
      this.triggerNPC(npc);
      return true;
    }

    return false;
  }

  /** Trigger NPC dialogue through the global `game` object's dialogue system. */
  triggerNPC(npc) {
    if (npc.dialogue) {
      game.dialogSystem.showDialog(npc.dialogue);
    }
  }
}

// Example map data
// Only the first 8 of the 15 declared rows are listed; cells without data are
// drawn as grass (getTileType's fallback) and are not treated as obstacles.
const VILLAGE_MAP = {
  name: 'Starter Village',
  width: 20,
  height: 15,
  layers: [
    {
      name: 'ground',
      data: [
        // Map data (simplified)
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,4,4,4,1,1,5,5,5,1,1,4,4,4,4,1,1,1,1,1,
        1,4,1,4,1,1,5,5,5,1,1,4,1,1,4,1,1,1,1,1,
        1,4,4,4,1,1,1,1,1,1,1,4,4,4,4,1,2,2,1,1,
        1,1,1,1,1,1,4,4,4,1,1,1,1,1,1,1,2,2,1,1,
        1,4,4,4,1,1,4,4,4,1,1,1,1,1,1,1,2,2,1,1,
        1,4,1,4,1,1,1,1,1,1,1,4,4,4,1,1,1,1,1,1,
        1,4,4,4,1,1,1,1,1,1,1,4,1,1,4,1,1,1,1,1,
        // ... more map data
      ]
    }
  ],
  npcs: [
    {
      id: 'village_chief',
      name: 'Village Chief',
      x: 5,
      y: 5,
      color: '#ffd93d',
      dialogue: DIALOGUES.villageChief.firstMeeting,
      direction: 'DOWN'
    },
    {
      id: 'shopkeeper',
      name: 'Shopkeeper',
      x: 15,
      y: 8,
      color: '#6bcf7f',
      dialogue: DIALOGUES.shopkeeper,
      direction: 'DOWN'
    }
  ],
  exits: [
    {x: 10, y: 0, to: 'forest_map', spawnX: 5, spawnY: 14}
  ]
};
+ +
+📁 Teammate E: battle UI code + +```html + + +``` + +```css +/* battle.css - Battle screen styles */ +.battle-screen { + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + background: linear-gradient(180deg, #87ceeb 0%, #e0f7fa 50%, #4a5568 50%, #2d3748 100%); + display: flex; + flex-direction: column; +} + +.enemy-area { + flex: 1; + display: flex; + justify-content: center; + align-items: center; + padding: 40px; +} + +.monster-sprite canvas { + image-rendering: pixelated; + filter: drop-shadow(0 4px 8px rgba(0,0,0,0.3)); + animation: float 2s ease-in-out infinite; +} + +@keyframes float { + 0%, 100% { transform: translateY(0); } + 50% { transform: translateY(-10px); } +} + +.monster-info { + margin-left: 40px; + text-align: center; +} + +.monster-info .name { + font-size: 24px; + font-weight: bold; + color: #2d3748; +} + +.monster-info .level { + font-size: 14px; + color: #718096; + margin: 8px 0; +} + +.hp-bar { + width: 200px; + height: 20px; + background: #e2e8f0; + border-radius: 10px; + overflow: hidden; + border: 2px solid #4a5568; +} + +.hp-fill { + height: 100%; + background: linear-gradient(90deg, #48bb78, #38a169); + transition: width 0.3s ease; +} + +.hp-text { + margin-top: 8px; + font-size: 14px; + color: #4a5568; +} + +.player-area { + flex: 1; + display: flex; + justify-content: space-between; + align-items: flex-end; + padding: 40px; +} + +.player-info { + background: rgba(255,255,255,0.9); + border-radius: 12px; + padding: 20px; + border: 3px solid #4a5568; +} + +.exp-bar { + width: 200px; + height: 8px; + background: #e2e8f0; + border-radius: 4px; + margin-top: 8px; +} + +.exp-fill { + height: 100%; + background: linear-gradient(90deg, #4299e1, #3182ce); + border-radius: 4px; +} + +.battle-menu { + background: rgba(255,255,255,0.95); + border: 3px solid #4a5568; + border-radius: 12px; + padding: 20px; + margin: 0 40px 40px; +} + +.menu-row { + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 12px; +} + 
+.menu-btn { + padding: 16px 24px; + font-size: 18px; + font-weight: bold; + background: linear-gradient(180deg, #fff 0%, #e2e8f0 100%); + border: 2px solid #4a5568; + border-radius: 8px; + cursor: pointer; + transition: all 0.2s; +} + +.menu-btn:hover { + background: linear-gradient(180deg, #4299e1 0%, #3182ce 100%); + color: white; + transform: translateY(-2px); + box-shadow: 0 4px 8px rgba(0,0,0,0.2); +} + +.battle-log { + position: absolute; + bottom: 120px; + left: 40px; + right: 40px; + max-height: 100px; + overflow-y: auto; + background: rgba(0,0,0,0.7); + border-radius: 8px; + padding: 12px; +} + +#battleLog { + color: #fff; + font-size: 14px; + line-height: 1.8; +} + +.log-entry { + margin-bottom: 4px; + opacity: 0; + animation: fadeIn 0.3s forwards; +} + +@keyframes fadeIn { + to { opacity: 1; } +} + +/* Hit animation */ +@keyframes shake { + 0%, 100% { transform: translateX(0); } + 25% { transform: translateX(-5px); } + 75% { transform: translateX(5px); } +} + +.shake { + animation: shake 0.3s ease-in-out; +} + +/* Attack animation */ +@keyframes attackRight { + 0% { transform: translateX(0); } + 50% { transform: translateX(30px); } + 100% { transform: translateX(0); } +} + +.attack-right { + animation: attackRight 0.3s ease-in-out; +} +``` + +
+ +
// audio.js - Audio system

/**
 * Synthesises background music and sound effects with Web Audio oscillators;
 * no audio files are loaded.
 */
class AudioManager {
  constructor() {
    this.audioContext = null;
    this.sounds = {};
    this.musicVolume = 0.3;
    this.sfxVolume = 0.5;
    this.currentBgm = null;
    // Handle of the pending "next BGM note" timeout, or null when no loop is scheduled.
    this.bgmTimer = null;
  }

  /** Initialize audio context lazily and resume it if it is suspended. */
  init() {
    if (!this.audioContext) {
      this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
    }
    if (this.audioContext.state === 'suspended') {
      this.audioContext.resume();
    }
  }

  /**
   * Play background music. Requesting the track that is already playing is a no-op.
   * @param {string} bgmName - 'battle', 'village' or 'victory'; others play 'village'.
   */
  playBgm(bgmName) {
    if (this.currentBgm === bgmName) return;

    this.stopBgm();
    // The context must exist before the first note; callers such as a battle
    // start may request music before any sound effect has created it.
    this.init();

    // Use oscillators to generate simple BGM
    this.currentBgm = bgmName;
    this.playGeneratedBgm(bgmName);
  }

  /**
   * Generate simple background music: one note every 500 ms, looping the melody
   * for as long as `currentBgm` still equals `type`.
   * @param {string} type - Melody name.
   */
  playGeneratedBgm(type) {
    const melodies = {
      battle: [262, 294, 330, 262, 294, 330, 349, 330],
      village: [330, 349, 392, 349, 330, 294, 262, 294],
      victory: [392, 440, 494, 523, 494, 440, 392, 349]
    };

    const melody = melodies[type] || melodies.village;
    let noteIndex = 0;

    // Never let two note loops run at once.
    this.clearBgmTimer();

    const playNote = () => {
      this.bgmTimer = null;
      if (this.currentBgm !== type) return;

      this.playTone({
        frequency: melody[noteIndex],
        type: 'triangle',
        volume: this.musicVolume,
        duration: 0.4
      });

      noteIndex = (noteIndex + 1) % melody.length;
      this.bgmTimer = setTimeout(playNote, 500);
    };

    playNote();
  }

  /** Cancel the pending BGM note, if any. */
  clearBgmTimer() {
    if (this.bgmTimer !== null) {
      clearTimeout(this.bgmTimer);
      this.bgmTimer = null;
    }
  }

  /** Stop background music. */
  stopBgm() {
    this.currentBgm = null;
    // Cancel the timer too: otherwise restarting the same track within 500 ms
    // would leave the old loop alive next to the new one.
    this.clearBgmTimer();
  }

  /**
   * Play sound effect. Unknown names are ignored.
   * @param {string} sfxName - 'attack', 'hit', 'victory', 'levelup' or 'dialog'.
   */
  playSfx(sfxName) {
    this.init();

    switch (sfxName) {
      case 'attack':
        this.playAttackSound();
        break;
      case 'hit':
        this.playHitSound();
        break;
      case 'victory':
        this.playVictorySound();
        break;
      case 'levelup':
        this.playLevelUpSound();
        break;
      case 'dialog':
        this.playDialogSound();
        break;
      default:
        break;
    }
  }

  /**
   * Play one oscillator tone whose gain decays exponentially to 0.01.
   * @param {object} tone
   * @param {number} tone.frequency - Start frequency in Hz.
   * @param {?number} [tone.endFrequency=null] - When set, the pitch glides
   *   exponentially to this frequency over the tone's duration.
   * @param {string} tone.type - Oscillator waveform.
   * @param {number} tone.volume - Initial gain.
   * @param {number} tone.duration - Length in seconds.
   */
  playTone({ frequency, endFrequency = null, type, volume, duration }) {
    this.init();

    const ctx = this.audioContext;
    const now = ctx.currentTime;
    const osc = ctx.createOscillator();
    const gain = ctx.createGain();

    osc.connect(gain);
    gain.connect(ctx.destination);

    osc.type = type;
    if (endFrequency === null) {
      osc.frequency.value = frequency;
    } else {
      osc.frequency.setValueAtTime(frequency, now);
      osc.frequency.exponentialRampToValueAtTime(endFrequency, now + duration);
    }

    gain.gain.setValueAtTime(volume, now);
    // Ramp target is 0.01 rather than 0: an exponential ramp cannot reach zero.
    gain.gain.exponentialRampToValueAtTime(0.01, now + duration);

    osc.start(now);
    osc.stop(now + duration);
  }

  /** Play `notes` one after another, `spacingMs` apart. */
  playArpeggio(notes, type, spacingMs, duration) {
    notes.forEach((frequency, i) => {
      setTimeout(() => {
        this.playTone({ frequency, type, volume: this.sfxVolume, duration });
      }, i * spacingMs);
    });
  }

  /** Attack sound effect: short sawtooth sweep from 200 Hz down to 100 Hz. */
  playAttackSound() {
    this.playTone({
      frequency: 200,
      endFrequency: 100,
      type: 'sawtooth',
      volume: this.sfxVolume,
      duration: 0.1
    });
  }

  /** Hit sound effect: 100 Hz square-wave thud. */
  playHitSound() {
    this.playTone({
      frequency: 100,
      type: 'square',
      volume: this.sfxVolume * 0.8,
      duration: 0.2
    });
  }

  /** Victory sound effect: four rising sine notes. */
  playVictorySound() {
    this.playArpeggio([523, 659, 784, 1047], 'sine', 150, 0.5);
  }

  /** Level-up sound effect: five rising triangle notes. */
  playLevelUpSound() {
    this.playArpeggio([392, 523, 659, 784, 1047], 'triangle', 100, 0.3);
  }

  /** Dialogue sound effect: very short, quiet 800 Hz blip. */
  playDialogSound() {
    this.playTone({
      frequency: 800,
      type: 'sine',
      volume: this.sfxVolume * 0.3,
      duration: 0.05
    });
  }
}
+ +**Collaboration dialogue between members**: + +``` +Teammate B → Teammate C: +"The battle system is done. When the player wins, it calls giveExp() to level up. +Please check the quest system and make sure level-up data is saved correctly." + +Teammate C → Teammate B: +"Got it. The quest system stores game data with localStorage, +including level, experience, and the list of completed quests. I'll add an autosave mechanism." + +Teammate D → All: +"The map rendering system is finished, and the NPC facing data is now connected to the dialogue system. +When the player faces an NPC, dialogue will trigger automatically. Please confirm the trigger logic in the dialogue system." + +Teammate C → Teammate D: +"Confirmed. DialogSystem has a showDialog() method that can accept a dialogue array. +I'll make sure all NPC dialogue data follows that format." + +Teammate E → Teammate B: +"The battle UI is finished, but I need real-time player and monster data to update the HP bars. +Does the battle system provide a callback?" + +Teammate B → Teammate E: +"Yes. BattleSystem has an onUpdate callback that fires at the end of each turn. +You can register that callback to update the UI." + +Teammate E → Teammate D: +"When switching maps, we need to reposition the camera. +Does MapRenderer provide an updateCamera() method?" + +Teammate D → Teammate E: +"Yes. updateCamera() is called automatically after every loadMap(). +You can also call it manually after the player moves to smoothly update the camera." +``` + +**Phase 3: integration and testing** + +After all components are complete, Team Lead is responsible for integration: + +
// game.js - Main game controller

/**
 * Top-level controller that wires the map renderer, battle, dialogue, quest
 * and audio systems together and routes keyboard input by game state.
 *
 * Relies on globals provided by the other project files: MapRenderer,
 * DialogSystem, QuestSystem, AudioManager, BattleSystem, VILLAGE_MAP and
 * MONSTER_DATA, plus the DOM elements referenced by id below.
 */
class Game {
  /**
   * Builds every subsystem, restores the previous save (if any), hooks up
   * keyboard input and starts the render loop.
   */
  constructor() {
    this.state = 'map'; // map, battle, dialog, menu
    this.canvas = document.getElementById('gameCanvas');
    this.ctx = this.canvas.getContext('2d');

    // Initialize each system
    this.player = this.createPlayer();
    this.mapRenderer = new MapRenderer(this.canvas);
    this.battleSystem = null;
    this.dialogSystem = new DialogSystem();
    this.questSystem = new QuestSystem();
    this.audioManager = new AudioManager();

    // Restore the save BEFORE handing the player to the renderer, so the
    // renderer and the controller always share the same player object.
    this.currentMapId = 'village';
    this.loadGame();

    // Load map
    // NOTE(review): only VILLAGE_MAP is visible here, so a restored
    // currentMapId other than 'village' is not reflected in the loaded map.
    this.mapRenderer.loadMap(VILLAGE_MAP);
    this.mapRenderer.player = this.player;

    // Input handling
    this.setupInput();

    // Start game loop
    this.lastTime = 0;
    this.gameLoop = this.gameLoop.bind(this);
    requestAnimationFrame(this.gameLoop);
  }

  /**
   * Creates the default player record used for a new game.
   * @returns {object} A fresh player with starting stats, position and items.
   */
  createPlayer() {
    return {
      name: 'Hero',
      level: 1,
      exp: 0,
      maxExp: 100,
      hp: 50,
      maxHp: 50,
      attack: 15,
      defense: 10,
      skills: [
        {id: 'tackle', name: 'Tackle', type: 'attack', power: 40, accuracy: 100, pp: 35}
      ],
      x: 10,
      y: 7,
      direction: 'DOWN',
      gold: 100,
      items: ['potion', 'potion', 'antidote']
    };
  }

  /** Registers a single keydown listener that dispatches by current state. */
  setupInput() {
    document.addEventListener('keydown', (e) => {
      if (this.state === 'map') {
        this.handleMapInput(e);
      } else if (this.state === 'dialog') {
        this.handleDialogInput(e);
      } else if (this.state === 'battle') {
        this.handleBattleInput(e);
      }
    });
  }

  /**
   * Handles movement keys on the map. While a dialogue is showing, Space and
   * Enter advance it and all other keys are ignored.
   * @param {KeyboardEvent} e - The keydown event.
   */
  handleMapInput(e) {
    if (this.dialogSystem.isShowing) {
      if (e.key === ' ' || e.key === 'Enter') {
        this.dialogSystem.next();
      }
      return;
    }

    let dx = 0;
    let dy = 0;
    switch (e.key) {
      case 'ArrowUp': case 'w': dy = -1; this.player.direction = 'UP'; break;
      case 'ArrowDown': case 's': dy = 1; this.player.direction = 'DOWN'; break;
      case 'ArrowLeft': case 'a': dx = -1; this.player.direction = 'LEFT'; break;
      case 'ArrowRight': case 'd': dx = 1; this.player.direction = 'RIGHT'; break;
      default: return;
    }

    const newX = this.player.x + dx;
    const newY = this.player.y + dy;

    if (!this.mapRenderer.checkCollision(newX, newY)) {
      this.player.x = newX;
      this.player.y = newY;
      this.mapRenderer.updateCamera();

      // Check random battle (5% chance per successful step)
      if (Math.random() < 0.05) {
        this.startBattle();
      }

      // Save game
      this.saveGame();
    }
  }

  /**
   * Advances the dialogue on Space/Enter and returns to the map once the
   * dialogue system reports it is no longer showing.
   * @param {KeyboardEvent} e - The keydown event.
   */
  handleDialogInput(e) {
    if (e.key === ' ' || e.key === 'Enter') {
      this.dialogSystem.next();
      if (!this.dialogSystem.isShowing) {
        this.state = 'map';
      }
    }
  }

  /**
   * Battle input handling. Currently only guards against input when there is
   * no battle or it is not the player's turn; no actions are dispatched yet.
   * @param {KeyboardEvent} e - The keydown event.
   */
  handleBattleInput(e) {
    if (!this.battleSystem) return;
    if (this.battleSystem.turn !== 'player') return;
  }

  /**
   * Starts a battle and switches the screen to the battle view.
   * @param {object} [monsterData] - Monster template (name, baseHp, baseAtk,
   *   baseDef, ...). When omitted, a random entry of MONSTER_DATA is used.
   */
  startBattle(monsterData) {
    // Honor an explicit template; otherwise randomly choose a monster.
    const template = monsterData != null
      ? monsterData
      : MONSTER_DATA[Math.floor(Math.random() * MONSTER_DATA.length)];
    // Nothing to fight (e.g. empty MONSTER_DATA): stay on the map.
    if (!template) return;

    // Stats grow with the player's level by a per-stat growth factor.
    const playerLevel = this.player.level;
    const scaled = (base, growth) => base + base * growth * playerLevel;
    const maxHp = scaled(template.baseHp, 0.2);

    // Create monster instance
    const monster = {
      ...template,
      level: Math.max(1, playerLevel + Math.floor(Math.random() * 3) - 1),
      hp: maxHp,
      maxHp,
      attack: scaled(template.baseAtk, 0.15),
      defense: scaled(template.baseDef, 0.1)
    };

    this.battleSystem = new BattleSystem(this.player, monster);
    this.state = 'battle';

    // Play battle music
    this.audioManager.playBgm('battle');

    // Show battle screen
    document.getElementById('battleScreen').classList.remove('hidden');
    document.getElementById('mapScreen').classList.add('hidden');

    // Update battle UI
    this.updateBattleUI();
  }

  /**
   * Refreshes names, levels, HP numbers/bars and the battle log from the
   * current battle. Safe to call repeatedly: the log is re-rendered from
   * scratch, so entries are never duplicated.
   */
  updateBattleUI() {
    if (!this.battleSystem) return;

    const player = this.battleSystem.player;
    const monster = this.battleSystem.monster;

    document.getElementById('playerName').textContent = player.name;
    document.getElementById('playerLevel').textContent = player.level;
    document.getElementById('playerHp').textContent = Math.floor(player.hp);
    document.getElementById('playerMaxHp').textContent = player.maxHp;
    document.getElementById('playerHpBar').style.width =
      (player.hp / player.maxHp * 100) + '%';

    document.getElementById('enemyName').textContent = monster.name;
    document.getElementById('enemyLevel').textContent = monster.level;
    document.getElementById('enemyHp').textContent = Math.floor(monster.hp);
    document.getElementById('enemyMaxHp').textContent = Math.floor(monster.maxHp);
    document.getElementById('enemyHpBar').style.width =
      (monster.hp / monster.maxHp * 100) + '%';

    // Update battle log: replace (not append to) the previous rendering.
    const logEl = document.getElementById('battleLog');
    const entries = this.battleSystem.log.map((line) => {
      const entry = document.createElement('div');
      entry.className = 'log-entry';
      entry.textContent = line;
      return entry;
    });
    logEl.replaceChildren(...entries);
    logEl.scrollTop = logEl.scrollHeight;
  }

  /** Ends the current battle, returns to the map view and saves. */
  endBattle() {
    this.state = 'map';
    this.battleSystem = null;

    // Hide battle screen
    document.getElementById('battleScreen').classList.add('hidden');
    document.getElementById('mapScreen').classList.remove('hidden');

    // Play map music
    this.audioManager.playBgm('village');

    // Save game
    this.saveGame();
  }

  /**
   * Persists player, map id and completed quests under the 'rpgSave' key.
   * Saving is best-effort: storage errors are logged instead of interrupting
   * gameplay.
   */
  saveGame() {
    const saveData = {
      player: this.player,
      currentMapId: this.currentMapId,
      completedQuests: this.questSystem.completedQuests,
      timestamp: Date.now()
    };

    try {
      localStorage.setItem('rpgSave', JSON.stringify(saveData));
    } catch (err) {
      console.warn('Could not save game:', err);
    }
  }

  /**
   * Restores a previous save, if one exists and is readable. The saved
   * player fields are merged INTO the existing player object so that other
   * holders of that object keep seeing the current data. Unreadable or
   * corrupted saves are logged and ignored.
   */
  loadGame() {
    let data;
    try {
      const saveData = localStorage.getItem('rpgSave');
      if (!saveData) return;
      data = JSON.parse(saveData);
    } catch (err) {
      console.warn('Ignoring unreadable save data:', err);
      return;
    }
    if (data === null || typeof data !== 'object') return;

    Object.assign(this.player, data.player);
    this.questSystem.completedQuests = data.completedQuests || [];
    this.currentMapId = data.currentMapId || 'village';
  }

  /**
   * Main game loop: clears the canvas, renders the map when in map state and
   * schedules the next frame.
   * @param {number} timestamp - Frame time supplied by requestAnimationFrame.
   */
  gameLoop(timestamp) {
    this.lastTime = timestamp;

    // Clear canvas
    this.ctx.fillStyle = '#000';
    this.ctx.fillRect(0, 0, this.canvas.width, this.canvas.height);

    // Render by state
    if (this.state === 'map') {
      this.mapRenderer.render();
    }

    requestAnimationFrame(this.gameLoop);
  }
}

// Start the game (guarded so the file can also be loaded outside a browser,
// e.g. from a unit test)
if (typeof window !== 'undefined') {
  window.addEventListener('DOMContentLoaded', () => {
    window.game = new Game();
  });
}
+ +**Final result**: + +After about 1 to 2 hours, a fully functional Pokemon-style RPG is complete! + +``` +Project summary: +✅ Game architecture design - Teammate A +✅ Turn-based battle system - Teammate B +✅ Dialogue and quest system - Teammate C +✅ 2D map rendering - Teammate D +✅ UI and sound effects - Teammate E + +Project files: +├── index.html (120 lines) +├── css/ +│ ├── main.css (100 lines) +│ ├── battle.css (180 lines) +│ └── dialog.css (80 lines) +├── js/ +│ ├── game.js (250 lines) +│ ├── state.js (60 lines) +│ ├── player.js (50 lines) +│ ├── monster.js (80 lines) +│ ├── battle.js (220 lines) +│ ├── dialog.js (180 lines) +│ ├── map.js (280 lines) +│ └── audio.js (150 lines) +└── data/ + ├── monsters.js (100 lines) + ├── skills.js (80 lines) + └── dialogues.js (120 lines) + +Total: about 2050 lines of code, completed collaboratively by 5 AI team members! + +Game features: +🎮 Turn-based battle system (attack, skills, items, flee) +💬 NPC dialogue system (typewriter effect, branching choices) +📜 Quest system (accept quests, update progress, completion rewards) +🗺️ 2D map exploration (multi-scene transitions, NPC interaction) +💾 Autosave (progress stored with localStorage) +🔊 Sound effects and BGM (Web Audio API) +📊 Character growth (experience, leveling up, stat increases) +``` + +**Observe the team at work**: + +If you configured tmux split-pane mode, you will see multiple terminal windows working at the same time: + +``` +┌─────────────────┬─────────────────┬─────────────────┐ +│ Teammate B │ Teammate C │ Teammate D │ +│ Implementing │ Writing │ Rendering │ +│ damage formula │ dialogue │ tiles │ +│ │ scripts │ │ +│ "Teammate E, │ "Is │ "The monsters │ +│ is the HP bar │ MapRenderer │ need attack │ +│ width a │ ready yet?" │ animations..."│ +│ percentage?" │ │ │ +└─────────────────┴─────────────────┴─────────────────┘ +``` + +**Key takeaways**: + +This hands-on example shows several core advantages of Agent Teams: + +1. 
**True parallel development**: 5 members develop different game systems at the same time +2. **Complex project management**: 2000+ lines of code are split and integrated in a structured way +3. **Specialized division of labor**: battle, dialogue, maps, and UI each have a dedicated owner +4. **Interface coordination**: members negotiate interfaces and data formats through the messaging system +5. **Fast delivery**: work that could take one person weeks can be completed by the team in a few hours + +You can try running this game yourself and experience how an AI team collaborates to build a Pokemon-style RPG. + +--- + +### Single prompt vs Agent Teams: test it yourself + +To help you feel the power of Agent Teams more directly, we prepared two test plans that you can try yourself and compare. + +#### Test plan A: single prompt approach + +This is the traditional approach: use one complete prompt and ask AI to develop the game. + +**Type this in Claude Code**: + +``` +Help me build a Pokemon-style web RPG game with the following features: +- Character system (level, HP, attack, defense) +- Turn-based battle system (attack, skills, items, flee) +- NPC dialogue system +- 2D map exploration +- Save system +- Audio system + +Use React + TypeScript + Vite + Tailwind CSS. +Please give me complete code that can run directly. 
+``` + +**Expected result**: + +| Item | Expected situation | +|------|---------| +| **Code quality** | AI will try to generate all the code, but because of context limits, many details will be omitted or replaced with comments | +| **Feature completeness** | Core features may be present, but many advanced features will be missing or simplified | +| **Run-ability** | There may be bugs, and you may need several rounds of debugging before it runs | +| **Development time** | One conversation may take 30 to 60 minutes, with multiple back-and-forth rounds | +| **Code volume** | About 500 to 800 lines, because AI tends to compress code | + +**Problems you may encounter**: + +1. **Code gets cut off**: AI responses have length limits, so generation may stop halfway through +2. **Incomplete features**: the dialogue system may be only a basic version with no quest system +3. **Missing details**: the audio system may be left as a TODO comment +4. **Hard to debug**: if code has problems, you must ask AI to fix it in the same conversation, and the context becomes increasingly messy + +#### Test plan B: Agent Teams approach + +This is the approach introduced in this article: let multiple AI team members collaborate on development. + +**Type this in Claude Code** (after enabling Agent Teams): + +``` +I want to build a Pokemon-style web RPG game. 
+ +Create a team to collaborate on development: + +Team member responsibilities: +- Teammate A (Game Architect): design the overall architecture, define the game state machine, and plan the data structures +- Teammate B (Battle System): implement turn-based combat logic, the skill system, and damage calculation +- Teammate C (Dialogue System): implement NPC dialogue, the quest system, and story scripts +- Teammate D (Map Rendering): use Canvas to implement 2D map rendering, character movement, and scene transitions +- Teammate E (UI & Audio): design the game interface, battle UI, and sound playback + +Technical requirements: +- Use plain HTML/CSS/JavaScript +- Use Canvas to render the game screen +- Turn-based battle system +- Save data with localStorage +- Use the Web Audio API for sound + +Use Sonnet for each member, and Opus for the Team Lead. + +First ask the architect to design the overall solution. After the data structures are defined, let the other members develop in parallel. +``` + +**Expected result**: + +| Item | Expected situation | +|------|---------| +| **Code quality** | Every member focuses on its own area, so the code is more professional and complete | +| **Feature completeness** | All features are implemented more fully, including the quest system and multi-scene maps | +| **Run-ability** | Members cross-check interfaces with each other, so integration issues are fewer | +| **Development time** | About 1 to 2 hours to complete all features because development happens in parallel | +| **Code volume** | About 2000+ lines, with a complete implementation instead of compressed code | + +#### Quantitative comparison table + +| Dimension | Single Prompt | Agent Teams | +|---------|-------------|-------------| +| **Total lines of code** | 500-800 lines | 2000+ lines | +| **Development time** | 30-60 minutes, but features are incomplete | 1-2 hours, with complete features | +| **Feature completeness** | 60-70% | 95%+ | +| **Maintainability** | Medium, 
usually one large file | High, with modular design | +| **Bug count** | Higher, because there is less validation | Lower, because members cross-check each other | +| **Future extensibility** | Difficult, because code is tightly coupled | Easier, because the structure is modular | +| **Token usage** | ~50K tokens | ~200K tokens (5 members) | +| **Cost** | ~$0.50 | ~$2.00 | + +#### Suggested real-world test process + +**Step 1: test the single-prompt approach first** + +``` +1. Open a new Claude Code conversation +2. Use the prompt from "Test Plan A" above +3. Record: how long did it take? How many lines of code were produced? Which features were missing? +``` + +**Step 2: then test the Agent Teams approach** + +``` +1. Confirm that Agent Teams has been enabled +2. Use the prompt from "Test Plan B" above +3. Observe: how do team members collaborate? Is the code more complete? +``` + +**Step 3: compare the two results** + +``` +1. Run both versions of the code separately +2. Compare the feature lists: which features are missing in the single-prompt version? +3. Compare the code structure: is the Agent Teams version more modular? +4. Evaluate: if you wanted to continue developing this game, which version would be easier to extend? +``` + +#### Why do these differences happen? + +**Limitations of the single-prompt approach**: + +1. **Context pressure**: AI must handle everything in a single response, so simplification is inevitable +2. **Scattered attention**: battle, dialogue, map, and UI all compete for attention, so details are easy to miss +3. **No collaborative validation**: nobody checks whether interfaces match, so bugs are more likely + +**Advantages of Agent Teams**: + +1. **Specialized division of labor**: each member focuses on one area and can go deep into the details +2. **Parallel processing**: battle, dialogue, and map development happen at the same time, improving efficiency +3. 
**Mutual validation**: members negotiate interfaces with each other, reducing integration problems +4. **Independent context**: every member has its own 200K context and does not interfere with the others + +#### Conclusion + +The core value of Agent Teams is not simply that it is "faster," but that it is **"more complete and more professional."** + +- For simple projects such as Snake, a single prompt is enough +- For complex projects such as a Pokemon RPG, Agent Teams can produce better results + +The key is to **choose the right tool**: do not use Agent Teams to rename a variable, and do not use a single prompt to build a complete RPG game. + +--- + +## Best practices + +Agent Teams is a powerful tool, but to use it well, you need to understand some best practices. These lessons come from real-world experience in the community and can help you avoid common pitfalls while getting the most value from team collaboration. + +### Practice 1: contract-first + +Before multiple Agents begin working in parallel, spend time defining a clear "contract," meaning the interface agreement. + +**Why it matters**: + +Suppose Teammate A is responsible for the backend API and Teammate B is responsible for the frontend integration. If they start at the same time without agreeing on the interface format first, something like this can happen: + +``` +Teammate A: implemented POST /api/login and expects {username, password} +Teammate B: implemented the frontend call and sends {user, pass} +Result: they do not match, and rework is required +``` + +**How to do it**: + +Before starting the team, first ask Claude to design the interfaces: + +``` +Do not start development yet. First help me design the interfaces for the user authentication system: + +1. The request and response formats for the login interface +2. The request and response formats for the registration interface +3. The password reset flow and interfaces +4. 
The error-handling conventions + +Write these interfaces down clearly, and only then let the team begin development. +``` + +**A contract should include**: + +- Function signatures and data structures +- Input and output JSON formats +- Meanings of HTTP status codes +- Error-handling conventions +- Field validation rules + +### Practice 2: assign models wisely + +Different tasks require different models. Good model assignment helps balance quality and cost. + +**Use Opus for the Team Lead**: + +The Team Lead handles task decomposition and result synthesis, which require stronger reasoning ability, so Opus is recommended: + +``` +Create a team where the Team Lead uses Opus for overall planning and final review. +The Teammates use Sonnet for implementation work. +``` + +**Use Sonnet for Teammates**: + +For concrete coding and testing work, Sonnet is entirely capable and significantly cheaper: + +- Opus 4.6: around $15 per million input tokens +- Sonnet 4.5: around $3 per million input tokens + +Using Sonnet for members can significantly reduce overall cost. + +**Use Haiku for special cases**: + +For simple tasks such as documentation updates or small test-writing tasks, you can consider Haiku, around $0.80 per million input tokens. + +### Practice 3: control task granularity + +Tasks that are too large or too small both hurt efficiency. You need to find the right granularity. + +**Rule of thumb**: + +Each task should be something one member can complete independently in **15 to 30 minutes**. + +**Task too large**: + +``` +Bad: implement the user authentication system +``` + +This task is too broad. It contains several subtasks, and one person would need a long time to finish it, which wastes the advantage of parallelism. + +**Task too small**: + +``` +Bad: create an empty file called auth.js +``` + +This task is too tiny. Members spend more time coordinating than doing actual work. 
+ +**Appropriate granularity**: + +``` +Good: implement the login API, including: +1. The POST /api/login endpoint +2. Username and password validation +3. JWT token response +4. Error handling +``` + +This task has clear boundaries and deliverables. One person can finish it independently, and it is not overly fragmented. + +**Recommended setup**: + +Let each member own **5 to 6 medium-sized tasks**. This gives enough parallelism without making coordination costs too high. + +### Practice 4: avoid file conflicts + +Multiple members modifying the same file at the same time is the most common problem in Agent Teams. + +**Assignment principle**: + +Try to let different members own **different files**: + +``` +Good: +- Teammate A: owns all files under src/auth/ +- Teammate B: owns all files under src/api/ +- Teammate C: owns all files under tests/auth/ + +Bad: +- Teammate A and Teammate B both modify src/app.js +``` + +**If the same file must be modified**: + +Design a serial editing phase: + +``` +Phase 1 (parallel): +- Teammate A: analyze what functionality needs to be added to auth.js +- Teammate B: design the new feature interface +- Teammate C: write the test cases + +Phase 2 (serial): +- Team Lead synthesizes all inputs +- One member modifies auth.js in a single integrated pass +``` + +### Practice 5: provide rich initial context + +When Teammates start, their conversation history is empty. They do not know what the Team Lead and the user discussed before. + +**Wrong approach**: + +``` +Create the team and let the members start working. +``` + +Members will start in a fog: what project is this? What tech stack is it using? What exactly should they build? + +**Correct approach**: + +``` +This is a React + Node.js e-commerce project using TypeScript. 
+ +The project structure is: +- src/frontend/: React frontend code +- src/backend/: Node.js backend code +- prisma/: database models + +Code style: +- Use function components and Hooks +- Use Express.js on the backend +- Use PostgreSQL for the database + +Now create a team and have the members add user authentication under src/auth/. +``` + +Only with sufficient context can members work efficiently. + +### Practice 6: research before implementation + +Do not let members start coding immediately. Ask them to research and design the solution first. + +**Two-phase process**: + +**Phase 1: research and design** + +``` +Create a team. In phase one, do research: +- One member investigates existing authentication approaches (JWT vs Session) +- One member analyzes the project's tech stack and determines best practices +- One member designs the database schema + +After the research is complete, let the members discuss through the messaging system and settle on a final plan. +``` + +**Phase 2: implementation** + +``` +After the plan is finalized, begin implementation: +- One member implements the backend authentication logic +- One member implements the frontend login page +- One member writes tests +``` + +The benefit of doing it this way is that you can **discover architecture mismatches early**, instead of realizing halfway through implementation that the plan does not work. + +### Practice 7: monitor and intervene actively + +Even if you configured automation, you should still actively monitor the team's work status. + +**Use split-pane mode**: + +If you configured tmux panes, you can see all members' output in real time: + +``` +┌─────────────────┬─────────────────┐ +│ Teammate 1 │ Teammate 2 │ +│ Analyzing code │ Implementing │ +│ ... │ API... │ +│ │ │ +│ Wait, this │ │ +│ approach seems │ │ +│ wrong... 
│ │ +└─────────────────┴─────────────────┘ +``` + +When you notice that a member is going in the wrong direction, you can intervene quickly: + +``` +@Teammate1 Stop for a moment. Your analysis is headed in the wrong direction. The authentication module should be under src/auth/, not src/user/. +``` + +**Check task status regularly**: + +Use the TaskList command to inspect the status of all tasks: + +``` +/tasks +``` + +This shows all task states so you can see what is completed, what is still running, and what is blocked. + +--- + +## Suitable scenarios + +Agent Teams is powerful, but not every task is suitable for it. Understanding the right scenarios helps you choose correctly. + +### Scenarios where Agent Teams fits well + +**Complex system refactors** + +When the refactor spans multiple modules with clear boundaries: + +``` +Scenario: split a monolithic application into microservices + +Create a team: +- Teammate A: analyze dependencies in the user module +- Teammate B: analyze dependencies in the order module +- Teammate C: analyze dependencies in the payment module +- Teammate D: design the inter-service communication protocol +``` + +These modules can be analyzed simultaneously, and the final result can be synthesized later, which is much faster than analyzing them serially. + +**Multi-angle code review** + +When you need to review code from several dimensions: + +``` +Scenario: conduct a full security review of the payment module + +Create a team: +- Teammate A: focus on security vulnerabilities (SQL injection, XSS, etc.) +- Teammate B: inspect performance issues (N+1 queries, memory leaks, etc.) +- Teammate C: verify completeness of error handling +- Teammate D: evaluate test coverage +``` + +Each member focuses on one dimension, making the review deeper, and the final report more complete. 
+ +**Parallel frontend and backend development** + +When you need to build frontend and backend at the same time: + +``` +Scenario: build a user management feature + +Create a team: +- Teammate A (frontend): implement the user list page +- Teammate B (frontend): implement the user edit page +- Teammate C (backend): implement the CRUD API +- Teammate D (coordination): design the API contract and make sure frontend and backend stay aligned +``` + +Frontend and backend can move in parallel as long as the API contract is defined first, following the contract-first principle. + +**Competitive debugging** + +When you have multiple possible solutions: + +``` +Scenario: fix a complex bug with two possible repair strategies + +Create a team: +- Teammate A: implement solution 1 +- Teammate B: implement solution 2 +- Teammate C: evaluate the pros and cons of both +``` + +Both solutions can be implemented and tested in parallel, and the better one can be chosen afterward. + +**Documentation generation** + +When you need to produce a large amount of documentation: + +``` +Scenario: write documentation for the whole project + +Create a team: +- Teammate A: write API documentation +- Teammate B: write the deployment guide +- Teammate C: write the development guide +- Teammate D: write the troubleshooting manual +``` + +Multiple documents can be written at the same time, greatly improving efficiency. + +### Scenarios where Agent Teams is not a good fit + +**Simple modification tasks** + +``` +Not suitable: variable renaming, single bug fixes, tiny feature additions +``` + +For these tasks, the cost of starting a team is greater than the actual work. + +**Highly serial tasks** + +``` +Not suitable: tasks that must happen strictly in sequence +``` + +If task B cannot start until task A finishes, there is no real space for parallelism. + +**Cost-sensitive tasks** + +Agent Teams consumes **2 to 4 times** the tokens of a single instance, depending on the team size. 
If cost is the primary concern, a single instance may be the better choice. + +### Decision flowchart + +``` +Are there multiple independent subtasks? + │ + ├─ No → Use a single instance + │ + └─ Yes → + │ + Can the subtasks be assigned to different files? + │ + ├─ No → Consider serial execution or split the task further + │ + └─ Yes → + │ + Is the cost acceptable (2-4x)? + │ + ├─ No → Use a single instance + │ + └─ Yes → Use Agent Teams ✓ +``` + +--- + +## Cost and performance + +Using Agent Teams increases cost, but it can also produce significant efficiency gains. Understanding this tradeoff helps you make informed decisions. + +### Cost analysis + +**Token consumption and team size** + +The token consumption of Agent Teams is roughly **linear** with team size: + +| Team size | Relative cost | Suitable scenario | +|---------|---------|---------| +| 1 person (single instance) | 1x | Simple tasks | +| 2-person team | 2-2.5x | Medium complexity | +| 3-person team | 3-4x | Complex tasks | +| 5+ person team | 5-6x+ | Large projects | + +**Why it is not perfectly linear**: + +- **Startup cost**: each member must receive initial context when it starts +- **Coordination cost**: communication between members through the messaging system also consumes tokens +- **Team Lead cost**: Team Lead usually uses Opus, which is more expensive + +**Concrete example numbers** (Claude 4.5 Sonnet): + +- Input: $3 per million tokens +- Output: $15 per million tokens + +Suppose a task requires: +- Team Lead (Opus, assuming $15 input / $75 output per million tokens): 50K input + 20K output ≈ $2.25 +- 3 Teammates (Sonnet): each 30K input + 15K output ≈ $0.315 × 3 ≈ $0.95 +- **Total**: about $3.20 + +The same task on a single Sonnet instance: +- 100K input + 50K output ≈ $1.05 + +**Cost multiplier**: about 3x + +**But time saved**: potentially reduced from 3 hours to 1 hour + +### Efficiency gains + +**Anthropic internal testing data**: + +- Large project refactors: around **50%** improvement in efficiency +- Parallel multi-module 
development: around **60-70%** improvement +- Documentation generation tasks: around **80%** improvement + +**Real case**: + +Anthropic's engineering team once used **16 parallel agents** to build a C compiler in about 2 weeks. The compiler, around 100,000 lines of Rust code, could compile the Linux 6.9 kernel and passed 99% of GCC tests. + +### Cost optimization strategies + +**Strategy 1: mix models** + +``` +Team Lead: Opus (strong reasoning needed) +Teammates: Sonnet (high value for cost) +Simple tasks: Haiku (cheapest) +``` + +**Strategy 2: adjust team size dynamically** + +``` +Analysis phase: 5-person team (multi-angle analysis) +Implementation phase: 3-person team (parallel coding) +Testing phase: 2-person team (testing and fixing) +``` + +**Strategy 3: use Agent Teams only in selected phases** + +Do not use Agent Teams for the entire project. Use it only in the most complex phases: + +``` +Phase 1 (requirements analysis): single instance +Phase 2 (architecture design): Agent Teams (multiple plans explored in parallel) +Phase 3 (coding): single instance +Phase 4 (code review): Agent Teams (multi-angle review) +Phase 5 (documentation): Agent Teams (parallel writing) +``` + +### When it is worth it + +**Worth it when**: + +- The project timeline is tight, and the value of efficiency gains exceeds the token cost +- The task is highly complex, and a single instance is likely to miss details +- You need multi-angle analysis and validation + +**Not worth it when**: + +- The task is simple, and the overhead of starting a team is too high +- Cost is highly sensitive and the token budget is limited +- The task is highly serial and offers no space for parallelism + +--- + +## Frequently asked questions + +### Q1: Is Agent Teams stable? Can it be used in production? + +Agent Teams is currently an **experimental feature**, so there may still be bugs and unstable behavior. 
Recommendations: + +- Back up important projects first +- Start with small projects so you can test and get familiar with it +- Follow official release notes to see improvements in new versions +- Report issues to the official team promptly when they appear + +### Q2: How many members can I create at most? + +There is no hard theoretical limit, but from a practical perspective: + +- Small projects: 2 to 3 people +- Medium projects: 3 to 5 people +- Large projects: 5 to 10 people + +Too many members introduce the following problems: + +- Coordination overhead rises sharply +- Token usage grows linearly +- File conflict probability increases +- Monitoring and management become harder + +### Q3: Can team members see each other's context? + +**No**. Every Teammate has a completely independent context window. They communicate through the messaging system rather than sharing context directly. + +This is a deliberate design choice, and the benefits are: + +- One member's reasoning is not polluted by another member's reasoning +- Context does not become chaotic because conversations are too long +- It is closer to how a real team works, where everyone has their own mind + +### Q4: How do I switch between different members? + +If split-pane mode is not configured, you can use shortcut keys: + +- `Shift+Up`: switch to the previous member +- `Shift+Down`: switch to the next member +- `Ctrl+O`: return to the Team Lead + +### Q5: What if a task fails? + +If one member's task fails: + +1. Check the cause of failure by reading that member's output log +2. Reassign the task to another member if needed +3. Intervene manually and help unblock the issue directly + +### Q6: Can I add or remove members midway through the process? + +Yes. You can issue commands to the Team Lead at any time: + +``` +Add a new member and let it handle XXX. +``` + +``` +Let Teammate 3 leave the team after finishing the current task. +``` + +### Q7: Can Agent Teams be used together with MCP and Skills? 
+ +Absolutely. In fact, they work even better together: + +- **Agent Teams + Skills**: each member can carry different skills +- **Agent Teams + MCP**: different members can access external resources through different MCP servers + +``` +Create a team: +- Teammate A: carries the frontend-design Skill and is responsible for UI +- Teammate B: accesses the repository through GitHub MCP and handles PR management +- Teammate C: queries data through Database MCP and handles analysis +``` + +--- + +## References + +### Official resources + +- [Official Claude Code documentation](https://docs.anthropic.com/en/docs/claude-code) - Complete Claude Code documentation +- [Anthropic engineering blog](https://www.anthropic.com/engineering) - Official technical blog and updates + +### Agent Teams tutorial collection + +**Complete guides in Chinese**: + +- [Claude Code Agent Teams complete guide: from introduction to hands-on practice](https://m.blog.csdn.net/u010634066/article/details/157903022) - Includes configuration details, hands-on examples, and the striking case where 16 parallel agents built a C compiler +- [Collaborative development with Claude Code Agent Team: a complete hands-on guide](https://m.blog.csdn.net/u010028049/article/details/158126612) - Full collaborative project workflow +- [Step-by-step guide to setting up and using Claude Code Agent Teams](https://cloud.tencent.com/developer/article/2630088) - Tencent Cloud tutorial with detailed setup instructions + +**Getting started in practice**: + +- [Hands-on with native Claude Code Agent Teams: from enabling it to running a three-person team](https://www.cnblogs.com/147api/p/19606317) - Three-person team walkthrough +- [Fresh beginner practice with Claude Code Agent Teams](https://m.toutiao.com/article/7606744384960266793/) - Beginner-friendly introduction with best practices such as contract-first +- [No more going solo: let 7 Claudes help you develop at the same time with Agent 
Teams](https://m.toutiao.com/a7605229732241736202/) - Case study of a 7-person team + +**Best practices**: + +- [Agent Teams best practices: contract-first, task granularity, and model assignment](https://blog.csdn.net/sinat_37574187/article/details/144727588) - Detailed explanation of 7 best practices +- [A seven-year big-tech veteran's Claude Code field manual: eight rules from beginner to expert](https://new.qq.com/rain/a/20260111A02HE900) - Enterprise-level real-world experience + +**Principles and comparisons**: + +- [Claude Code Agent Teams: the right way to do multi-agent collaboration](https://post.m.smzdm.com/p/adoezrmz/) - Deep analysis of multi-agent collaboration +- [Claude Code multi-agent team development: the complete guide from principles to pitfalls](https://m.toutiao.com/a7605229732241736202/) - Principles and pitfalls from real usage + +### Official guide translations + +- [Claude officially released the "Agent Building Guide" (with PDF download)](https://m.blog.csdn.net/sinat_37574187/article/details/144724124) - Official Agent Building Guide +- [Full translated version of Claude's official "Guide to Building Effective Agents"](https://m.blog.csdn.net/gyn_enyaer/article/details/144827922) - Full Chinese translation + +### Related technologies + +- [Agent Skills standard](https://agentskills.io/) - The Skills ecosystem +- [skills.sh - Agent Skills app store](https://skills.sh/) - 70,000+ skill library diff --git a/docs/en/stage-3/core-skills/basics/index.md b/docs/en/stage-3/core-skills/basics/index.md new file mode 100644 index 0000000..4b8d0b2 --- /dev/null +++ b/docs/en/stage-3/core-skills/basics/index.md @@ -0,0 +1,1836 @@ +# Claude Code Quickstart Core Guide + +Claude Code is Anthropic's official AI-native coding tool. It integrates large-language-model capability directly into the terminal, so you can complete programming tasks by collaborating with AI in natural language. 
Unlike traditional code-completion tools, Claude Code can understand the context of an entire project and execute complex development tasks. From code generation to refactoring, from debugging to documentation writing, it can handle all of them. + +This chapter helps you quickly master the core usage of Claude Code, including installation and setup, basic operations, practical techniques, and commonly used commands. Whether this is your first time using an AI coding tool, or you want to use Claude Code more efficiently, you will find what you need here. + +--- + +## Quick Installation + +Claude Code is built on Node.js, so before installation make sure Node.js 18 or above is installed on your system. The process is very simple and usually takes only a few minutes. + +### Why You Need Claude Code + +In traditional development workflows, developers frequently switch between editor, terminal, browser, and docs. Claude Code unifies these workflows into one interface: in the same terminal window, you can write code, run tests, read docs, and even collaborate with teammates. More importantly, it can understand your project structure and remember your coding habits, becoming a true programming assistant. + +### Method 1: Manual Installation + +Manual installation is suitable for developers who like full control over each step, and it also helps you clearly understand tool components. + +```bash +# Install Claude Code CLI globally +# Use -g to install command globally, so it can be used in any directory +npm install -g @anthropic-ai/claude-code + +# Verify installation +# If version is shown (for example 0.1.25), installation succeeded +claude --version +``` + +During installation, npm automatically downloads dependencies and configures environment variables. If you run into permission problems, try `sudo` (macOS/Linux) or run terminal as administrator (Windows). 
+ +### Method 2: Let an AI Agent Install It for You + +If you are already using other AI coding assistants (such as Cursor, Windsurf, or the AI Agent in this project), you can let them complete installation for you. The benefit is that AI can detect your environment automatically, handle dependency conflicts, and choose the best installation route for your system. + +**You can just say:** + +```text +Help me install Anthropic Claude Code. +``` + +Or more specifically: + +```text +Install Claude Code CLI and check whether my Node.js version is compatible. +``` + +An AI Agent will: +1. Check current Node.js version +2. Prompt you to upgrade if requirements are not met +3. Run installation commands +4. Verify installation result +5. Try automatic fixes if there are issues + +### First Launch and Initialization + +After installation, enter your project directory and start Claude Code: + +```bash +# Enter project directory (Claude Code works in current directory) +cd /path/to/your/project + +# Start Claude Code +claude +``` + +At first launch, Claude Code guides you through several important setup steps: + +1. **Sign in to Anthropic account**: you need an Anthropic account to use Claude Code. If you do not have one, you will be prompted to register. +2. **Choose a plan**: + - **Free plan**: suitable for personal learning and light usage, with call limits + - **Pro plan**: suitable for professional developers, with higher quota and priority response +3. **Accept terms**: read and accept Anthropic terms and privacy policy +4. **Optional: configure API key**: if you have a custom key (for example from a third-party provider), configure it here + +::: info Special Note for Users in Mainland China + +Due to network reasons, users in mainland China may not be able to directly access Anthropic official services. Claude Code supports third-party services compatible with Anthropic API format, and this is technically feasible. + +**You have two options:** + +1. 
**Use API token directly**: buy a token from a provider compatible with Anthropic API and configure it with environment variables +2. **Use a Coding Plan**: some providers offer coding-optimized plans that are usually more cost-effective for coding scenarios + +**Recommended approach**: let an AI Agent help you configure. You only need to provide provider config information (API endpoint, key, etc.), and AI can set environment variables correctly. + +**See detailed setup guide:** [How to install Claude Code and configure environment variables](/en/stage-2/backend/2.6-modern-cli/) + +::: + +--- + +## Quick Start: Run a Few Small Experiments + +After installation, do not rush into formal projects. Run a few small experiments first to understand how Claude Code works. These three experiments are designed from easy to advanced, corresponding to three core abilities: natural-language understanding, content generation, and code execution. + +### Experiment 1: Conversation - Feel AI Understanding + +The purpose is to experience Claude Code's natural-language understanding. Unlike normal search engines, Claude Code can understand context, carry multi-turn conversation, and adjust answers from your feedback. + +**Try these prompts:** + +```text +Hello, who are you? +``` + +Claude introduces itself as Claude Code, an AI coding assistant by Anthropic. + +```text +What is a closure? Give me the too-long-didnt-read version. +``` + +Observe how Claude uses "too-long-didnt-read" as a hint and gives concise but accurate explanation. + +```text +What is the difference between JavaScript and TypeScript? +``` + +This is a technical comparison question. Check whether Claude provides a structured and in-depth answer. + +**Experiment point**: note Claude's response style. It usually gives the core conclusion first, then details. This "inverted pyramid" style is excellent for fast information retrieval. 
+ +### Experiment 2: Generate a Markdown Document - Experience Content Creation + +This experiment demonstrates Claude Code's content-generation capability. For developers, writing docs is often painful. Claude can quickly generate clear and complete docs from requirements. + +**Enter this instruction:** + +```text +Write a Markdown document of commonly used Git commands. +Requirements: include command, explanation, and example. +``` + +**What Claude does:** + +1. Analyze your requirement: common Git commands, Markdown format, and three elements (command/explanation/example) +2. Plan document structure: usually grouped by usage scenario (init, daily dev, branch workflow, remote collaboration, etc.) +3. Generate content: concise explanation and practical examples for each command +4. Format output: use Markdown syntax and proper structure + +**Expected output sample**: + +```markdown +# Common Git Command Cheat Sheet + +## Initialize Repository + +| Command | Explanation | Example | +|------|------|------| +| `git init` | Initialize new repository | `git init my-project` | +| `git clone` | Clone remote repository | `git clone https://github.com/user/repo.git` | + +... +``` + +**Advanced attempts**: you can add extra requirements like "add Chinese comments", "sort by frequency", "include common error handling", etc., and observe how Claude adapts output. + +### Experiment 3: Write and Run a Game - End-to-End Coding Workflow + +This is the most challenging experiment. It demonstrates Claude Code's full workflow: understand requirement, write code, create files, run program, and handle errors. Through it, you can really feel the power of an AI coding assistant. + +**Enter this instruction:** + +```text +Write a Snake game in Python. +Requirements: +1. Use pygame +2. Show score +3. Press ESC to exit + +After writing, help me run it. 
+``` + +**Claude executes these steps:** + +**Step 1: Check environment** +- Check whether Python is installed +- Check whether pygame is available +- Prompt installation if missing + +**Step 2: Write code** +- Create game entry file (for example `snake_game.py`) +- Implement movement, food generation, collision detection +- Add score rendering +- Implement ESC exit + +**Step 3: Run game** +- Execute Python script and launch game +- Game window pops up, use arrow keys to control snake + +**Step 4: Follow-up support** +- If there is a bug, you can directly say "snake can pass through walls, fix it" +- If you want more features, such as "increase difficulty with score", Claude can keep modifying + +**Value of this experiment:** + +1. **Verify setup**: confirm Claude Code can execute code correctly +2. **Experience interaction**: feel collaborative development with AI +3. **Build confidence**: see AI complete an end-to-end runnable program + +**Common questions:** + +- **Q: What if pygame is not installed?** + - A: Claude detects it and suggests `pip install pygame`, or you can ask Claude to install it + +- **Q: Terminal is occupied after game starts, what should I do?** + - A: Press ESC to quit game, or keep using Claude Code in another terminal window + +- **Q: Can I switch language?** + - A: Absolutely. Try "write in JavaScript", "write with HTML5 Canvas", etc. + +--- + +## Core Techniques + +Master these techniques and your efficiency with Claude Code can improve several times over. They come from real development practice and cover high-frequency scenarios. + +### Technique 1: Double-press Esc to Roll Back Conversation - Undo Misoperations + +This is the most common and important shortcut in Claude Code. During collaboration, you may mistype, give wrong instruction, or dislike an answer. Double-pressing Esc gives you quick "time rewind." 
+ +**Shortcut details:** + +```text +Press Esc once -> clear current input (similar to Ctrl+C) +Press Esc twice -> roll back to previous conversation state (undo previous turn) +Press Esc three times -> clear all conversation history (start over) +``` + +**Use cases:** + +- **Case A**: you accidentally sent wrong instruction and Claude started executing. Quickly press Esc twice to return before execution. +- **Case B**: Claude response is not what you wanted, and you want to rephrase. Double Esc to undo and ask again. +- **Case C**: conversation has many rounds and context is messy. Triple Esc to clear and restart. + +**Important note**: double Esc rolls back **conversation state**, not code changes. If Claude already edited files, those edits are not auto-reverted. You must manually restore via Git. + +**Recommendation**: before potentially large code edits, save current state (`git commit` or `git stash`) so recovery is easy. + +### Technique 2: Use @ to Reference Files - Precise Context Control + +Although Claude Code can read project files automatically, explicitly referencing files makes intent clearer and avoids wasting tokens on unrelated files. + +**Basic usage:** + +Instead of vague: + +```text +Explain src/utils.ts +``` + +Use explicit reference: + +```text +@src/utils.ts Explain this file +``` + +**Advanced usage:** + +**Compare multiple files:** +```text +@src/app.tsx @src/components/Header.tsx What is the relationship between these two files? +``` + +**Reference directory:** +```text +@src/components/ Summarize all components under this directory +``` + +**Reference specific lines (with editor):** +```text +@src/utils.ts:45-60 Explain what this code does +``` + +**Usage tips:** + +1. **Tab completion**: type `@` then press Tab, Claude shows file list under current directory and you can choose with arrows +2. **Relative paths**: support references like `@./config.json` or `@../shared/types.ts` +3. **Fuzzy matching**: partial file names are allowed, e.g. 
`@utils` can match `src/utils.ts` or `src/utils/index.ts` + +### Technique 3: Use ! to Execute Commands - Terminal Integration + +Claude Code has built-in command execution. You can run commands without switching to another terminal. + +**Basic usage:** + +```text +!npm test # run tests +!git status # check Git status +!ls -la # list files +``` + +**Practical scenarios:** + +**Scenario: run tests and analyze failure** +```text +!npm test +# after failure +Analyze test failure reasons and fix code +``` + +**Scenario: inspect Git diff** +```text +!git diff +# ask Claude to explain changes +Summarize key changes in this diff +``` + +**Scenario: build project** +```text +!npm run build +# if build fails +Build failed, help me fix it +``` + +**Safety note:** + +For sensitive commands (for example `rm -rf`, `sudo`), Claude Code asks for confirmation. This is a protection mechanism. Confirm carefully. + +### Technique 4: /plan Before Coding - Correct Way to Handle Complex Tasks + +For complex tasks, jumping straight into coding is often inefficient. `/plan` enters planning mode: define the implementation plan first, then execute it step by step. + +**Usage:** + +```text +/plan +I want to add user authentication. Please create an implementation plan. +``` + +**What Claude does:** + +1. **Analyze requirement** +2. **Evaluate current project and stack** +3. **Create step-by-step plan** +4. 
**Discuss and adjust with your feedback** + +**Sample output:** + +```text +📋 User Authentication Implementation Plan + +Phase 1: Database design +- [ ] Create users table (id, email, password_hash, created_at) +- [ ] Create sessions table (id, user_id, expires_at) + +Phase 2: Backend API +- [ ] POST /api/auth/register - register +- [ ] POST /api/auth/login - login +- [ ] POST /api/auth/logout - logout +- [ ] GET /api/auth/me - get current user + +Phase 3: Frontend integration +- [ ] Create login page +- [ ] Create register page +- [ ] Add route guards + +Phase 4: Testing +- [ ] Write unit tests +- [ ] Write integration tests + +Which phase do you want to start with? Or should we adjust the plan? +``` + +**Best practice:** + +- For tasks over 30 minutes, use `/plan` first +- Execute phase by phase and verify each phase +- If requirement changes, rerun `/plan` to adjust + +### Technique 5: /init Auto-generates Configuration - Fast Project Initialization + +`/init` is one of Claude Code's most powerful commands. It scans your project automatically, identifies stack and structure, and generates a complete `CLAUDE.md`. + +**Usage:** + +```text +/init +``` + +**Claude performs:** + +1. **Scan project structure**: identify framework/language/build tools +2. **Analyze config files**: read package.json, tsconfig.json, etc. +3. **Infer style**: naming conventions and file organization +4. 
**Generate CLAUDE.md** + +**Generated CLAUDE.md example:** + +```text +# My Project + +## Tech Stack +- Framework: Next.js 14 (App Router) +- Language: TypeScript +- Styling: Tailwind CSS +- State: Zustand +- Database: Prisma + PostgreSQL + +## Common Commands + +\`\`\`bash +npm run dev # start dev server +npm run build # production build +npm run test # run tests +npx prisma migrate dev # DB migration +\`\`\` + +## Code Conventions +- Use function components + Hooks +- File naming: PascalCase (components), camelCase (utility funcs) +- Commit style: Conventional Commits +``` + +**Why this matters:** + +`CLAUDE.md` is Claude Code's "project memory." On every launch, Claude reads this file and understands project background. That means: + +- you do not need to repeatedly explain framework and stack +- Claude follows your conventions and best practices +- new team members can onboard faster + +**Recommendation**: after project initialization, run `/init` immediately, then refine generated config to match reality. + +### Technique 6: /compact Compresses Context - Save Tokens + +Claude Code context window is limited (often around 200K tokens). Long conversations consume many tokens, increase cost, and may push important early info out of context. + +**Usage:** + +```text +/compact +``` + +**How it works:** + +`/compact` analyzes chat history, extracts key information (decisions made, code generated, confirmed requirements), and creates a concise summary. Later dialogue is based on this summary rather than full history. + +**When to use:** + +- after 5-6 rounds +- when Claude seems to "forget" previous context +- when switching to a new subtask but keeping key background + +**Recommendation:** + +```text +# compress after long conversation +/compact + +# keep working +Now that user module is done, let's build order module. 
+``` + +### Technique 7: Use Claude Code to Assist Git Commits + +In Claude Code, recommended commit workflow is: let Claude inspect diff and draft commit message, then you run standard Git commands. This is clear and gives you one more review checkpoint before commit. + +Official references: + +- [Built-in commands](https://code.claude.com/docs/en/commands) +- [Discover plugins](https://code.claude.com/docs/en/discover-plugins) + +**Recommended workflow:** + +```bash +# 1. Check current changes +/diff +!git status + +# 2. Ask Claude to summarize and generate commit message +Based on current git diff, generate a Conventional Commits message, +and explain in English why this category is appropriate. + +# 3. After you confirm, run standard Git commit +!git add -A +!git commit -m "feat(docs): update Claude Code workflow guidance" +``` + +**Benefits of this approach:** + +1. **Aligned with current official capability**: no dependency on removed built-ins +2. **Transparent**: review diff and commit message before submit +3. **Portable**: same workflow works in other AI IDEs or pure Git + +**If you want "one-command commit" experience:** + +Claude Code now recommends plugin-based extension. For example, `commit-commands` provides commands like `/commit-commands:commit`. + +```bash +# 1. Add plugin marketplace example +/plugin marketplace add anthropics/claude-code + +# 2. Install commit workflow plugin +/plugin install commit-commands@anthropics-claude-code + +# 3. Reload plugins +/reload-plugins + +# 4. 
Use plugin command to commit +/commit-commands:commit +``` + +**Additional notes:** + +- `/commit-commands:commit` is provided by plugin, not current default built-in command +- if you only need to inspect changes before commit, prefer `/diff` or ask Claude to explain `git diff` +- official `/review` has also been marked deprecated; for similar capability, use plugin or natural-language review flow + +### Technique 8: Shift+Tab Auto-Accept - Improve Fluency + +By default, Claude asks confirmation before editing code. This is useful when learning, but may feel slow later. `Shift+Tab` enables auto-accept mode for faster iteration. + +**Usage:** + +- press `Shift+Tab` -> enter auto-accept mode +- press `Shift+Tab` again -> exit auto-accept mode + +**Mode comparison:** + +| Mode | Behavior | Use scenario | +|------|------|----------| +| Default mode | Ask confirmation for every edit | Learning stage, important code | +| Auto-accept | Apply edits directly | After familiarization, rapid iteration | + +**Notes:** + +- In auto-accept mode, Claude edits files directly with no second confirmation +- Recommended to pair with Git so rollback is easy +- For sensitive operations (delete files, modify key configs), Claude still asks + +### Technique 9: Ctrl+C Cancel Operation - Emergency Brake + +When Claude is running a long task, or you realize you gave a wrong instruction, `Ctrl+C` is the emergency brake. 
+ +**Usage:** + +- press `Ctrl+C` once -> cancel currently running operation +- press `Ctrl+C` twice -> fully exit Claude Code + +**Use cases:** + +- long-running command needs interruption +- Claude is generating large irrelevant code +- wrong instruction detected and you want immediate stop + +**Difference from double Esc:** + +- `Ctrl+C`: stop ongoing **operation** (running command / generating code) +- `double Esc`: roll back **conversation state** (undo previous turn) + +### Technique 10: /context Check Context Usage - Optimize Token Cost + +`/context` displays current session context usage, helping you understand token consumption and optimize cost. + +**Usage:** + +```text +/context +``` + +**Sample output:** + +```text +📊 Context Usage + +Token usage: 45,230 / 200,000 (22.6%) +File references: 12 files +Conversation rounds: 8 + +Top token-consuming files: +1. src/api/users.ts (3,420 tokens) +2. node_modules/@types/react/index.d.ts (2,890 tokens) +3. src/components/Dashboard.tsx (1,560 tokens) + +Suggestions: +- Current usage is healthy, no compression needed +- To reduce usage, add node_modules into .claudeignore +``` + +**How to use this information:** + +1. **Identify large files**: if one file consumes a lot of tokens, check if it is really needed +2. **Optimize .claudeignore**: ignore unrelated files (node_modules, build output, etc.) +3. **Decide when to compact**: when usage exceeds 70%, consider `/compact` + +### Technique 11: /resume Restore Session - Switch Multi-task Conversations + +When handling multiple tasks, you may run multiple conversation threads. `/resume` lets you switch back to previous session context in the current chat, without restarting. + +**Usage:** + +```text +/resume +``` + +**How it works:** + +Claude Code records previous sessions automatically. When you run `/resume`, it switches to previous session context and keeps all prior discussion content and state. 
+ +**Use cases:** + +**Case A: parallel multi-tasking** +```text +# Task 1: fix bug +claude> Fix login-page validation issue +# ... one conversation ... + +# Task 2: add feature (new thread) +claude> Add user registration feature +# ... another conversation ... + +# Switch back to task 1 +claude> /resume +# Continue previous bug-fix work +``` + +**Case B: temporary lookup then return** +```text +claude> Explain this algorithm +# ... discuss algorithm ... + +claude> /resume +# Return to previous coding work +``` + +**Case C: resume after interruption** +```text +claude> Continue previous work +# If you interrupted before, /resume brings you back +``` + +**Comparison with related commands:** + +| Command | Function | Scenario | +|------|------|----------| +| `/resume` | Switch back to previous session in current chat | Multi-task switching | +| `claude -c` | Continue most recent session | Reconnect after exit | +| `claude -r` | Restore previous session | Recover prior state after exit | +| `double Esc` | Roll back one turn | Undo most recent conversation turn | + +**Suggestions:** + +1. **Multi-task management**: `/resume` is more efficient than re-explaining context +2. **Session memory**: each session has independent context; `/resume` preserves it +3. **Use with /compact**: in long sessions, compact first, then resume switch to keep context clean + +--- + +## Core Configuration + +Reasonable configuration helps Claude Code better fit your project and team. This section explains configuration role, priority, and optimization for different usage scenarios. + +### Configuration File Locations and Priority + +Claude Code uses layered configuration strategy. Different levels have different scope and priority. Understanding this lets you manage settings flexibly. 
+ +**Configuration priority (high to low):** + +| Location | Scope | Purpose | Commit to Git | +|------|--------|------|--------------| +| `.claude/settings.local.json` | local project | personal preferences | ❌ no | +| `.claude/settings.json` | project shared | team-wide configuration | ✅ yes | +| `~/.claude/settings.json` | global | personal defaults | ❌ no | + +**Merge rules:** + +- Higher-priority config overrides same key in lower priority +- Non-conflicting keys are merged +- Project config overrides global config +- Local personal config overrides shared project config + +**Practical scenarios:** + +**Scenario 1: team project** +```text +~/.claude/settings.json # your personal default editor settings +.claude/settings.json # team coding standards and permission config +.claude/settings.local.json # your debug preferences and theme settings +``` + +**Scenario 2: personal project** +```text +~/.claude/settings.json # global default config +.claude/settings.json # project-specific config (e.g. special permission rules) +``` + +### CLAUDE.md - Project Memory + +`CLAUDE.md` is the most important file for Claude Code configuration. It acts like a project "manual." Every time Claude Code starts, it reads `CLAUDE.md` under current directory, understanding background, stack, and conventions. + +**Why CLAUDE.md is so important:** + +Imagine joining a new project: you need to learn stack, coding conventions, and common commands. Normally this takes hours of docs/code review and teammate questions. With `CLAUDE.md`, Claude knows this at startup and you can immediately collaborate effectively. 
+ +**Minimum viable template:** + +```text +# [Project Name] + +## Tech Stack +- Framework: React 18 + TypeScript +- State: Zustand +- Styling: Tailwind CSS +- Build tool: Vite + +## Common Commands + +\`\`\`bash +npm run dev # start development server (port 5173) +npm run test # run unit tests +npm run build # production build +npm run lint # lint checks +\`\`\` + +## Code Conventions +- Components use function components + Hooks +- Naming: PascalCase (components), camelCase (utility funcs) +- Git commits use Conventional Commits +- All API calls must go through unified request wrapper +``` + +**Full template (recommended):** + +```text +# [Project Name] + +## Project Overview +One-sentence description of main functionality and target users. + +## Tech Stack +### Frontend +- Framework: React 18 + TypeScript +- Router: React Router v6 +- State: Zustand + React Query +- Styling: Tailwind CSS + Headless UI +- Build: Vite + +### Backend (if applicable) +- Runtime: Node.js + Express +- Database: PostgreSQL + Prisma +- Auth: JWT + bcrypt + +## Project Structure + +\`\`\` +src/ +├── components/ # reusable components +├── pages/ # page components +├── hooks/ # custom Hooks +├── lib/ # utility functions +├── types/ # TypeScript types +└── api/ # API calls +\`\`\` + +## Common Commands + +\`\`\`bash +# development +npm run dev # start dev server +npm run dev:mock # use mock data in development + +# testing +npm run test # run all tests +npm run test:watch # watch mode +npm run test:coverage # generate coverage report + +# code quality +npm run lint # ESLint check +npm run lint:fix # auto-fix ESLint issues +npm run format # Prettier format +npm run typecheck # TypeScript type check + +# build +npm run build # production build +npm run preview # preview production build +\`\`\` + +## Development Rules +### Code style +- Use function components, avoid class components +- Prefer custom Hooks for logic abstraction +- Component props must define TypeScript interfaces + +### Git 
workflow +- Branch prefix: `feature/`, `fix/`, `refactor/` +- Commit messages follow Conventional Commits +- PR must pass CI and code review + +### Performance requirements +- Component lazy loading to reduce first-screen load time +- Use WebP images and enable lazy loading +- Keep API response time under 200ms + +## Environment Variables + +\`\`\`bash +# .env.local +VITE_API_BASE_URL=http://localhost:3000 +VITE_APP_NAME=MyApp +\`\`\` + +## Common Issues + +### Dev server failed to start? + +Check whether port 5173 is occupied, or try `npm run dev -- --port 3000` + +### Type errors? + +Run `npm run typecheck` to see detailed errors +``` + +**Fast generation of CLAUDE.md:** + +If your project exists but has no `CLAUDE.md`, run `/init`: + +```bash +claude +# inside Claude Code +/init +``` + +Claude analyzes project structure, package.json, and current code, then generates a practical `CLAUDE.md`. After generation, manually review and adjust. + +### .claudeignore - Save Tokens + +`.claudeignore` tells Claude Code which files should not be read into context. Correct configuration can significantly reduce token usage (often 40-60%) and improve response speed. + +**Why .claudeignore is needed:** + +When Claude Code tries to understand project, it reads related files. 
Some files do not help understanding and can: +- consume many tokens (for example type definition files in node_modules) +- introduce noise (logs, build outputs) +- include sensitive info (.env files) + +**Recommended config:** + +```text +# ===== dependencies ===== +# huge third-party code, usually unnecessary for Claude context +node_modules/ +.pnp/ +.pnp.js + +# ===== build outputs ===== +# generated artifacts, not source logic +dist/ +build/ +.next/ +out/ +*.tsbuildinfo + +# ===== logs ===== +# runtime logs, no value for understanding architecture +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +# ===== testing outputs ===== +coverage/ +.nyc_output/ + +# ===== editor / IDE ===== +.vscode/* +!.vscode/extensions.json +.idea/ +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? + +# ===== system files ===== +.DS_Store +Thumbs.db + +# ===== env files ===== +.env +.env.local +.env.*.local + +# ===== large binary assets ===== +*.png +*.jpg +*.jpeg +*.gif +*.svg +*.ico +*.mp4 +*.webm + +# ===== lock files (optional) ===== +# If you do not need Claude to analyze dependency versions, ignore these +# package-lock.json +# yarn.lock +# pnpm-lock.yaml +``` + +**Config tips:** + +1. **Start minimal**: ignore node_modules and build outputs first, then observe token usage +2. **Tune per project**: image-heavy project -> ignore image formats; docs project -> keep Markdown +3. **Optimize regularly**: use `/context` to see top token-consuming files and decide whether to ignore + +### Permission Configuration + +By default, Claude Code asks confirmation before sensitive operations. Through `permissions` in `settings.json`, you can control which actions are auto-allowed, require confirmation, or fully denied. 
+ +**Permission config structure:** + +```json +{ + "permissions": { + "allow": [ + // auto-allow without asking + ], + "ask": [ + // ask before execution + ], + "deny": [ + // fully deny + ] + } +} +``` + +**Rule syntax:** + +Permission rules use `ActionType(pattern)` format: + +| Action type | Description | Example | +|----------|------|------| +| `Bash` | run terminal command | `Bash(git status)` | +| `Edit` | edit file | `Edit(src/**/*.ts)` | +| `Read` | read file | `Read(README.md)` | +| `Write` | create file | `Write(src/components/*.tsx)` | + +**Wildcard support:** + +- `*` matches arbitrary characters (excluding `/`) +- `**` matches arbitrary paths +- `?` matches one character + +**Real config example:** + +```json +{ + "permissions": { + "allow": [ + "Bash(git status)", + "Bash(git log:*)", + "Bash(git diff:*)", + "Bash(npm test:*)", + "Bash(npm run lint:*)", + "Edit(src/**/*.{ts,tsx})", + "Edit(tests/**/*.test.ts)", + "Read(src/**/*.ts)", + "Write(src/components/*.tsx)" + ], + "ask": [ + "Bash(git commit:*)", + "Bash(git push:*)", + "Bash(git pull:*)", + "Bash(npm install:*)", + "Bash(npm run build)", + "Edit(package.json)", + "Edit(tsconfig.json)", + "Read(.env)", + "Read(config/secrets.*)" + ], + "deny": [ + "Bash(rm -rf:*)", + "Bash(sudo:*)", + "Bash(curl * | sh)", + "Bash(wget * | sh)", + "Edit(.git/*)", + "Write(/etc/*)", + "Read(/etc/passwd)" + ] + } +} +``` + +**Configuration suggestions:** + +1. **Development stage**: relatively relaxed permissions for faster iteration +2. **Production stage**: stricter permissions, especially deployment and sensitive data operations +3. **Team collaboration**: place baseline rules in shared `settings.json`, personal tweaks in `settings.local.json` + +### Rules Directory + +For large projects, a single `CLAUDE.md` can become bloated and hard to maintain. Claude Code supports modular management through **Rules directory**, splitting conventions by topic into separate files. 
+ +**Directory structure:** + +```text +.claude/ +├── settings.json # main config file +├── CLAUDE.md # project overview (still needed) +└── rules/ # rules directory + ├── 00-security.md # security rules (global) + ├── 01-coding-style.md # coding style rules (global) + ├── 10-api.md # API dev rules + ├── 11-frontend.md # frontend dev rules + ├── 12-backend.md # backend dev rules + └── 20-testing.md # testing rules +``` + +**Filename suggestion:** + +Use numeric prefixes (`00-`, `01-`) to control load order: base rules first, specific rules later. + +**Rule file format:** + +Rule files support YAML frontmatter to define applicability: + +```markdown +--- +# Optional: paths where this rule applies +globs: + - "src/api/**/*.ts" + - "src/services/**/*.ts" + +# Optional: commands where this rule applies +commands: + - "generate api" + - "create endpoint" + +# Optional: rule priority (smaller number = higher priority) +priority: 10 +--- + +# API Development Rules + +## Route design +- RESTful style, use plural nouns +- Versioning: /api/v1/users +- Nested resources: /api/v1/users/123/orders + +## Request/response format +- Use JSON consistently +- Error response must include code and message +- Pagination response uses { data, pagination } structure + +## Security requirements +- All endpoints must verify authentication (except public endpoints) +- Sensitive operations require secondary confirmation +- Implement rate limiting to prevent abuse +``` + +**Rule inheritance and override:** + +- Global rules (no frontmatter or `globs: *`) apply to all files +- Path-specific rules apply only to matched files +- If rules conflict, higher-priority rule wins +- Specific rules can override global rules + +**Usage scenario examples:** + +**Scenario 1: frontend-backend separated project** +```text +.claude/rules/ +├── 00-general.md # general standards (commit message, naming) +├── 10-backend.md # backend standards (NestJS-specific) +├── 11-frontend.md # frontend standards 
(React-specific) +└── 20-database.md # database standards (Prisma-specific) +``` + +**Scenario 2: microservice architecture** +```text +.claude/rules/ +├── 00-global/ # global rules +│ ├── security.md +│ └── logging.md +├── 10-services/ # service-specific rules +│ ├── user-service.md +│ ├── order-service.md +│ └── payment-service.md +└── 20-shared/ # shared component rules + ├── shared-lib.md + └── common-utils.md +``` + +**Migration recommendation:** + +If you already have a very large `CLAUDE.md`, migrate to Rules directory like this: + +1. Create `.claude/rules/` +2. Split `CLAUDE.md` by topic +3. Add suitable frontmatter per rule file +4. Keep `CLAUDE.md` as project overview and move detailed standards out +5. Test and ensure rule loading works correctly + +--- + +## Core Operation Commands + +Claude Code provides a rich set of operational commands for efficient AI collaboration. These commands fall into categories: Slash commands (built-in features), symbol system (short operations), and natural-language instructions (daily development). + +### Slash Command Quick Reference + +Slash commands are built-in operations that start with `/`. They provide standardized actions such as project initialization, config management, and status checks. 
+ +| Command | Function | Use scenario | +|------|------|----------| +| `/help` | Show all commands | quick lookup when you forget commands | +| `/init` | Initialize project and generate CLAUDE.md | new project or adding config | +| `/plan` | Enter planning mode | create plan before complex tasks | +| `/clear` | Clear conversation history | restart when context is messy | +| `/compact` | Compress context | save tokens after long chat | +| `/diff` | Open interactive diff view | inspect current uncommitted changes | +| `/plugin` | Manage plugins | install commit/review extensions | +| `/context` | Show context usage | optimize token cost | +| `/cost` | Show session cost | monitor usage cost | +| `/config` | Open config panel | update settings | +| `/permissions` | Permission management | adjust operation permissions | +| `/model` | Switch model | choose different models | + +**Command-combination example:** + +```bash +# complete development workflow +/plan # 1. create plan +# ... execute development ... +/diff # 2. inspect changes +Generate a commit message from current diff +!git add -A # 3. stage changes +!git commit -m "..." # 4. commit +/cost # 5. check cost +``` + +### Symbol System + +Symbol system is Claude Code's shorthand operation mechanism. Special symbols quickly trigger specific capabilities. 
+ +| Symbol | Name | Purpose | Example | +|------|------|------|------| +| `/` | Slash command | execute built-in operation | `/help`, `/plan` | +| `@` | At reference | reference file/directory | `@src/app.tsx` | +| `!` | Bang mode | run terminal command | `!npm test` | +| `&` | Background run | run task in background | `&npm run dev` | + +**Symbol combination tips:** + +```bash +# combine symbols +@src/utils.ts !npm test +# meaning: read utils.ts, then run tests + +@src/components/ @src/pages/ compare structures of these two directories +# meaning: reference two directories simultaneously for comparison + +!git diff @src/app.tsx explain these changes +# meaning: inspect Git diff and ask Claude to explain specific file changes +``` + +### File Operations + +File operations are the most common daily actions: read, edit, create, and delete files. + +**Read files:** + +```bash +# basic read +@src/app.tsx explain this file + +# read + analyze +@src/utils/helpers.ts find potential performance issues + +# compare read +@src/components/OldButton.tsx @src/components/NewButton.tsx compare differences +``` + +**Edit files:** + +```bash +# simple edit +Modify formatDate in src/utils/date.ts to support Chinese locale format + +# complex edit +@src/api/users.ts Refactor this file: +1. Extract duplicated error handling into shared handleError +2. Replace Promise chains with async/await +3. Add JSDoc comments + +# batch edit +Convert all class components under src/components/ into function components +``` + +**Create files:** + +```bash +# create one file +Create src/components/UserCard.tsx, a card component to display user info + +# create related files +Create user module: +1. src/types/user.ts - define User interface +2. src/api/users.ts - user API calls +3. src/components/UserCard.tsx - user card component +4. 
src/hooks/useUser.ts - hook to fetch user data +``` + +**Delete files:** + +```bash +# delete with confirmation +Delete src/old-component.tsx (this component is no longer used) + +# Claude asks for confirmation and may suggest checking references first +``` + +### Git Operations + +Claude Code deeply integrates with Git so you can complete full version-control workflow without leaving terminal. + +**Check status:** + +```bash +# show Git status +Show git status and uncommitted changes + +# detailed diff +!git diff +Explain changes in src/api/users.ts +``` + +**Create commits:** + +```bash +# inspect changes +/diff + +# generate commit message +Generate a Conventional Commit message from current git diff + +# commit manually +!git add -A +!git commit -m "..." +``` + +**Branch operations:** + +```bash +# create feature branch +!git checkout -b feature/user-authentication + +# after implementation +Generate commit message based on current changes +!git add -A +!git commit -m "..." +!git push -u origin feature/user-authentication +``` + +**Complete Git workflow example:** + +```bash +# 1. start new feature +!git checkout -b feature/payment-integration + +# 2. develop feature (with Claude assistance) +Create payment module with Alipay and WeChat Pay + +# 3. run tests +!npm test + +# 4. inspect changes +/diff + +# 5. generate and confirm commit message +Generate a Conventional Commit message from current git diff +!git add -A +!git commit -m "..." + +# 6. push remote +!git push -u origin feature/payment-integration + +# 7. create PR (optional, with GitHub CLI) +!gh pr create --title "feat: add payment integration" --body "Support Alipay and WeChat Pay" +``` + +### Code Operations + +Code operations are Claude Code's core strengths: generation, explanation, refactoring, and optimization. 
+ +**Generate code:** + +```bash +# generate component +Create a React Hook to manage auth state, including login/logout/permission checks + +# generate utility function +Create a date-formatting utility that supports relative time (e.g. "2 hours ago") + +# generate complete module +Create order module with: +- order list page +- order detail page +- create-order API +- order status management +``` + +**Explain code:** + +```bash +# line-by-line explanation +Explain src/algorithms/quicksort.ts line by line + +# high-level explanation +@src/services/payment.ts explain architecture design of this module + +# explain complex logic +Explain what reduce in src/utils/dataTransformer.ts is doing +``` + +**Refactor code:** + +```bash +# architecture refactor +Convert class components in src/components/ to function components + +# performance refactor +Optimize rendering performance in src/App.tsx, reduce unnecessary re-renders + +# cleanup refactor +@src/utils/helpers.ts Refactor this file: +1. Delete unused functions +2. Extract repeated logic into shared utilities +3. Add type definitions +4. Improve function naming +``` + +**Debug code:** + +```bash +# error analysis +npm test failed, analyze root cause and fix it + +# performance analysis +@src/components/DataTable.tsx This component renders slowly, find bottlenecks + +# log analysis +!cat logs/error.log +Analyze these error logs and identify root cause +``` + +### Test Operations + +Testing is essential for quality assurance. Claude Code can help generate tests, run tests, and analyze results. 
+ +**Generate tests:** + +```bash +# unit tests +Generate unit tests for src/utils/math.ts, including boundary cases + +# component tests +Generate React Testing Library tests for src/components/UserForm.tsx + +# integration tests +Create integration test for user registration flow from form submission to DB write +``` + +**Run and debug tests:** + +```bash +# run tests +!npm test + +# debug failed tests +Analyze failure reasons and fix +@tests/auth.test.ts + +# coverage check +!npm run test:coverage +Which code paths are not covered? +``` + +**Testing strategy suggestion:** + +```bash +I added user authentication. Please: +1. Generate unit tests for auth.service.ts +2. Generate component tests for LoginForm +3. Run all tests and ensure pass +``` + +### Command Chaining and Workflow Composition + +The most efficient way to use Claude Code is chaining commands into complete workflows. + +**Scenario 1: bug-fix workflow** + +```bash +# 1. inspect issue +!npm test +Tests failed, analyze why + +# 2. locate issue +@src/utils/validation.ts Is the issue in this file? + +# 3. fix issue +Fix isEmail in validation.ts to correctly handle addresses containing + + +# 4. verify fix +!npm test + +# 5. commit fix +Generate a fix-type commit message from current diff +!git add -A +!git commit -m "fix: ..." +``` + +**Scenario 2: code review workflow** + +```bash +# 1. inspect changes +!git diff --stat +Which files changed? + +# 2. detailed review +@src/components/ Review these component changes + +# 3. suggest improvements +What improvements should be made based on this review? + +# 4. implement improvements +Optimize performance of UserList component + +# 5. final review +/diff +Review current changes and point out potential risks and improvements +``` + +**Scenario 3: new feature workflow** + +```bash +# 1. plan first +/plan +I want to add shopping cart feature + +# 2. create branch +!git checkout -b feature/shopping-cart + +# 3. 
implement feature +Implement step by step according to plan + +# 4. add tests +Generate tests for shopping cart module + +# 5. run tests +!npm test + +# 6. code review +/diff +Please do a code review on current diff + +# 7. commit +Generate commit message for this feature development +!git add -A +!git commit -m "feat: ..." +!git push +``` + +--- + +## Frequently Asked Questions + +While using Claude Code, you may encounter various issues. This section summarizes common problems and solutions. + +### Token Usage Is Too Fast? + +Fast token consumption is one of the most common issues. Below is a complete optimization strategy. + +**Diagnosis:** + +First run `/context` to inspect current token usage: + +```text +/context +``` + +Focus on: +- **Token usage rate**: if over 70%, consider context compression +- **Number of referenced files**: more files means higher token consumption +- **Large files**: check which files consume most tokens + +**Optimization strategy:** + +**1. Improve .claudeignore** + +Make sure `.claudeignore` includes unnecessary files: + +```text +# must ignore +node_modules/ +dist/ +build/ +*.log +.env + +# project-specific +# React +.next/ +out/ + +# Vue +.nuxt/ +.output/ + +# generic +.vscode/ +.idea/ +coverage/ +*.min.js +*.bundle.js +``` + +**2. Compress context regularly** + +Long conversations accumulate many tokens. It is recommended to run `/compact` every 5-6 rounds: + +```text +# after long conversation +/compact + +# continue +Now let's implement order module... +``` + +**3. Reference files precisely** + +Avoid referencing entire directory if not needed: + +```bash +# not recommended +@src/ Explain this code + +# recommended +@src/utils/auth.ts @src/components/Login.tsx Explain login flow +``` + +**4. Avoid reading huge files** + +If `/context` shows one file consuming many tokens, consider: +- do you really need it? +- can you reference only a section? +- can this file be split into smaller modules? 
+ +### Claude Does Not Understand the Project? + +If Claude answers inaccurately or repeatedly asks basic project info, it lacks project context. + +**Solutions:** + +**1. Generate CLAUDE.md** + +Run `/init` to generate project config: + +```bash +/init +``` + +After generation, validate: +- is project summary accurate? +- is stack complete? +- are common commands correct? +- are coding conventions clear? + +**2. Manually edit CLAUDE.md** + +If auto-generated config is not detailed enough, add: + +```markdown +## Project-Specific Information + +### Architecture Decisions +- Why choose X over Y? +- What are core design patterns? + +### Common Pitfalls +- When using useEffect, watch out for... +- DB queries must... + +### Third-Party Integrations +- Payments via Stripe +- Email via SendGrid +- File storage via AWS S3 +``` + +**3. Use Rules directory** + +For large projects, organize conventions in Rules: + +```text +.claude/rules/ +├── 00-architecture.md # architecture overview +├── 01-coding-style.md # coding style +├── 10-frontend.md # frontend rules +├── 11-backend.md # backend rules +└── 20-testing.md # testing rules +``` + +**4. Add context in prompt when needed** + +For specific tasks, append relevant background: + +```text +We use a custom useAuth Hook for authentication. +It returns { user, login, logout, isLoading }. +Please build a user-menu component based on this Hook. +``` + +### How to Roll Back Operations? + +Claude Code provides multiple rollback mechanisms for different scenarios. + +**Scenario 1: rollback conversation state** + +If you only mistyped or dislike response: + +```text +Double Esc -> rollback previous turn +Triple Esc -> clear all conversation history +``` + +**Note**: this only rolls back conversation state, not file edits. 
+ +**Scenario 2: undo file edits** + +If Claude already modified files, undo manually: + +```bash +# check changes +!git status +!git diff + +# revert one file +git checkout -- src/utils/helpers.ts + +# revert all working tree changes +git checkout -- . + +# if already committed +# soft rollback (keep changes) +git reset --soft HEAD~1 + +# hard rollback (discard changes) +git reset --hard HEAD~1 +``` + +**Scenario 3: preventively use Git workflow** + +Best practice: save current work before Claude session: + +```bash +# save current state before starting +git add . +git commit -m "WIP: before Claude Code session" +# or use stash +git stash push -m "before claude" + +# develop with Claude Code... + +# if result is unsatisfactory, full rollback +git reset --hard HEAD~1 +# or +git stash pop +``` + +### Too Many Permission Prompts? + +Frequent permission confirmations hurt efficiency. Proper permission config can make workflow smoother. + +**Permission model:** + +Claude Code permissions are three levels: +- **allow**: auto-allow +- **ask**: ask before execution +- **deny**: fully deny + +**Optimization config:** + +Edit `.claude/settings.json`: + +```json +{ + "permissions": { + "allow": [ + // Git read operations + "Bash(git status)", + "Bash(git log:*)", + "Bash(git diff:*)", + "Bash(git branch)", + + // test and checks + "Bash(npm test:*)", + "Bash(npm run lint:*)", + "Bash(npm run typecheck)", + + // dev server + "Bash(npm run dev:*)", + + // source edits + "Edit(src/**/*.{ts,tsx})", + "Edit(tests/**/*.test.ts)", + "Write(src/**/*.ts)" + ], + "ask": [ + // Git write operations + "Bash(git commit:*)", + "Bash(git push:*)", + "Bash(git pull:*)", + + // package management + "Bash(npm install:*)", + "Bash(npm uninstall:*)", + + // build and deployment + "Bash(npm run build)", + "Bash(npm run deploy:*)", + + // config file edits + "Edit(package.json)", + "Edit(tsconfig.json)", + + // sensitive file reads + "Read(.env)", + "Read(config/secrets.*)" + ], + "deny": [ + // 
dangerous commands + "Bash(rm -rf:*)", + "Bash(sudo:*)", + "Bash(curl * | sh)", + "Bash(wget * | sh)", + + // system files + "Edit(/etc/*)", + "Write(/usr/*)", + + // Git internals + "Edit(.git/*)" + ] + } +} +``` + +**Progressive permission strategy:** + +- **Learning phase**: keep defaults and understand what Claude tries to execute +- **Familiar phase**: add common safe operations (like git status, npm test) into allow +- **High-efficiency phase**: create fine-grained rules based on project characteristics + +### How to Use in Mainland China? + +Due to network constraints, users in China may not directly access Anthropic official services. Here are several options. + +**Option 1: use API proxy service** + +Many cloud providers offer Anthropic-compatible API proxy service: + +```bash +# set env vars +export ANTHROPIC_BASE_URL="https://your-api-proxy.com/v1" +export ANTHROPIC_API_KEY="your-api-key" + +# start Claude Code +claude +``` + +**Option 2: use third-party Claude Code compatible tools** + +Some domestic providers offer compatible tooling: + +```bash +# install compatible version +npm install -g @some-provider/claude-code + +# configure API key +claude config set api.key your-api-key +claude config set api.baseUrl https://api.some-provider.com +``` + +**Option 3: use other AI coding tools** + +If Claude Code is unavailable, consider alternatives: + +| Tool | Features | Use scenario | +|------|------|----------| +| Cursor | VS Code-based, full-featured | full IDE experience | +| GitHub Copilot | strong autocomplete | primarily code completion | +| Tongyi Lingma | domestic product, stable in China | domestic development environment | +| Codeium | generous free quota | budget-limited | + +**Option 4: let AI Agent help configure** + +If you are unsure how to configure, ask AI Agent: + +```text +I want to use Claude Code, but I cannot directly access it in mainland China. +I bought an API from provider XXX. +API endpoint is https://api.xxx.com, +key is sk-xxx. 
+ +Please configure environment variables so Claude Code can work correctly. +``` + +**Common questions:** + +- **Q: still cannot connect after configuration?** + - A: check API endpoint correctness, including `/v1` path + - A: check API key validity and balance + - A: check whether local network needs proxy + +- **Q: response is slow?** + - A: choose provider with closer geographic region + - A: use coding-optimized plan instead of generic API plan + - A: use `/compact` to reduce token usage + +- **Q: some features are unavailable?** + - A: some third-party providers may not fully support all Claude Code features + - A: check provider docs for supported feature scope + +--- + +## Reference Resources + +- [Claude Code Official Docs](https://code.claude.com/docs) +- [Claude Code GitHub](https://github.com/anthropics/claude-code) +- [Everything Claude Code](https://github.com/affaan-m/everything-claude-code) diff --git a/docs/en/stage-3/core-skills/claude-agent-sdk/index.md b/docs/en/stage-3/core-skills/claude-agent-sdk/index.md new file mode 100644 index 0000000..768afd0 --- /dev/null +++ b/docs/en/stage-3/core-skills/claude-agent-sdk/index.md @@ -0,0 +1,706 @@ +# Claude Agent SDK Complete Guide + +## Introduction + +You may already have used Claude's basic API: send one message, get one reply, just like chatting. But if you want Claude to help you read files, run commands, search code, fix bugs, verify the result itself, and continue iterating, this kind of "autonomous work" is not something the basic API can do. + +Claude Agent SDK is built exactly for this scenario. It packages all of Claude Code's capabilities - reading and writing files, executing commands, searching code, editing files, browsing the web - into a programmable library. You do not need to write the tool-calling loop yourself. Claude can execute tools autonomously and iterate autonomously until the task is truly completed. 
+ +One-sentence summary: the basic SDK is "you ask, it answers"; the Agent SDK is "you assign, it works." + +--- + +## What Is the Difference from the Basic SDK? + +Look at the code first, and the difference is obvious: + +```python +# Basic anthropic SDK: you must write your own loop to handle tool calls +import anthropic + +client = anthropic.Anthropic() +response = client.messages.create( + model="claude-sonnet-4-6", + max_tokens=1024, + messages=[{"role": "user", "content": "Fix the bug in auth.py"}], + tools=[...] # You must define tools yourself +) +# Claude asks to call some tool +while response.stop_reason == "tool_use": + result = your_tool_executor(response.tool_use) # You must execute it yourself + response = client.messages.create(tool_result=result, **params) # You must feed it back yourself +``` + +```python +# Agent SDK: one block and done, Claude reads files, finds bugs, and edits code by itself +from claude_agent_sdk import query, ClaudeAgentOptions + +async for message in query( + prompt="Fix the bug in auth.py", + options=ClaudeAgentOptions(allowed_tools=["Read", "Edit", "Bash"]), +): + print(message) # Claude reads files, locates issues, and edits code by itself +``` + +The difference is clear: + +| Comparison Item | Basic anthropic SDK | Claude Agent SDK | +|--------|-------------------|-----------------| +| Tool execution | You implement it | Claude handles it | +| Tool loop | You implement it | Built-in agent loop | +| Built-in tools | None, all self-defined | Read/write files, Bash, search, and more out of the box | +| Context management | You maintain it | Auto compression and auto management | +| Best for | Chat, generation, simple tool use | Autonomously completing complex tasks | + +--- + +## How Is It Different from Other Agent Frameworks? + +There are many Agent frameworks on the market - LangChain, LlamaIndex, CrewAI, AutoGPT, and more. What is unique about Claude Agent SDK compared with them? 
+ +> 📚 **For a detailed comparison, see the appendix**: [Mainstream Agent Framework Comparison](/en/appendix/8-artificial-intelligence/ai-agents.html) + +In short: + +| Framework | Best-Fit Scenario | +|------|-------------| +| **Claude Agent SDK** | Let Claude autonomously complete coding, file operations, and command execution | +| **LangChain** | Build complex general AI apps with highly customized flows | +| **CrewAI** | Simulate multi-role collaboration scenarios (virtual teams, role-playing) | +| **LlamaIndex** | Build knowledge-base QA systems that connect enterprise data with LLMs | + +--- + +## Installation and Configuration + +### Installation + +Python needs 3.10+, and TypeScript needs Node.js 18+: + +```bash +# Python +pip install claude-agent-sdk + +# TypeScript +npm install @anthropic-ai/claude-agent-sdk +``` + +### Authentication + +Just set the API key environment variable: + +```bash +export ANTHROPIC_API_KEY=your-api-key +``` + +Cloud-platform authentication is also supported: +- AWS Bedrock: set `CLAUDE_CODE_USE_BEDROCK=1` + AWS credentials +- Google Vertex AI: set `CLAUDE_CODE_USE_VERTEX=1` + GCP credentials +- Microsoft Azure: set `CLAUDE_CODE_USE_FOUNDRY=1` + Azure credentials + +### Custom API Endpoint + +If you use a proxy, gateway, or self-hosted API endpoint, you can change the default API URL through the `env` parameter: + +```python +from claude_agent_sdk import query, ClaudeAgentOptions + +async for message in query( + prompt="Hello", + options=ClaudeAgentOptions( + env={ + "ANTHROPIC_BASE_URL": "https://your-proxy.example.com", + "ANTHROPIC_API_KEY": "your-api-key", + } + ), +): + print(message) +``` + +`ClaudeAgentOptions` does not have a direct `base_url` parameter, but the `env` field can pass arbitrary environment variables into the underlying Claude Code CLI. 
Common environment variables: + +| Environment Variable | Purpose | +|---------|------| +| `ANTHROPIC_BASE_URL` | Custom API endpoint (proxy, gateway) | +| `ANTHROPIC_API_KEY` | API key | +| `ANTHROPIC_AUTH_TOKEN` | Alternative auth token | +| `ANTHROPIC_CUSTOM_HEADERS` | Custom request headers | + +--- + +## Core Concepts + +The Agent SDK runtime principle can be summarized in one sentence: **collect context -> execute actions -> verify results -> repeat**. + +This is exactly how human developers work: read code first, then modify code, then run tests and check results. If it is wrong, keep iterating. Agent SDK automates this loop. + +### Two Usage Modes + +**Mode 1: `query()` function - stateless, suitable for one-off tasks** + +```python +import asyncio +from claude_agent_sdk import query, ClaudeAgentOptions + +async def main(): + async for message in query( + prompt="What files are in this directory?", + options=ClaudeAgentOptions(allowed_tools=["Bash", "Glob"]), + ): + if hasattr(message, "result"): + print(message.result) + +asyncio.run(main()) +``` + +**Mode 2: `ClaudeSDKClient` - stateful, suitable for multi-turn conversation** + +Use this when you need to preserve context and interact across multiple turns. For example, first ask Claude to read one module, then ask it to find all call sites of that module - in the second turn it still remembers what it read in the first turn. Note that the example below demonstrates the same effect with `query()`, by capturing the `session_id` from the first call and passing it to `resume` in the second, rather than using `ClaudeSDKClient` directly. 
+ +```python +import asyncio +from claude_agent_sdk import query, ClaudeAgentOptions + +async def main(): + session_id = None + + # Turn 1: read the auth module + async for message in query( + prompt="Read the authentication module code", + options=ClaudeAgentOptions(allowed_tools=["Read", "Glob"]), + ): + if hasattr(message, "subtype") and message.subtype == "init": + session_id = message.session_id + + # Turn 2: continue based on previous context + async for message in query( + prompt="Find all places that call it", + options=ClaudeAgentOptions(resume=session_id), + ): + if hasattr(message, "result"): + print(message.result) + +asyncio.run(main()) +``` + +--- + +## Built-in Tools: Ready to Use + +This is one of the best parts of Agent SDK - you do not need to implement any tools yourself, Claude can use them directly: + +| Tool | Capability | Typical Use | +|------|------|---------| +| Read | Read files | View code, read configs | +| Write | Create files | Generate new files | +| Edit | Precise file edits | Bug fixes, refactoring | +| Bash | Run terminal commands | Run tests, install dependencies, git operations | +| Glob | Pattern-based file search | `**/*.py`, `src/**/*.ts` | +| Grep | Regex content search | Find function definitions, TODOs | +| WebSearch | Search web pages | Look up docs, find approaches | +| WebFetch | Fetch web content | Read online docs | +| Task | Launch sub-agents | Parallelize sub-tasks | + +Use `allowed_tools` to control which tools the agent can use: + +```python +# Read-only agent: can inspect but cannot modify +options = ClaudeAgentOptions( + allowed_tools=["Read", "Glob", "Grep"], + permission_mode="bypassPermissions" +) + +# Full agent: can read, write, and execute commands +options = ClaudeAgentOptions( + allowed_tools=["Read", "Write", "Edit", "Bash", "Glob", "Grep"] +) +``` + +--- + +## Advanced Features + +### Hooks: Insert Your Own Logic at Key Points + +Hooks let you inject custom code at critical moments of agent execution - 
for example, logging, intercepting risky operations, and auditing file changes. + +Supported hook types include: `PreToolUse` (before tool execution), `PostToolUse` (after tool execution), `Stop` (when the agent stops), `SessionStart`, `SessionEnd`, and more. + +```python +from datetime import datetime +from claude_agent_sdk import query, ClaudeAgentOptions, HookMatcher + +# Record an audit log every time a file is modified +async def log_file_change(input_data, tool_use_id, context): + file_path = input_data.get("tool_input", {}).get("file_path", "unknown") + with open("./audit.log", "a") as f: + f.write(f"{datetime.now()}: modified {file_path}\n") + return {} + +async def main(): + async for message in query( + prompt="Refactor utils.py for better readability", + options=ClaudeAgentOptions( + permission_mode="acceptEdits", + hooks={ + "PostToolUse": [ + HookMatcher(matcher="Edit|Write", hooks=[log_file_change]) + ] + }, + ), + ): + if hasattr(message, "result"): + print(message.result) +``` + +Real-world uses: +- Audit logging: record every operation performed by the agent +- Security interception: block modifications to critical files +- Notification push: send messages when agent tasks complete +- Cost monitoring: count tool calls and token usage + +### Sub-Agents: Split Big Tasks Across Specialists + +When a task is complex enough, you can define multiple specialized sub-agents and let the main agent delegate sub-tasks to them. Each sub-agent has its own instructions and tool permissions, isolated from each other. 
+ +```python +from claude_agent_sdk import query, ClaudeAgentOptions, AgentDefinition + +async for message in query( + prompt="Use the code-reviewer agent to review this project's code quality", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Glob", "Grep", "Task"], + agents={ + "code-reviewer": AgentDefinition( + description="Professional code reviewer responsible for quality and security reviews", + prompt="Analyze code quality, identify potential issues, and provide improvement suggestions.", + tools=["Read", "Glob", "Grep"], + ), + "test-writer": AgentDefinition( + description="Testing specialist responsible for writing unit tests", + prompt="Write unit tests for functions that are missing tests.", + tools=["Read", "Write", "Bash"], + ), + }, + ), +): + if hasattr(message, "result"): + print(message.result) +``` + +Messages from sub-agents include a `parent_tool_use_id` field, making it easy to track which messages came from which sub-agent. + +### MCP Integration: Connect to the Outside World + +Through Model Context Protocol (MCP), your agent can connect to external systems such as databases, browsers, and third-party APIs. The community already provides [hundreds of MCP servers](https://github.com/modelcontextprotocol/servers) you can use directly. + +```python +# Connect Playwright so the agent can operate a browser +async for message in query( + prompt="Open example.com and describe what you see", + options=ClaudeAgentOptions( + mcp_servers={ + "playwright": { + "command": "npx", + "args": ["@playwright/mcp@latest"] + } + } + ), +): + if hasattr(message, "result"): + print(message.result) +``` + +Common MCP integration scenarios: +- Playwright: browser automation, scraping pages, filling forms +- PostgreSQL/MySQL: direct database querying and operations +- Slack/Email: sending notifications and messages +- GitHub: operating PRs, Issues, and repositories + +--- + +## What Can You Build with It? 
Practical Scenarios + +After understanding features, the most important question is: what can this actually do? Below are real scenarios validated by the community. + +### Scenario 1: Automatic Bug-Fix Agent + +Give it a bug description, and it can find code, locate the issue, fix it, and run tests to verify: + +```python +async for message in query( + prompt="Users report occasional HTTP 500 errors during login. Investigate and fix code under src/auth/", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Edit", "Bash", "Glob", "Grep"], + permission_mode="acceptEdits", + ), +): + print(message) +``` + +Claude will grep logs, read related code, find the bug, modify code, and run tests to confirm the fix. + +### Scenario 2: Code Review Agent + +Build a read-only code review agent that audits quality without making any modifications: + +```python +async for message in query( + prompt="Review code under src/ with focus on security vulnerabilities, performance issues, and coding conventions", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Glob", "Grep"], + permission_mode="bypassPermissions", + ), +): + if hasattr(message, "result"): + print(message.result) +``` + +### Scenario 3: CI/CD Integration + +In a CI pipeline, let the agent analyze failing tests and attempt automatic fixes: + +```python +async for message in query( + prompt="Run npm test, analyze failing test cases, and fix the code so all tests pass", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Edit", "Bash", "Glob"], + max_turns=20, + ), +): + print(message) +``` + +This is a major advantage of Agent SDK over CLI - CLI is good when a human sits at the terminal, while SDK is ideal for embedding into automated workflows. + +### Scenario 4: Research Agent + +Let the agent search the web, read documentation, synthesize information, and produce a report: + +```python +async for message in query( + prompt="Research mainstream Python Web frameworks in 2026. 
Compare FastAPI, Django, and Litestar, then write a technical selection report to report.md", + options=ClaudeAgentOptions( + allowed_tools=["WebSearch", "WebFetch", "Write"], + ), +): + print(message) +``` + +### Scenario 5: Full-Stack Agent with Browser Capability + +By connecting Playwright through MCP, the agent can not only write code but also open a browser to verify results: + +```python +async for message in query( + prompt="Fix the homepage style issue, then open a browser and take screenshots to verify the result", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Edit", "Bash"], + mcp_servers={ + "playwright": { + "command": "npx", + "args": ["@playwright/mcp@latest"] + } + }, + ), +): + print(message) +``` + +### Scenario Quick Reference + +| Scenario | Core Tools | Difficulty | +|------|---------|------| +| Auto bug fixing | Read, Edit, Bash, Grep | Beginner | +| Code review | Read, Glob, Grep | Beginner | +| CI/CD auto-fix | Read, Edit, Bash | Intermediate | +| Technical research report | WebSearch, WebFetch, Write | Beginner | +| Browser automation | MCP (Playwright) | Intermediate | +| Multi-agent collaboration | Task + AgentDefinition | Advanced | +| Database operations | MCP (PostgreSQL/MySQL) | Intermediate | +| Email/notification assistant | MCP (Slack/Email) | Intermediate | + +--- + +## When Should You Use Agent SDK? + +Not every scenario needs Agent SDK. 
Choosing the right tool matters: + +| What You Want to Do | Recommended Tool | +|-----------|---------| +| Simple chat, text generation, translation | Basic `anthropic` SDK | +| One-shot tool use (weather lookup, arithmetic) | Basic `anthropic` SDK | +| Autonomously complete multi-step development tasks | Agent SDK | +| Embed into CI/CD pipelines | Agent SDK | +| Build apps that operate on a file system | Agent SDK | +| Daily interactive development | Claude Code CLI | +| One-off quick tasks | Claude Code CLI | + +In short: if your task requires Claude to "work hands-on" by itself (reading files, editing code, running commands), use Agent SDK. If you only need Q&A, the basic SDK is enough. + +--- + +## Enterprise Practice: Building a Code-Quality Guardrail Pipeline + +The previous scenarios all used one agent for one job. In real enterprise environments, what you need is a full pipeline - multiple agents chained together, each stage with clear input/output, plus auditing, rollback, and notifications. + +Now we will build a real scenario: after each PR submission, automatically trigger **code review -> security scan -> auto-fix -> test verification -> report generation** as a complete pipeline. + +### Architecture Design + +```text +PR submitted + │ + ▼ +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Code Review │───▶│ Security Scan│───▶│ Auto Fix │ +│ Agent │ │ Agent │ │ Agent │ +│ (read-only) │ │ (read-only) │ │ (writable) │ +└─────────────┘ └─────────────┘ └─────────────┘ + │ + ▼ + ┌─────────────┐ ┌─────────────┐ + │ Test Verify │───▶│ Report Build │ + │ Agent │ │ Agent │ + │ (Bash) │ │ (Write) │ + └─────────────┘ └─────────────┘ + │ + ▼ + Slack notification +``` + +Core idea: **each agent does one thing, permissions are minimized, and results are passed in sequence**. 
+ +### Step 1: Define the Pipeline Framework + +```python +import asyncio +import json +from datetime import datetime +from claude_agent_sdk import query, ClaudeAgentOptions, HookMatcher + +# Audit log: record every operation by every agent +audit_log = [] + +async def audit_hook(input_data, tool_use_id, context): + audit_log.append({ + "time": datetime.now().isoformat(), + "tool": input_data.get("tool_name"), + "input": input_data.get("tool_input", {}), + }) + return {} + +# Shared hook config: all agents share audit capability +audit_hooks = { + "PostToolUse": [HookMatcher(matcher=".*", hooks=[audit_hook])] +} +``` + +### Step 2: Code Review Agent (Read-Only) + +```python +async def run_code_review(pr_diff: str) -> str: + """Read-only agent, reviews code quality and outputs a structured report""" + result_text = "" + async for message in query( + prompt=f"""Review the following PR diff from these dimensions: +1. Code conventions: naming, formatting, comments +2. Logic issues: edge cases, null pointer risks, race conditions +3. Performance risks: N+1 queries, memory leaks, unnecessary loops +4. Maintainability: oversized functions, unclear responsibilities, magic numbers + +PR Diff: +{pr_diff} + +Output JSON format: {{"issues": [{{"severity": "high/medium/low", "file": "...", "line": ..., "description": "..."}}], "summary": "..."}}""", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Glob", "Grep"], + permission_mode="bypassPermissions", + hooks=audit_hooks, + max_turns=10, + ), + ): + if hasattr(message, "result"): + result_text = message.result + return result_text +``` + +### Step 3: Security Scan Agent (Read-Only) + +```python +async def run_security_scan() -> str: + """Read-only agent focused on vulnerability scanning""" + result_text = "" + async for message in query( + prompt="""Scan the project code for security vulnerabilities: +1. SQL injection, XSS, CSRF +2. Hardcoded keys or credentials +3. Insecure dependency versions +4. 
Missing permission checks + +Output JSON: {"vulnerabilities": [{"severity": "critical/high/medium", "type": "...", "file": "...", "description": "...", "fix_suggestion": "..."}]}""", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Glob", "Grep", "Bash"], + permission_mode="bypassPermissions", + hooks=audit_hooks, + max_turns=15, + ), + ): + if hasattr(message, "result"): + result_text = message.result + return result_text +``` + +### Step 4: Auto-Fix Agent (Writable) + +```python +async def run_auto_fix(review_result: str, security_result: str) -> str: + """Writable agent that auto-fixes code based on review and scan results""" + result_text = "" + async for message in query( + prompt=f"""Fix code according to the following review results: + +Code review report: +{review_result} + +Security scan report: +{security_result} + +Fix rules: +1. Only fix issues with severity high or critical +2. Run related tests after each change to ensure no existing functionality is broken +3. Do not refactor unrelated code, apply minimal fixes only +4. Output the list of modified files after completion""", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Edit", "Bash", "Glob", "Grep"], + permission_mode="acceptEdits", + hooks=audit_hooks, + max_turns=30, + ), + ): + if hasattr(message, "result"): + result_text = message.result + return result_text +``` + +### Step 5: Test Verification + Report Generation + +```python +async def run_test_and_report(fix_result: str) -> str: + """Run tests and generate final report""" + result_text = "" + async for message in query( + prompt=f"""Execute these actions: +1. Run the full test suite (npm test or pytest) +2. Compute test pass rate +3.
Generate a Markdown quality report into pr-report.md, including: + - Count of issues found in code review and severity distribution + - Number of security vulnerabilities + - Auto-fix changes: {fix_result} + - Test pass rate + - Final conclusion: whether merge is recommended""", + options=ClaudeAgentOptions( + allowed_tools=["Read", "Bash", "Write", "Glob"], + hooks=audit_hooks, + max_turns=15, + ), + ): + if hasattr(message, "result"): + result_text = message.result + return result_text +``` + +### Step 6: Chain the Whole Pipeline + +```python +import subprocess + +async def run_pipeline(): + """Full PR quality-guard pipeline""" + print("🔍 Stage 1/4: code review...") + pr_diff = subprocess.run( + ["git", "diff", "main...HEAD"], capture_output=True, text=True + ).stdout + review_result = await run_code_review(pr_diff) + + print("🛡️ Stage 2/4: security scan...") + security_result = await run_security_scan() + + print("🔧 Stage 3/4: auto-fix...") + fix_result = await run_auto_fix(review_result, security_result) + + print("✅ Stage 4/4: test verification + report generation...") + report = await run_test_and_report(fix_result) + + # Save audit log + with open("audit-log.json", "w") as f: + json.dump(audit_log, f, indent=2, ensure_ascii=False) + + print(f"Pipeline finished, audit log saved ({len(audit_log)} operation records)") + return report + +asyncio.run(run_pipeline()) +``` + +### Enterprise Design Thinking + +This pipeline reflects several key enterprise design principles: + +**Least privilege**: each agent is granted only the tools its stage needs. The code-review agent gets read-only tools (`Read`, `Glob`, `Grep`) and cannot modify code; only the auto-fix agent may edit source files (`Edit`, under `acceptEdits`), and the report agent's `Write` exists solely to produce `pr-report.md`. Note that the security-scan and test agents are also given `Bash`, which can change files, so remove it or restrict the commands it may run if a stage must be strictly read-only. + +**Auditable**: every step of every agent is logged through Hooks. If anything goes wrong, you can trace which agent did what and when. + +**Result chaining**: each agent's output becomes the next agent's input. Review results feed auto-fix; auto-fix results feed test verification.
Every stage has a clear input/output contract. + +**Cost control**: every agent has a `max_turns` limit to prevent runaway loops. In production, you can also add `max_budget_usd` for budget control. + +**Extensibility**: want another stage, such as a "documentation-check agent" or "performance benchmark agent"? Add a new function and insert it into the pipeline. + +This model can be embedded directly into GitHub Actions or GitLab CI, automatically triggered on each PR, truly achieving "AI-driven code quality guardrails." + +--- + +## Error Handling + +Agent SDK provides clear exception types so you can build robust fault tolerance in production: + +```python +from claude_agent_sdk import query, CLINotFoundError, ProcessError + +try: + async for msg in query(prompt="Analyze code"): + print(msg) +except CLINotFoundError: + print("Claude Code CLI is not installed. Please install it first.") +except ProcessError as e: + print(f"Process exited unexpectedly with exit code: {e.exit_code}") +``` + +--- + +## Summary + +The core value of Claude Agent SDK is upgrading "model reasoning" into "controlled execution." It does not just generate text. It can truly complete tasks inside an auditable, constrained tool system. + +Remember a line from Anthropic's official blog: the Agent SDK design philosophy is "give the agent a computer and let it work like a human." + +A good agent application = clear tool design + explicit task boundaries + appropriate human supervision. Tools give the agent capability, boundaries give it constraints, and supervision gives you confidence. None of the three can be missing. 
+ +--- + +## References + +### Official Resources + +- [Agent SDK Official Docs](https://platform.claude.com/docs/en/agent-sdk/overview) - the most authoritative reference +- [GitHub - claude-agent-sdk-python](https://github.com/anthropics/claude-agent-sdk-python) - Python SDK source +- [GitHub - claude-agent-sdk-typescript](https://github.com/anthropics/claude-agent-sdk-typescript) - TypeScript SDK source +- [Agent SDK Demo Projects](https://github.com/anthropics/claude-agent-sdk-demos) - email assistant, research agent, and more + +### Blogs and Tutorials + +- [Building agents with the Claude Agent SDK](https://claude.com/blog/building-agents-with-the-claude-agent-sdk) - Anthropic engineering blog on design philosophy and architecture +- [Claude Agent SDK Python Study Guide](https://redreamality.com/blog/claude-agent-sdk-python-) - Chinese-friendly full tutorial from zero +- [Claude Agent SDK Full Tutorial](https://blog.wenhaofree.com/en/posts/articles/claude-agent-sdk-tutorial/) - practical guide to tool systems, Agent Loop, and controlled execution +- [12 Practical Agent SDK Scenarios](https://skywork.ai/blog/claude-agent-sdk-use-cases-2025/) - covers coding, data, automation, and more +- [Step-by-Step Agent Tutorial](https://skywork.ai/blog/how-to-use-claude-agent-sdk-step-by-step-ai-agent-tutorial/) - TypeScript + Python dual-track tutorial diff --git a/docs/en/stage-3/core-skills/long-running-tasks/index.md b/docs/en/stage-3/core-skills/long-running-tasks/index.md new file mode 100644 index 0000000..77dbe05 --- /dev/null +++ b/docs/en/stage-3/core-skills/long-running-tasks/index.md @@ -0,0 +1,776 @@ +# How to Make Claude Code Work for Long Durations + +## Introduction + +Traditional AI coding assistants are "conversational": you say one thing, it replies once, and then stops. But for real development tasks, this mode is far from enough.
+ +Imagine these scenarios: you want Claude to refactor an entire project, but it edits a few files and says "done"; you want Claude to keep fixing bugs until all tests pass, but it runs once and stops; you want Claude to "work overnight," but next morning you find it stopped long ago. + +In the summer of 2025, an Australian developer named Geoffrey Huntley (who is also a sheep farmer) wrote a 5-line bash script. The script was simple: continuously restart Claude Code and feed it the same task. He named it "Ralph Wiggum," after the Simpsons character who keeps trying and never gives up. + +This simple script shocked Silicon Valley. In just two weeks, related projects got 7,000+ GitHub stars. People used it to generate 6 complete projects overnight, delivered $50,000 contract work with only $297 API cost, and even used it to build a complete programming language in 3 months. + +The core question this chapter solves is: how to make Claude Code work continuously like a real developer until tasks are truly complete. + +--- + +## Core Principle: Why Does AI "Stop Too Early"? + +Before discussing specific methods, first understand the root cause. + +### AI's completion judgment is unreliable + +LLMs have a fundamental weakness: they cannot reliably judge whether work is truly complete. + +Human completion criteria are objective: all tests pass, features are complete, and code quality meets standards. But AI can only judge by "feeling." It may stop because "this looks about right," or because "output seems enough," or because it does not know what to do next. + +That is why we need an external system to determine real completion rather than relying on AI's internal sense. + +### The core idea of the solution + +The core solution is to keep AI working inside a "loop." + +Whenever it tries to exit, the external system checks three questions: is it truly complete? does it meet objective criteria? is anything missing? If not, inject the task again and continue another round. 
+ +This idea can be implemented in many forms, from simple bash scripts to complex orchestration systems, but the essence is the same. + +--- + +## Method 1: While True Bash Loop (Most Primitive Method) + +This is the simplest and most direct implementation. Essentially, write an infinite loop that restarts Claude Code each round and feeds the same task description. + +The simplest implementation is only 5 lines: + +```bash +#!/bin/bash +while true; do + cat PROMPT.md | claude +done +``` + +### How it works + +The script flow is straightforward. Step 1 reads the task description from `PROMPT.md`. Step 2 launches Claude Code and passes the task description in. Step 3 Claude works and outputs results. Step 4 Claude exits after finishing. Step 5 the loop automatically restarts and returns to step 1, creating an infinite cycle unless you interrupt manually with `Ctrl+C`. + +### Pros and cons + +The advantage is extreme simplicity: anyone can understand it, no configuration needed, immediately usable, and good for quick experiments. + +But the disadvantages are obvious: it cannot judge real completion, it may spin forever, it has no safety guardrails, and it can waste API calls. + +### Real usage example + +First, create a `PROMPT.md` file to describe your task. For example, refactoring a user auth module: + +```markdown +# Task: Refactor user authentication module + +Requirements: +1. Extract all authentication logic into an independent AuthService class +2. Add unit tests, coverage > 80% +3. 
Update related documentation + +When all tests pass and docs are updated, output: task complete +``` + +Then create and run the loop script: + +```bash +chmod +x loop.sh +./loop.sh +``` + +### Safer improved version + +To avoid endless loops, add an iteration cap: + +```bash +#!/bin/bash +MAX_ITERATIONS=50 +iteration=0 + +while true; do + iteration=$((iteration + 1)) + echo "=== Iteration $iteration/$MAX_ITERATIONS ===" + + cat PROMPT.md | claude + + if [ $iteration -ge $MAX_ITERATIONS ]; then + echo "Reached maximum iterations, stopping" + break + fi + + sleep 5 # small delay to avoid API rate limits +done +``` + +This improved version adds a max-iteration limit, shows per-round progress, and stops automatically at the limit. It also adds a 5-second delay each loop to avoid rate limiting. + +--- + +## Method 2: Ralph Wiggum Plugin (Official Recommendation) + +Ralph Wiggum is an official Anthropic plugin built specifically for long-running tasks. It is named after the Simpsons character, representing the spirit of "keep trying despite failure." + +### Core mechanism: Stop Hook + +The core of Ralph is Stop Hook. When Claude wants to exit, Stop Hook intercepts the exit signal. Then the system checks: did output include the specific completion marker? If no marker is found, it reinjects the original prompt and starts another iteration. Only when the completion marker is detected is Claude allowed to exit. + +This guarantees Claude does not stop just because it "feels close enough." It must complete clearly marked requirements. + +### Installation + +Ralph Wiggum is an official Claude Code plugin and can be installed in two ways. 
+ +**Option 1: install from official plugin marketplace (recommended)** + +```bash +# run in Claude Code +claude + +# add official plugin marketplace +/plugin marketplace add anthropics/claude-code + +# install Ralph Wiggum +/plugin install ralph-wiggum@claude-code-plugins + +# verify installation +/plugin +``` + +**Option 2: install directly from GitHub** + +```bash +# enter plugin directory +cd ~/.claude/plugins/ + +# clone plugin repo +git clone https://github.com/anthropics/ralph-wiggum-plugin.git +``` + +After installation, you can use: + +- `/ralph-wiggum:ralph-loop` - start loop +- `/ralph-wiggum:cancel-ralph` - cancel loop +- `/ralph-wiggum:help` - show help + +### Basic usage + +Use `/ralph-wiggum:ralph-loop`: + +```bash +/ralph-wiggum:ralph-loop "Build a todo API with CRUD operations, input validation, and tests. + Output COMPLETE when everything is done." \ + --max-iterations 50 \ + --completion-promise "COMPLETE" +``` + +### Parameter explanation + +The two most important parameters are `--max-iterations` and `--completion-promise`. + +`--max-iterations` sets the hard safety cap. Recommended values are typically 20-100. Even if unfinished, Ralph stops at this limit to prevent infinite API spending. + +`--completion-promise` specifies the completion marker text, which must be explicit and unique. Ralph treats the task as complete only when Claude output contains that marker. Use clear markers such as `COMPLETE` or `TASK_DONE`, and avoid ambiguous words. + +### Prompt best practices + +Writing good prompts is key to Ralph success. + +Bad prompts usually do not define completion criteria. For example, "write a todo API" may lead AI to output a rough skeleton and stop, with no tests, no verification, and no docs. + +Good prompts should include phased requirements and clear acceptance criteria. For example: + +Describe phased tasks first. 
Phase 1 is core functionality with all CRUD endpoints: POST `/todos` create, GET `/todos` list, GET `/todos/:id` fetch single, PUT `/todos/:id` update, DELETE `/todos/:id` delete. Phase 2 is input validation: title cannot be empty, completion status must be boolean. Phase 3 is tests: write tests for each endpoint, with coverage > 80%. + +Then define acceptance criteria: all tests pass, code passes linter, README includes API docs. + +Finally define a unique completion marker: `TODO_API_COMPLETE`. + +This way Claude knows exactly what to do and when completion is truly achieved. + +### More prompt templates + +Here are common task templates you can use directly or adapt. + +**Template 1: test migration (Jest -> Vitest)** + +```text +/ralph-wiggum:ralph-loop " +Migrate all tests in this project from Jest to Vitest: +- Keep all test logic unchanged +- Update config files (vite.config.js, vitest.config.js) +- Replace Jest-specific APIs (e.g., jest.mock -> vi.mock) +- Ensure all tests pass +- Remove Jest-related dependencies + +Acceptance criteria: +- npm test passes fully +- no Jest dependency in package.json +- project builds successfully + +Output after completion: VITEST_MIGRATION_COMPLETE +" --max-iterations 40 --completion-promise "VITEST_MIGRATION_COMPLETE" +``` + +**Template 2: UI/UX optimization (mobile-first)** + +```text +/ralph-wiggum:ralph-loop " +Polish this project's UI/UX into a refined mobile-first language learning app: +- unify spacing and whitespace (use 4px base unit) +- establish clear type hierarchy (title/body/auxiliary text) +- unify styles for cards, lists, and shared components +- add bottom navigation (Home/Learn/Quiz/Progress/Settings) +- ensure mobile rendering quality + +Acceptance criteria: +- npm run build succeeds +- no TypeScript errors +- key pages preview correctly on mobile + +Output after completion: UI_UX_COMPLETE +" --max-iterations 25 --completion-promise "UI_UX_COMPLETE" +``` + +**Template 3: bulk TypeScript annotation** + 
+```text +/ralph-wiggum:ralph-loop " +Add TypeScript type annotations to all functions in the project: +- prioritize src/ directory +- add types for function params and return values +- avoid any, use concrete types or unknown +- add necessary type definitions + +Acceptance criteria: +- npm run typecheck passes +- no @ts-ignore or @ts-any comments +- code runs correctly + +Output after completion: TYPES_ADDED +" --max-iterations 30 --completion-promise "TYPES_ADDED" +``` + +**Template 4: TDD-driven feature development** + +```text +/ralph-wiggum:ralph-loop " +Implement checkout functionality using TDD: +1. Write tests first (checkout.test.ts) +2. Run tests (should fail) +3. Write minimal code to pass tests +4. Refactor and optimize +5. Repeat until all tests pass + +Feature requirements: +- shopping cart item list +- shipping fee calculation +- coupon application +- payment form validation + +Acceptance criteria: +- all tests pass (npm test checkout.test.ts) +- code coverage > 80% +- no ESLint errors + +Output after completion: CHECKOUT_COMPLETE +" --max-iterations 25 --completion-promise "CHECKOUT_COMPLETE" +``` + +**Template 5: code style unification** + +```text +/ralph-wiggum:ralph-loop " +Unify code style across the project: +- format all files with Prettier +- unify naming conventions (variables camelCase, components PascalCase) +- remove unused imports and variables +- unify string quotes (single quotes) +- unify semicolon style (no semicolons) + +Acceptance criteria: +- npm run lint passes +- consistent code style +- build succeeds + +Output after completion: STYLE_UNIFIED +" --max-iterations 20 --completion-promise "STYLE_UNIFIED" +``` + +### Real-world cases + +One famous case happened at a Y Combinator hackathon, where a team used Ralph Loop. At 11 PM, they set a task: implement MVPs for 6 product specs in sequence and emit specific completion markers for each one. They set max iterations to 200 and went to sleep. 
+ +The next morning, they had 6 demo-ready projects, and API cost was only $297. That is Ralph's power: while you sleep, AI keeps working. + +Another case came from Boris Cherny (Claude Code lead). With Ralph plus Opus 4.5, he delivered 259 PRs in 30 days, including 497 commits, adding 40,000 lines and deleting 38,000 lines. Most strikingly, all of it was produced by Claude Code without manually writing code. + +An even wilder case is the CURSED programming language. Ralph creator Geoffrey Huntley used Ralph Loop over 3 months to autonomously build a full programming language. Its keywords use Gen Z slang (such as `slay`, `sus`, `based`), and more importantly it includes a full LLVM compiler implementation, standard library, and partial editor support. This demonstrates Ralph Loop's true potential: if you provide a clear target, it can keep working for months until a complex project is truly finished. + +### More real-world cases + +**Automated project refactor** + +One developer used Ralph to refactor a legacy project with messy code, no tests, and missing documentation. The assigned tasks were: + +1. Add tests for existing code +2. Refactor step by step, ensuring tests pass after each change +3. Update documentation + +Ralph ran over a full weekend. By Monday, there were 47 commits, cleaner code structure, 75% test coverage, and complete API docs. Cost was around $12. + +### Ralph philosophy + +Ralph reflects three core philosophies. + +The first is iteration over perfection. Do not expect perfection in one pass; use loops to improve. The first pass may only build a skeleton, second fixes bugs, third optimizes, fourth adds tests; every round gets better. + +The second is failure as data. Every test failure is an opportunity to improve; do not fear failure, learn from it. + +The third is persistent trying: keep trying until it works. That is Ralph spirit. + +### When Ralph is suitable or unsuitable + +Knowing where Ralph fits helps save both time and cost. 
+ +**Suitable scenarios for Ralph** + +These tasks have clear completion criteria and are good for automatic iteration: + +| Scenario | Why | +|------|------| +| Test migration | Clear target framework, validated by passing tests | +| Large refactors | Specific refactor rules can be defined | +| Framework migration | Successful migration is verifiable by working code | +| Bulk type annotation | Done when typecheck passes | +| Test coverage improvement | Coverage percentage is objective | +| Documentation generation | API docs can be automatically validated | +| UI/UX unification | Concrete design rules can be defined | +| Bug fixes with repro | Pass condition is testable | + +**Unsuitable scenarios for Ralph** + +These tasks require human judgment or exploration: + +| Scenario | Why | +|------|------| +| Architecture decisions | e.g., microservices vs monolith requires trade-off judgment | +| Security-sensitive code | Vulnerabilities can be subtle and hard to detect automatically | +| Ambiguous requirements | No clear completion criteria | +| Exploratory work | Direction changes continuously | +| Creative design | Requires human aesthetic judgment | +| Simple one-off tasks | Using Ralph is overkill | + +**Decision checklist** + +Ask yourself three questions: +1. **Can I define explicit completion criteria?** If not, not suitable +2. **Is there an objective validation method?** (tests/build/typecheck) If not, not suitable +3. **Does this task require continuous human feedback?** If yes, not suitable + +If none of the three checks says "not suitable" (that is, your answers are yes, yes, and no), let Ralph run. + +--- + +## Method 3: Enhanced Ralph + +This is a community-enhanced implementation of official Ralph. The [frankbria/ralph-claude-code](https://github.com/frankbria/ralph-claude-code) project adds stronger safety mechanisms. + +### Additional features + +Enhanced Ralph adds several extra safety features. + +First is dual exit conditions.
Official Ralph checks only the completion marker, but the enhanced version requires both the completion marker and explicit `EXIT_SIGNAL` before stopping. This means even if Claude outputs completion marker, loop can continue for additional verification unless explicit exit appears. + +Second is rate limiting. Default is 100 runs/hour, preventing runaway API bills if a bug causes endless loops. You can adjust this limit. + +Third is a smart circuit breaker. If the system detects completion marker 5 consecutive times, it force-stops. This prevents rare edge cases where loops fail to terminate correctly. + +Fourth is a real-time dashboard. Enhanced Ralph provides a command-line dashboard showing current iterations, task progress, and estimated cost. + +### Installation + +Install enhanced Ralph by cloning from GitHub: + +```bash +git clone https://github.com/frankbria/ralph-claude-code.git +cd ralph-claude-code +./install.sh +``` + +The install script sets required files and configuration automatically. + +### Usage + +Enhanced Ralph usage has two steps. First initialize project with `ralph-setup`: + +```bash +ralph-setup my-project +``` + +This creates required config files in project. Then start loop with `ralph loop`: + +```bash +ralph loop +``` + +### Configuration file + +Enhanced Ralph uses `.claude/ralph-config.json`: + +```json +{ + "maxIterations": 50, + "rateLimitPerHour": 100, + "completionPromise": "TASK_COMPLETE", + "exitSignal": "EXIT_NOW", + "costAlertThresholds": [10, 50, 100] +} +``` + +`maxIterations` is max loop count. `rateLimitPerHour` is hourly rate cap. `completionPromise` is completion marker text. `exitSignal` is explicit exit signal. `costAlertThresholds` defines budget warning levels. + +--- + +## Method 4: Agent Teams (Parallel Multi-Agent) + +When tasks are large enough, a single Claude is not enough; you need "team collaboration." 
+ +Agent Teams is an advanced capability that lets multiple Claude instances run in parallel and coordinate through shared task lists and dependencies. This is suitable for very large projects. In Nicholas Carlini's experiment, 16 parallel agents produced 100,000+ lines of code in two weeks and built a C compiler capable of compiling the Linux kernel. + +Agent Teams is more complex, and we will cover it in detail in the next section: "3.3 Agent Teams Multi-Agent Collaboration." + +--- + +## Method 5: Background Tasks (Ctrl+B) + +This is a simple and practical non-blocking execution method. + +### Basic operation + +Usage is straightforward. When Claude starts a task, press `Ctrl+B` to push it to background. + +For example, you say: "Run full test suite." Claude begins running. You press `Ctrl+B`, and Claude replies: "Task pushed to background (ID: task_abc123)." Then you can continue: "Meanwhile, analyze this log file." Claude can analyze logs while tests continue in background. + +### Viewing background tasks + +There are several ways to check background tasks. Use `/tasks` to list all tasks with task ID, state, and start time. Press `Ctrl+T` for quick status summary. You can also bring a task back to foreground to inspect live output. + +### Suitable scenarios + +Background tasks are good for typical situations: + +First, long-running tests. Full suites may take tens of minutes, and background mode avoids blocking. + +Second, large project builds. Build pipelines can run while you continue other work. + +Third, batch file operations such as mass rename and formatting. + +Fourth, anything you do not want to wait for synchronously. + +--- + +## Safety Mechanisms: Preventing Infinite Loops + +Any automated loop system must include protections, otherwise it may run out of control. + +### Hard limits + +The most basic protection is setting `--max-iterations` (maximum loop count). This is mandatory. 
Regardless of completion state, task stops at this cap and prevents unlimited API spending. + +You can also enforce time limits, for example auto-stop after 4 hours. You can also set budget alerts that pause and notify at spend thresholds (for example 10 USD, 50 USD, 100 USD). + +### Intelligent detection + +You can add smart dead-loop detection. For example, check whether recent commits include meaningful changes: + +```bash +if [ $(git diff HEAD~5 | wc -l) -eq 0 ]; then + echo "No substantive changes in the last 5 commits, possible loop" + exit 1 +fi +``` + +If recent diffs are minimal, system may be stuck and should stop with alert. + +### Cost alerts + +Set cost alert thresholds in config: + +```json +{ + "costAlertThresholds": [10, 50, 100], + "alertAction": "pause_and_notify" +} +``` + +When spending reaches 10, 50, or 100 USD, system pauses and notifies so you can decide whether to continue. + +### Manual checkpoints + +For important tasks, add manual checkpoints: + +```bash +if [ $((iteration % 10)) -eq 0 ]; then + read -p "Completed $iteration iterations. Continue? (y/n)" answer + if [ "$answer" != "y" ]; then + break + fi +fi +``` + +This pauses every 10 iterations for confirmation, allowing timely human intervention. + +--- + +## Practical Build: Complete BBS Forum with Ralph Loop + +Let's use a full example to show Ralph Loop power. We will build a BBS-style forum system from scratch, including user auth, posting, profile center, and admin backend. 
+ +### Project objective + +Build a fully functional BBS forum system with: + +**User-side features:** +- user registration, login, logout +- browse post list (pagination) +- view post detail +- publish new posts +- comment feature +- profile center (view own posts, update profile) + +**Admin backend features:** +- admin login +- user management (ban/unban) +- post management (delete/pin) +- comment management +- system statistics + +**Tech stack:** +- backend: Node.js + Express + SQLite +- frontend: React + React Router + Axios +- auth: JWT token +- styling: Tailwind CSS + +### Preparation + +First install Ralph Wiggum plugin: + +```bash +claude /plugins:add ralph-wiggum +``` + +### Start Ralph Loop + +Now launch Ralph Loop to build the whole project: + +```bash +/ralph-wiggum:ralph-loop " +Please build a complete BBS forum system from scratch using TDD. + +Project structure requirements: +- backend/ directory: Express API server +- frontend/ directory: React frontend app +- both directories have their own tests + +Backend requirements: +- use Express framework +- SQLite storage (better-sqlite3) +- JWT auth (jsonwebtoken + bcrypt) +- user table: id, username, password, email, role, createdAt +- post table: id, title, content, authorId, category, pinned, createdAt +- comment table: id, content, postId, authorId, createdAt + +Backend API endpoints: +- POST /api/auth/register - user register +- POST /api/auth/login - user login +- GET /api/posts - get post list (pagination + category filter) +- GET /api/posts/:id - get post detail +- POST /api/posts - create post (auth required) +- PUT /api/posts/:id - edit post (author or admin) +- DELETE /api/posts/:id - delete post (author or admin) +- POST /api/posts/:id/comments - add comment (auth required) +- GET /api/user/profile - get profile (auth required) +- PUT /api/user/profile - update profile (auth required) +- GET /api/admin/stats - admin statistics (admin only) +- GET /api/admin/users - user list (admin only) +- PUT 
/api/admin/users/:id/ban - ban user (admin only) + +Frontend page requirements: +- /login - login page +- /register - register page +- / - home page (post list) +- /post/:id - post detail +- /new - publish post +- /profile - profile center +- /admin - admin panel (admin permission required) + +Admin panel features: +- user management (view, ban, unban) +- post management (view, delete, pin) +- comment management (view, delete) +- system statistics (user count, post count, comment count) + +TDD requirements: +- write tests first, then implementation +- each feature must have corresponding tests +- backend uses Jest, API tests cover all endpoints +- frontend uses Vitest, component tests cover major features +- auth middleware must have tests + +Acceptance criteria: +- npm test (backend) passes +- npm test (frontend) passes +- frontend starts and works correctly +- backend API responds correctly +- proper permission isolation between normal users and admin +- code passes ESLint checks + +Output after completion: BBS_SYSTEM_COMPLETE +" --max-iterations 150 --completion-promise "BBS_SYSTEM_COMPLETE" +``` + +### Expected time + +Based on complexity: + +**If coded manually**: about 40-60 hours (including schema design, auth system, frontend/backend integration, and testing) + +**Using Ralph Loop**: +- base version (core features): around 3-5 hours +- full version (admin backend + tests): around 6-10 hours + +### Monitoring progress + +While Ralph Loop is running, you can monitor progress in several ways: + +**Iteration count**: Ralph shows current and max iterations, which helps estimate remaining time. + +**Logs**: you can see what Claude is doing now, such as designing schema, writing APIs, building components, and fixing bugs. + +**Test status**: every test run result is shown. Passing tests increase and failing tests decrease. When failures begin to drop, project is approaching completion. 
+ +### Post-completion verification + +After Ralph outputs the completion marker, perform manual verification (run each group of commands below from the project root): + +```bash +# backend tests +cd backend +npm test + +# frontend tests +cd frontend +npm test + +# start backend +cd backend +npm start + +# start frontend (in another terminal) +cd frontend +npm run dev +``` + +Open the browser and test: + +1. register a new user +2. log in +3. browse posts +4. publish a new post +5. add a comment +6. open the profile center +7. log out and log in as admin (for example admin/admin123, if the generated project seeds such an account; the prompt above does not define one, so check the seed data or README that Claude produced) +8. test admin backend features + +### Notes + +Ralph Loop is powerful, but keep these points in mind: + +**First, more detailed prompts produce better results.** Ambiguous prompts require more iterations for correction. + +**Second, set reasonable iteration caps.** BBS systems are complex; we recommend at least 100 iterations. + +**Third, TDD is recommended.** Writing tests first can significantly reduce debugging time. + +**Fourth, final manual verification is required.** AI may miss edge cases or special scenarios, especially in security-sensitive paths. + +**Fifth, pay close attention to schema design.** Ralph may need several iterations before landing on a robust schema. + +--- + +## Method Comparison and Selection + +Each method has its own characteristics and fits different scenarios. + +While True Loop is the simplest: only 5 lines to run, good for quick experiments and prototypes. But it is limited and does not detect real completion, relying only on iteration caps. + +Ralph Wiggum is the general recommendation for most scenarios. It has a complete Stop Hook mechanism, supports completion-marker checks, and has official support and solid docs. + +Enhanced Ralph is better for production environments, with dual exit conditions, rate limits, and smart circuit breakers. + +Background tasks are useful for simple non-blocking execution: just press `Ctrl+B`. But this is only background execution, not iterative loop orchestration.
+ +--- + +## Summary + +The core idea for making Claude Code work long-term is simple: do not ask it to "finish in one shot"; ask it to "keep trying until true completion." + +All methods are fundamentally doing the same thing: give Claude a task, let it run, check whether completion is real, and if not, continue with the next round. + +Which method to choose depends on your needs. + +If you want something simple and fast, use While True Loop. Five lines are enough to run it, but its features are limited. + +If you want a general-purpose default, use Ralph Wiggum. It has official support and complete capabilities, and it suits most cases. + +If this is for production usage, use enhanced Ralph. It has extra safety mechanisms and is more reliable. + +(For Agent Teams multi-agent collaboration, see the next section: "3.3 Agent Teams Multi-Agent Collaboration.") + +Hopefully this chapter helps you use Claude Code more effectively so AI becomes a true productivity tool rather than only a chatbot. + +--- + +## References + +### Official Resources + +- [Claude Code Official Docs](https://docs.anthropic.com/en/docs/claude-code) - complete official Claude Code documentation +- [Ralph Wiggum Plugin README](https://github.com/anthropics/claude-plugins-official/tree/main/plugins/ralph-wiggum) - official plugin documentation +- [Claude Code Hooks](https://docs.anthropic.com/en/docs/claude-code/configuration/hooks) - official Hooks system docs + +### Community Projects + +- [frankbria/ralph-claude-code](https://github.com/frankbria/ralph-claude-code) (2.1k stars) - enhanced Ralph implementation with additional safeguards +- [Awesome Ralph](https://github.com/snwfdhmp/awesome-ralph) - curated Ralph resources and examples +- [Ralph Ryan](https://github.com/wquguru/ralph-ryan) - PRD generation + Ralph loop integration +- [snarktank/ralph](https://github.com/snarktank/ralph) - original Ralph implementation + +### Articles and Tutorials + +**English resources** + +- [Geoffrey Huntley - Ralph Technique](https://ghuntley.com/ralph/) -
original Ralph concept by creator +- [Effective Framework Practices for Reliable Long-Running AI Agents](https://m.blog.csdn.net/weixin_48708052/article/details/158044721) - deep read of Anthropic engineering blog +- [Complete Claude Code Guide](https://developer.aliyun.com/article/1705912) - full usage guide + +**Chinese tutorials** + +- [Beginner-Friendly Tutorial - CSDN](https://m.blog.csdn.net/zsr154278963/article/details/156637281) - detailed install and usage guide +- [Deep Analysis - Toutiao](https://m.toutiao.com/a7585579989207188006/) - mechanism and core principles +- [Full-Stack Plain-Language Guide](https://www.jdon.com/90167-ralph-wigum-loop-explained-for-teens.html) - complete walkthrough from principles to practice +- [Beginner and Practical Guide - CNBlogs](https://www.cnblogs.com/buwai/p/19625356) - basics and practical examples +- [Ralph Loop Deep Dive - CSDN](https://m.blog.csdn.net/roamingcode/article/details/156732443) - Stop Hook mechanism details +- [Claude Code Perpetual Engine - CSDN](https://m.blog.csdn.net/qq_44866828/article/details/156736656) - infinite-loop iteration plugin deep dive +- [Ralph Loop New User Starter - CNBlogs](https://www.cnblogs.com/gyc567/p/19495639) - best practices and prompt summary + +### Practical Case Studies + +- [CURSED Programming Language](https://github.com/geoffreyhuntley/cursed) - complete programming language built with Ralph over 3 months +- [Boris Cherny's 30 Days](https://twitter.com/boriskirov/status/1756002385683786616) - 259 PRs case share +- [Y Combinator Hackathon](https://github.com/geoffreyhuntley/ralph) - 6-project overnight generation case +- [Geoffrey Huntley's Blog](https://ghuntley.com/) - creator's technical blog diff --git a/docs/en/stage-3/core-skills/mcp/index.md b/docs/en/stage-3/core-skills/mcp/index.md new file mode 100644 index 0000000..042a7cb --- /dev/null +++ b/docs/en/stage-3/core-skills/mcp/index.md @@ -0,0 +1,571 @@ +# Claude Code MCP Complete Guide + +## What is Claude Code 
MCP? + +**Claude Code** is Anthropic's official AI command-line tool, while **MCP (Model Context Protocol)** is the protocol that allows Claude Code to connect to external tools and services. + +Put simply, MCP turns Claude Code from an AI assistant that can only read and write local files into a super assistant that can access GitHub, databases, APIs, and cloud services. + +## Why use MCP in Claude Code? + +### Claude Code without MCP + +```text +What you can do: +✓ Read local files +✓ Edit code +✓ Run commands +✓ Use Bash tools + +What you cannot do: +✗ View your GitHub Issues +✗ Access a cloud database +✗ Call external APIs +✗ Get real-time weather +``` + +### Claude Code with MCP + +```text +What you can do: +✓ All original functions +✓ View / create GitHub Issues and PRs +✓ Query SQLite and PostgreSQL databases +✓ Access external services such as Notion and Slack +✓ Get real-time weather and map data +✓ Browser automation +✓ ...and more +``` + +## Quick Start + +### Step 1: Understand where the config files live + +Claude Code's MCP configuration files are located at: + +| Level | Config file path | Scope | +|-----|-------------|----------| +| **User level** | `~/.claude.json` | All projects | +| **Project level** | `.claude/mcp.json` | Current project | + +It is recommended to use **project-level config** first, so different projects can use different MCP services. + +### Step 2: Add MCP servers with natural language + +In Claude Code, you do not need to manually edit configuration files or memorize commands. You can describe what you want in natural language: + +```text +You: Help me add a GitHub MCP server. My token is ghp_xxx + +Claude: I'll help you configure the GitHub MCP server... + +[Automatically updates .claude/mcp.json] +``` + +```text +You: Add a SQLite database server. The database file is at ./data/app.db + +Claude: Okay, I'll configure the SQLite MCP server... 
+``` + +```text +You: Add an HTTP-type MCP server with the address https://api.example.com/mcp + +Claude: I'll add that remote MCP server... +``` + +### Step 3: Verify the configuration + +Ask Claude Code directly: + +```text +You: What MCP servers are available now? + +Claude: Currently configured MCP servers: +• github - GitHub integration +• sqlite - SQLite database +• filesystem - Filesystem access +``` + +Or use the diagnostic command: + +```text +/doctor +``` + +### Step 4: Start using it + +Once configuration succeeds, you can call MCP functions directly with natural language: + +```text +You: Help me create an Issue on GitHub + +Claude: I can help you create a GitHub Issue. Please tell me: +- the repository address, for example owner/repo +- the Issue title +- the Issue description +``` + +## Natural-language management in Claude Code + +### View and manage MCP servers + +You can interact with Claude Code entirely in natural language: + +```text +You: List all configured MCP servers + +You: Check the connection status of the MCP servers + +You: Delete the MCP server named notion + +You: Update the token for the github server +``` + +### Diagnose problems + +When you run into problems: + +```text +You: Check what's wrong with the MCP connection + +Claude: [will automatically run diagnostics, analyze configuration files, and check server status] +``` + +## Configuration methods in detail + +### User-level config, global + +Edit `~/.claude.json`: + +```json +{ + "mcpServers": { + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/yourname/Documents"] + }, + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "your-token" + } + } + } +} +``` + +### Project-level config, recommended + +Edit `.claude/mcp.json` in the project root: + +```json +{ + "mcpServers": { + "project-db": { + "command": "npx", + "args": ["-y", 
"@modelcontextprotocol/server-sqlite", "--db-path", "./data/app.db"] + } + } +} +``` + +**Advantages of project-level config:** + +- Team members can share the config by committing it to Git +- Different projects can use different MCP services +- Configuration is more flexible and does not pollute global settings + +### Transport modes + +Claude Code supports three transport modes: + +#### STDIO, local process + +```json +{ + "mcpServers": { + "local-tool": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/path"] + } + } +} +``` + +#### HTTP, remote service + +```json +{ + "mcpServers": { + "remote-api": { + "url": "https://api.example.com/mcp", + "transport": "http", + "headers": { + "Authorization": "Bearer your-token" + } + } + } +} +``` + +#### SSE, server-sent events + +```json +{ + "mcpServers": { + "streaming": { + "url": "https://api.example.com/sse", + "transport": "sse" + } + } +} +``` + +## Practical examples + +### Example 1: GitHub workflow automation + +```text +You: Help me push the current changes to GitHub, then create a PR titled "Add new feature" + +Claude: +1. Check current git status... +2. Create a new branch feature/new-feature... +3. Commit changes... +4. Push to remote... +5. Call github_create_pull_request to create the PR... +6. PR created: https://github.com/owner/repo/pull/123 +``` + +### Example 2: Database query + +```text +You: Query the 10 most recently registered users in the database + +Claude: +1. Connect to the SQLite database... +2. Execute query: SELECT * FROM users ORDER BY created_at DESC LIMIT 10 +3. Return results: + - Zhang San (zhang@example.com) - 2025-02-28 + - Li Si (li@example.com) - 2025-02-27 + ... +``` + +### Example 3: Webpage screenshot analysis + +```text +You: Open https://example.com, take a screenshot, and analyze the page design + +Claude: +1. Launch browser... +2. Navigate to https://example.com... +3. Capture screenshot... +4. [Returns screenshot] +5. 
Page design analysis: + - The layout uses a centered design with a max width of 1200px + - The main color is blue (#3b82f6) + - The navigation bar is fixed to the top + ... +``` + +### Example 4: Integration with Notion + +```text +You: Save the notes I just wrote to Notion + +Claude: +1. Use the Notion MCP server... +2. Create a new page... +3. Saved: https://notion.so/page/xxx +``` + +## Debugging tips + +### Diagnose with natural language + +When something goes wrong, just tell Claude Code directly: + +```text +You: My MCP server cannot connect. Please check it for me + +You: The GitHub MCP tool call failed. What is the reason? + +You: Why does the sqlite server always show "connecting"? +``` + +Claude Code will automatically: + +1. Check the configuration file format +2. Validate environment variables +3. Test the server connection +4. Provide concrete fix suggestions + +### Common problem troubleshooting + +| Problem | Possible cause | Solution | +|-----|---------|----------| +| Server not connected | Config file format error | Check JSON syntax | +| Tool cannot be called | Insufficient permissions | Check environment variables | +| Connection timeout | Network problem | Check URL or network | +| Process crashes | Bug in server code | Check server logs | + +### Manual diagnostic command + +```text +/doctor +``` + +Example output: + +```text +System Diagnostic Report: +=============== + +Claude Code: v2.5.0 ✓ +Node.js: v20.0.0 ✓ + +MCP server status: +• github: ✓ Connected (12 tools) +• sqlite: ✗ Connection failed - Database file not found +• puppeteer: ✓ Connected (8 tools) + +Suggestions: +1. Check whether the sqlite database path is correct +2. Make sure the .claude/mcp.json format is correct +``` + +## Best practices + +### 1. Prefer project-level configuration + +**Why recommend project-level configuration?** + +Different projects often need different MCP services. 
For example, a frontend project may need browser testing tools, while a backend project may need database connections. With project-level configuration, each project can have its own dedicated set of MCP servers, avoiding the chaos of one large global config. + +More importantly, project-level config can be committed to Git. After team members clone the project, they can directly use the same MCP services without reconfiguring everything. + +```text +Project A, frontend project -> .claude/mcp.json contains browser testing MCP +Project B, backend project -> .claude/mcp.json contains database MCP +``` + +### 2. Store sensitive information in environment variables + +**Never hard-code secrets in the configuration file.** + +Configuration files may be accidentally committed to Git and leak keys. The correct approach is to store sensitive values in environment variables and only reference the variable names from the config file. That way, even if the config file becomes public, the real secrets are still hidden. + +```json +{ + "env": { + "GITHUB_TOKEN": "$GITHUB_TOKEN", + "GITHUB_TOKEN": "ghp_abc123" + } +} +``` + +The two `GITHUB_TOKEN` entries above are alternatives shown side by side for comparison; a real config must contain only one of them, because a JSON object should not repeat a key. The first form is good because it reads from the environment variable. The second form is bad because it hard-codes a secret directly. + +### 3. Pin versions + +**Why do you need to pin versions?** + +By default, `npx -y` will always use the latest version of an MCP server. This can cause problems: a new version may introduce breaking changes, or a package may suddenly be removed or renamed. + +By appending `@version` to the package name, you ensure that a validated version is always used, reducing surprises caused by automatic upgrades. + +```json +{ + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github@1.2.3"] +} +``` + +### 4.
Document your MCP configuration + +**Help teammates understand the MCP setup quickly** + +When a project includes multiple MCP servers, new team members may not understand what each server is for or what configuration it requires. Creating a `README.md` under the `.claude/` directory that explains each server's purpose, required config, and how to obtain credentials can significantly reduce communication cost. + +Create `.claude/README.md` in your project: + +```markdown +# MCP Configuration Notes + +MCP servers used in this project: + +## github +Used for GitHub automation. Requires GITHUB_TOKEN. + +## sqlite +Connects to ./data/app.db for querying and modifying data. + +## puppeteer +Used for E2E testing. +``` + +## Claude Code vs Claude Desktop + +| Feature | Claude Code | Claude Desktop | +|-----|-------------|----------------| +| **Config file** | `~/.claude.json` or `.claude/mcp.json` | `claude_desktop_config.json` | +| **Project-level config** | ✓ Supported | ✗ Not supported | +| **Natural-language management** | ✓ Supported | ✗ Manual editing required | +| **Diagnostics** | ✓ `/doctor` | ✗ None | +| **Hot reload** | ✓ Automatic | ✗ Requires app restart | +| **Use cases** | Development workflow, CI/CD | Daily use, office tasks | + +## Common MCP servers + +> 💡 For the complete MCP server list, please refer to the appendix: [MCP Server Directory](/zh-cn/appendix/mcp-servers/) + +### GitHub server + +**Function:** Issues, PRs, repository management + +```json +{ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "your-token" + } + } + } +} +``` + +**Get a token from:** https://github.com/settings/tokens + +### SQLite server + +**Function:** Query and manage SQLite databases + +```json +{ + "mcpServers": { + "sqlite": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-sqlite", "--db-path", "./data/database.db"] + } + } +} +``` + +### 
Filesystem server + +**Function:** Access files inside a specified directory + +```json +{ + "mcpServers": { + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/yourname/Documents"] + } + } +} +``` + +### Puppeteer browser automation + +**Function:** Browser control, screenshots, automated testing + +```json +{ + "mcpServers": { + "puppeteer": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-puppeteer"] + } + } +} +``` + +### Brave search server + +**Function:** Web search + +```json +{ + "mcpServers": { + "brave-search": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-brave-search"], + "env": { + "BRAVE_API_KEY": "your-brave-api-key" + } + } + } +} +``` + +## Reference resources + +### Official documentation + +- [Claude Code official documentation - MCP](https://docs.anthropic.com/en/docs/claude-code/mcp) +- [MCP official website](https://modelcontextprotocol.io/) +- [MCP specification documentation](https://modelcontextprotocol.io/specification/) +- [MCP GitHub repository](https://github.com/modelcontextprotocol) + +### Official servers + +- [@modelcontextprotocol/server-github](https://github.com/modelcontextprotocol/servers/tree/main/src/github) - GitHub integration +- [@modelcontextprotocol/server-sqlite](https://github.com/modelcontextprotocol/servers/tree/main/src/sqlite) - SQLite database +- [@modelcontextprotocol/server-postgres](https://github.com/modelcontextprotocol/servers/tree/main/src/postgres) - PostgreSQL database +- [@modelcontextprotocol/server-filesystem](https://github.com/modelcontextprotocol/servers/tree/main/src/filesystem) - Filesystem access +- [@modelcontextprotocol/server-puppeteer](https://github.com/modelcontextprotocol/servers/tree/main/src/puppeteer) - Browser automation +- [@modelcontextprotocol/server-fetch](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) - Web fetching +-
[@modelcontextprotocol/server-brave-search](https://github.com/modelcontextprotocol/servers/tree/main/src/brave-search) - Brave search +- [@modelcontextprotocol/server-git](https://github.com/modelcontextprotocol/servers/tree/main/src/git) - Git operations + +### Tutorial articles + +- [A thorough explanation of MCP principles and practice](https://view.inews.qq.com/a/20250414A023WV00) +- [MCP (Model Context Protocol) architecture and how it works](https://m.toutiao.com/w/1826385835060307/) +- [2025 latest large-model tutorial: from getting started to mastering the MCP protocol](https://m.blog.csdn.net/weixin_45653328/article/details/150916706) +- [Learn MCP from scratch (8) - build an MCP server](https://juejin.cn/post/7582510291667419187) + +### Configuration guides + +- [Claude Code best practices](https://www.anthropic.com/engineering/claude-code-best-practices) +- [Claude Code complete configuration guide](https://juejin.cn/post/7576838552472043563) + +### Development tutorials + +- [Beginner-friendly MCP server practical guide in both TypeScript and Python](https://m.blog.csdn.net/ztt123654/article/details/150844207) +- [Ultimate MCP server building guide: complete TypeScript and Python tutorials](https://m.blog.csdn.net/gitblog_00703/article/details/154862128) +- [Build the simplest MCP server with TypeScript](https://m.blog.csdn.net/weixin_45653525/article/details/148433757) +- [Generate a TypeScript MCP server using Azure container applications](https://learn.microsoft.com/zh-cn/azure/developer/ai/build-mcp-server-ts) + +### MCP server resources + +- [Awesome MCP Servers](https://github.com/punkpeye/awesome-mcp-servers) - the most comprehensive MCP server list +- [Official MCP Registry](https://registry.modelcontextprotocol.io) - Anthropic's official app store +- [MCP.so](https://mcp.so) - community MCP server center +- [Glama.ai MCP](https://glama.ai/mcp/servers) - MCP directory with ratings and comments +- [Smithery](https://smithery.ai) - MCP server 
marketplace +- [MCPHub](https://mcphub.io/registry) - clean interface directory +- [LobeHub MCP](https://lobehub.com/zh/mcp) - Chinese MCP directory + +### Map and weather services + +- [Amap MCP Server](https://lobehub.com/zh/mcp/luozengchang-mcp-amap) +- [Tencent Location Service MCP documentation](https://lbs.qq.com/service/MCPServer/MCPServerGuide/overview) +- [Caiyun Weather MCP Server](https://github.com/caiyunapp/mcp-caiyun-weather) +- [OpenWeatherMap MCP Server](https://github.com/CodeByWaqas/weather-mcp-server) + +### Community resources + +- [Everything Claude Code Config](https://github.com/affaan-m/everything-claude-code) - production-grade Claude Code configuration collection +- [AI Coding Guide](https://github.com/hacket/AICodingGuide) - Chinese learning path for Claude Code + +### Real-world application cases + +- [BlenderMCP - AI-driven 3D modeling](https://github.com/Belthur/blender-mcp) - 4,100+ ⭐ +- [15 best practices for MCP in production](https://learn.microsoft.com/zh-cn/azure/azure-functions/scenario-mcp-apps) diff --git a/docs/en/stage-3/core-skills/mobile-development/index.md b/docs/en/stage-3/core-skills/mobile-development/index.md new file mode 100644 index 0000000..d71bf4d --- /dev/null +++ b/docs/en/stage-3/core-skills/mobile-development/index.md @@ -0,0 +1,935 @@ +# Claude Code Remote Development on Mobile + +## Introduction + +Imagine these scenarios: you suddenly think of a brilliant bug-fix idea on the subway during your commute; you receive an urgent production incident alert while waiting in line at a cafe; you want to check how your AI-built project is progressing while accompanying your girlfriend shopping. + +In traditional development workflows, these scenarios usually mean you need to find a place to open your laptop, or helplessly postpone the work. But in the AI-assisted coding era, the rules have changed. 
Claude Code makes it possible to carry your development environment in your pocket and stay productive anytime, anywhere. + +In the summer of 2025, as Claude Code adoption grew, developers started exploring different "coding on phone" approaches. From simple local Termux usage, to complex SSH + Tailscale remote connections, to dedicated Happy Coder apps, a full mobile development ecosystem gradually took shape. + +The core problem this chapter solves is: how to make Claude Code follow your phone and become a true "pocket development assistant." + +--- + +::: info Community Feedback at a Glance + +Based on real-world community feedback, the experience of each approach compares as follows: + +**Happy Coder (Approach 2)** +- Connection stability issues: disconnections happen often, and context is lost after disconnects +- Limited functionality: cannot use `/` commands +- Security concerns: depends on official relay servers, and some users are concerned about data security + +**HAPI (Approach 3)** +- Supports self-hosted servers: can be deployed on your own VPS +- Better experience when paired with Tailscale: run `hapi server` on your computer and connect from your phone through the Tailscale IP +- Relatively stable connection, suitable for long-term use + +**Claude Remote Control (Official Approach)** +- Official solution, natively integrated with Claude Code +- Supports full access to local environments (MCP, tools, project configuration) +- Requires Max subscription (Pro support is coming soon) +- Relies on Anthropic cloud connectivity + +**Recommendation**: if you require high connection stability, or are concerned about third-party relay security, choose **HAPI + Tailscale** or the **official Remote Control** approach. + +::: + +--- + +## Core Principle: Mobile Development Architecture Patterns + +Before introducing specific approaches, first understand the essence of the problem. + +### Why is mobile development a problem? 
+ +Traditional IDEs (such as VS Code and IntelliJ) require a full operating system environment, strong CPU, large memory, and storage space. Although phones are increasingly powerful, they still have natural limits for development experience: + +**Input constraints**: virtual keyboards are inefficient for coding, and complex syntax is easy to mistype + +**Screen constraints**: small screens make it hard to view code, terminal, and browser at the same time + +**Environment constraints**: phones cannot run full development toolchains (compilers, databases, debuggers) + +**Connection constraints**: mobile networks are unstable, and SSH sessions disconnect easily + +### Core idea: thin-client architecture + +The core idea behind all mobile development approaches is the same: the phone is only the "control console"; real development work is done elsewhere. + +```text +┌─────────────────────────────────────────────────────────────┐ +│ │ +│ ┌─────────────┐ ┌─────────────┐ │ +│ │ Phone │ │ Host/Cloud │ │ +│ │ (Controller)│ ────────► │ (Executor) │ │ +│ │ │ Commands │ │ │ +│ │ • Send cmds │ │ • Run CLI │ │ +│ │ • View out │ │ • Exec code │ │ +│ │ • Review │ │ • Access fs │ │ +│ └─────────────┘ └─────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +This architecture allows the phone to focus only on human-computer interaction, while heavy computation is delegated to your host or cloud. + +--- + +## Approach 1: Official iOS App + +In October 2025, Anthropic officially launched Claude Code mobile support in the iOS app. This is the simplest mobile development option. + +### Regional limitations + +Important note: the Claude app **cannot be used directly** in mainland China. + +If you are in mainland China, it is recommended to use **Happy Coder** directly (Approach 2), which can work normally through configured domestic API relay services. + +If you have an overseas Apple ID, you can switch regions and download the Claude app. 
+ +### How it works + +```text +┌─────────────┐ ┌─────────────────┐ +│ iOS App │ ──────────────────► │ Anthropic Cloud │ +│ (Phone) │ HTTPS + OAuth │ Claude Code │ +└─────────────┘ └────────┬────────┘ + │ + ▼ + ┌───────────────┐ + │ GitHub API │ + └───────────────┘ +``` + +Your phone app only sends commands. All code execution runs in Anthropic's cloud sandbox, and results are synced through GitHub. + +### Basic usage + +**Prerequisites:** + +- iPhone with iOS 15 or later +- Claude Pro/Team/Enterprise subscription (free plan is not supported) +- GitHub account + +**Steps:** + +1. Download Claude app from App Store +2. Log in to your Anthropic account +3. Find the "Code" tab in the app +4. Connect your GitHub repository through OAuth +5. Start creating tasks + +### Pros and cons + +Pros are zero setup barrier, smooth experience, and push notifications. Cons are iOS-only support, primary GitHub workflow, relatively limited capability (cannot access local file systems), and no direct availability in mainland China. + +--- + +## Approach 2: Happy Coder + +Happy Coder is an open-source mobile and web client designed for Claude Code and Codex, with end-to-end encryption and remote control of your AI coding assistant from anywhere. + +### How it works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Happy App │ ────────► │ Happy Server │ ◄──────── │happy-coder │ +│ (Phone/Web) │ Encrypted WS │ (Relay) │ WebSocket │ (Desktop) │ +└─────────────┘ └─────────────┘ └──────┬──────┘ + │ + ▼ + ┌─────────────┐ + │Claude Code │ + │ CLI │ + └─────────────┘ +``` + +On your computer, run `happy` instead of `claude` to launch your AI coding assistant. When you need phone control, the session automatically switches to remote mode. Press any key on your computer to switch back to local control. 
+ +### Installation and usage + +**Step 1: download app** + +| Platform | Link | +|------|------| +| iOS | [App Store](https://apps.apple.com/us/app/happy-claude-code-client/id6748571505) | +| Android | [Google Play](https://play.google.com/store/apps/details?id=com.ex3ndr.happy) | +| Web | [app.happy.engineering](https://app.happy.engineering) | + +**Step 2: install CLI on computer** + +```bash +npm install -g happy-coder +``` + +**Step 3: launch and pair** + +```bash +# run in your project directory +cd ~/my-project +happy + +# a pairing QR code will be shown +``` + +**Step 4: scan and pair on phone** + +Open Happy app and scan the QR code shown on your computer. After pairing succeeds, you can control Claude Code from your phone. + +**Step 5: use** + +```bash +# launch Claude Code +happy + +# or launch Codex +happy codex +``` + +### Resource links + +- [GitHub Project](https://github.com/slopus/happy) - source code +- [Documentation](https://happy.engineering/docs) - usage docs +- [Discord Community](https://discord.gg/fX9WBAhyfD) - community discussion + +### Pros and cons + +Pros are simple setup, cross-platform support, end-to-end encryption, and open-source auditability. Cons are dependence on third-party relay infrastructure and the need to verify mobile app availability in your own environment. + +--- + +## Approach 3: HAPI + +HAPI is an alternative to Happy Coder, with a local-first design and support for seamless device switching across multiple AI models. + +### How it works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ HAPI App │ ────────► │ HAPI Server │ ◄──────── │ hapi │ +│ (Phone/PWA/ │ WireGuard │ (Self-hosted│ WireGuard │ (Desktop) │ +│ Telegram) │ + TLS │ relay) │ + TLS │ │ +└─────────────┘ └─────────────┘ └──────┬──────┘ + │ + ▼ + ┌─────────────┐ + │Claude Code │ + │ / Codex / │ + │ Gemini etc. │ + └─────────────┘ +``` + +HAPI uses WireGuard plus TLS for end-to-end encryption. 
All communication goes through encrypted relay servers. You can self-host relay servers to fully control your data flow. + +### Core features + +- **Seamless switching**: switch control between desktop and phone; press any key to return to local control +- **Native-first**: mobile apps are wrapped with native technology for smooth interaction +- **AFK approvals**: receive approval requests on your phone while away from your computer +- **Multi-model support**: supports Claude Code, Codex, Gemini, OpenCode, and more +- **Terminal anywhere**: access via PWA, Telegram Mini App, and more +- **Voice control**: supports voice input commands, so your hands stay free + +### Installation and usage + +**Step 1: start relay server** + +```bash +# run on your server (or launch directly with npx) +npx @twsxtd/hapi hub --relay +``` + +**Step 2: install CLI on computer** + +```bash +# run in your project directory +cd ~/my-project +npx @twsxtd/hapi + +# or install globally +npm install -g @twsxtd/hapi +hapi +``` + +**Step 3: pair devices** + +Follow terminal prompts, open HAPI app on your phone, and scan the QR code to complete pairing. 
+ +**Step 4: access methods** + +| Access Method | Description | +|---------|------| +| Web PWA | Browser access, supports install-to-home-screen | +| Telegram Mini App | Use directly inside Telegram | +| Mobile App | Native app experience (if published) | + +### Differences from Happy Coder + +| Feature | Happy Coder | HAPI | +|------|-------------|------| +| Design philosophy | Cloud-first | Local-first | +| Encryption method | WebSocket + E2E | WireGuard + TLS | +| Multi-model support | Claude Code, Codex | Claude, Codex, Gemini, OpenCode | +| Access methods | iOS/Android/Web | PWA, Telegram, more | +| Voice control | No | Yes | +| AFK approvals | No | Yes | +| Self-hosted relay | Requires manual deployment | Out-of-the-box support | + +### Resource links + +- [GitHub Project](https://github.com/tiann/hapi) - source code +- [PWA Docs](https://github.com/tiann/hapi/blob/main/docs/pwa.md) - PWA installation and usage +- [How It Works](https://github.com/tiann/hapi/blob/main/docs/how-it-works.md) - technical implementation details +- [Voice Assistant](https://github.com/tiann/hapi/blob/main/docs/voice.md) - voice control features +- [Why HAPI](https://github.com/tiann/hapi/blob/main/docs/why-hapi.md) - design philosophy +- [FAQ](https://github.com/tiann/hapi/blob/main/docs/faq.md) - frequently asked questions + +### Pros and cons + +Pros are local-first design, multi-model support, end-to-end encryption, voice control, and self-hosted relay capability. Cons are that the project is relatively new and the ecosystem is still growing. + +--- + +## Approach 4: SSH + Tailscale + Tmux + +This is the best option for professional developers. You remotely connect to your development machine over SSH and keep sessions persistent with Tmux. 
+ +### How it works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Phone │ ────────► │ Tailscale │ ◄──────── │ Computer │ +│ (SSH client)│ VPN P2P │ relay/hole │ VPN P2P │ (dev host) │ +└─────────────┘ └─────────────┘ └──────┬──────┘ + │ + ▼ + ┌─────────────┐ + │ Tmux │ + │ (session │ + │ persistence)│ + └─────────────┘ +``` + +Tailscale creates a peer-to-peer VPN so you can access your home computer from any network. Tmux ensures Claude Code keeps running in the background even when SSH disconnects. + +### Why do you need Tailscale? + +**Problems with traditional SSH:** + +```text +Phone (4G) ──XX──> Router NAT ──XX──> Home Computer + (cannot penetrate) (LAN isolation) +``` + +Your computer is on a private network, and your phone is on the public network, so direct access fails. Traditional solutions require port forwarding plus dynamic DNS, which are complex and risky. + +**Tailscale solution:** + +```text +Phone (4G) ──► Tailscale Relay ──◄── Home Computer + (auto hole-punch or relay) +``` + +Tailscale uses NAT traversal, and falls back to relay automatically if traversal fails. The entire connection is encrypted. + +### Full setup steps + +**Step 1: install Tailscale on computer** + +```bash +# macOS +brew install --cask tailscale + +# or download installer +# https://tailscale.com/download +``` + +**Step 2: log in and get IP** + +```bash +# start Tailscale +sudo tailscale up + +# check Tailscale IPv4 +tailscale ip -4 +# example output: 100.x.x.x +``` + +**Step 3: install Tailscale on phone** + +Download Tailscale from App Store or Google Play and log in with the same account. 
+ +**Step 4: install and configure Tmux** + +```bash +# macOS +brew install tmux + +# create ~/.tmux.conf +cat > ~/.tmux.conf << 'EOF' +# enable mouse support +set -g mouse on + +# default terminal with 256 colors +set -g default-terminal "screen-256color" + +# change prefix key to Ctrl+A (optional) +unbind C-b +set -g prefix C-a + +# simplified split shortcuts +bind v split-window -h +bind h split-window +EOF +``` + +**Step 5: create a persistent session** + +```bash +# create session named "claude" +tmux new -s claude + +# start Claude Code in this session +cd ~/my-project +claude + +# detach without closing +# press Ctrl+A then D (Ctrl+B then D if you kept the default prefix) +``` + +**Step 6: connect from phone SSH client** + +Recommended SSH clients: + +| Client | Platform | Notes | +|--------|------|------| +| Blink Shell | iOS | Supports MOSH, great for unstable networks | +| Termius | iOS/Android | Cross-platform and polished UI | +| a-Shell | iOS | Free and lightweight | + +Connection config: + +```text +Host: 100.x.x.x (your Tailscale IP) +Port: 22 +Username: your computer username +``` + +After connecting, attach to Tmux: + +```bash +tmux attach -t claude +``` + +### Advanced tips + +**Prevent your computer from sleeping:** + +```bash +# macOS +caffeinate -dimsu & + +# or set System Settings > Energy Saver > prevent automatic sleep +``` + +**Use MOSH for unstable networks:** + +MOSH (Mobile Shell) is an SSH alternative optimized for mobile networks, with seamless recovery across network changes. + +```bash +# install on computer +brew install mosh + +# use MOSH from phone client +# Blink Shell supports MOSH natively +``` + +**One-command connect script:** + +Set this as startup command in your SSH client: + +```bash +tmux attach -t claude || tmux new -s claude +``` + +This will auto-attach to an existing session or create a new one. + +### Pros and cons + +Pros are full capabilities and desktop-equivalent workflow with all development tools.
Cons are more complex setup and the requirement to keep your computer online. + +--- + +## Approach 5: Local Termux Runtime + +If you are an Android user, you can run Claude Code directly on your phone without connecting external devices. + +### How it works + +```text +┌─────────────────────────────────────────────────────────────┐ +│ │ +│ ┌─────────────┐ │ +│ │ Termux │ │ +│ │ (Linux env) │ │ +│ │ │ │ +│ │ • Node.js │ │ +│ │ • Claude │ │ +│ │ Code CLI │ │ +│ │ │ │ +│ │ • Project │ │ +│ │ files │ │ +│ │ • Git │ │ +│ └─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────┐ │ +│ │Anthropic API│ │ +│ └─────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +Termux is a terminal emulator and Linux environment for Android. You can directly install Node.js and Claude Code in it. + +### Installation steps + +**Important**: download Termux from [F-Droid](https://f-droid.org/), not from Google Play (the Play version is outdated). + +**Step 1: install base tools** + +```bash +# update package manager +pkg update && pkg upgrade + +# install development tools +pkg install git nodejs python vim +``` + +**Step 2: install Claude Code** + +```bash +npm install -g @anthropic-ai/claude-code +``` + +**Step 3: configure environment** + +```bash +# create workspace +mkdir -p ~/projects +cd ~/projects + +# initialize project +git clone https://github.com/your-repo.git +cd your-repo + +# launch Claude Code +claude +``` + +**Step 4: configure external keyboard (recommended)** + +In Termux: + +```bash +# enable extra keys row +# long press screen > More > Extra keys row + +# configure shortcuts +# add in ~/.termux/termux.properties +extra-keys = [['ESC','/','-','HOME','UP','END','PGUP','~'], \ + ['TAB','CTRL','ALT','LEFT','DOWN','RIGHT','PGDN','|']] +``` + +### Performance considerations + +| Task Type | Android Performance | +|---------|-------------| +| Web development (HTML/CSS/JS) | Excellent | +| Python scripts | Excellent | +| Node.js applications | Good | 
+| Running test suites | Medium | +| Compiling large projects | Not recommended | + +### Pros and cons + +Pros are full local control, no external host dependency, and offline-first operation. Cons are limited phone performance, weak text input experience, and Android-only availability. + +--- + +## Approach 6: Claude Code UI + +Claude Code UI (also known as CloudCLI) is an open-source project that provides a web interface for Claude Code, with phone browser support. + +### How it works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│Phone Browser│ ────────► │ Web Server │ ◄──────── │Claude Code │ +│ │ HTTP/HTTPS │ (localhost) │ invoke │ CLI │ +└─────────────┘ └─────────────┘ └─────────────┘ +``` + +You run a web server on your computer, then access it from your phone browser. This requires LAN access or tunneling. + +### Installation and usage + +**Step 1: install** + +```bash +# one-command start (recommended) +npx @siteboon/claude-code-ui + +# or global install +npm install -g @siteboon/claude-code-ui +claude-code-ui +``` + +**Step 2: open interface** + +Server defaults to `http://localhost:3001`. + +**Step 3: access from phone** + +Method A - LAN access (same Wi-Fi): + +```bash +# bind all interfaces +claude-code-ui --host 0.0.0.0 + +# access from phone (replace your-computer-ip with your computer's LAN IP address) +http://your-computer-ip:3001 +``` + +Method B - ngrok tunnel: + +```bash +# install ngrok +brew install ngrok + +# start tunnel +ngrok http 3001 + +# open ngrok URL from phone +``` + +### Features + +- Responsive design with mobile support +- Built-in chat interface +- File browser +- Git operations UI +- Session management + +### Pros and cons + +Pros are graphical interface and rich features. Cons are tunnel requirements outside LAN and relatively more complex setup. + +--- + +## Approach 7: Cloud Development Environment + +If you do not have an always-on local computer, you can use cloud development environments where Claude Code runs on cloud servers.
+ +### How it works + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Phone │ ────────► │ Cloud Box │ ─────────► │Claude Code │ +│(Browser/App)│ HTTPS │ (DevBox) │ │ CLI │ +└─────────────┘ └─────────────┘ └─────────────┘ +``` + +A cloud container comes with Claude Code preinstalled, and you access it from browser or mobile app. + +### Using Sealos DevBox + +**Step 1: create environment** + +Go to [Sealos DevBox](https://sealos.io/devbox), choose a Claude Code template, and create an environment. + +**Step 2: start development environment** + +Environment is ready in about 30-60 seconds, and you get a web terminal. + +**Step 3: configure Claude API** + +```bash +export ANTHROPIC_API_KEY="your-api-key" +``` + +**Step 4: connect Happy app** + +```bash +# install happy-coder (or use preinstalled) +npm install -g happy-coder + +# generate pairing QR code +happy +``` + +After scanning on your phone, you can use it immediately. + +### Cloud option comparison + +| Platform | Claude Code | Mobile Optimization | Startup Time | Pricing | +|------|------------|----------|----------|------| +| Sealos DevBox | Preinstalled | Happy support | ~60s | Pay-as-you-go | +| GitHub Codespaces | Manual setup | Browser flow | ~2-3 min | Free quota + hourly | +| Gitpod | Manual setup | Browser flow | ~1-2 min | Free quota + hourly | +| Replit | No native Claude Code | Native app | Instant | Free + subscription | + +### Pros and cons + +Pros are no local computer requirement, environment consistency, and scalability. Cons are paid usage, network dependency, and code hosted in cloud. + +--- + +## Comparison and Selection + +Each approach has different strengths and is suitable for different scenarios. 
+ +### Comparison table + +| Approach | Difficulty | Requires Tunnel | Cost | Best Scenarios | +|------|------|-------------|------|----------| +| Official iOS App | Easy | No | $20/month | Quick checks, simple tasks | +| Happy Coder | Relatively easy | No | Free | Daily use, convenience | +| HAPI | Medium | No | Free | Multi-model, local-first | +| SSH + Tailscale | Relatively complex | No | Free | Professional development, full features | +| Termux | Medium | No | Free | Android local development | +| Claude Code UI | Medium | Yes | Free | Web interface preference | +| Cloud DevBox | Easy | No | Pay-as-you-go | No local computer | + +### Selection guide + +**If you are in mainland China**: use **Happy Coder**; with domestic API relay setup, it works well. + +**If you want maximum convenience**: choose Happy Coder. Scan-and-use flow is very convenient. + +**If you need multi-model support**: choose HAPI. It supports multiple AI coding assistants and is ideal for model switching workflows. + +**If you have an always-on computer**: choose SSH + Tailscale. This gives the most complete experience. + +**If you are an iPhone user (outside mainland China)**: official app is the easiest way to get started. + +**If you only have Android**: Termux gives a fully local mobile development path. + +**If you do not have a computer**: cloud DevBox is the ideal choice. + +--- + +## Security and Privacy + +Mobile development involves code transfer over networks, so security needs special attention. + +### Risks of relay servers + +When using relay-dependent services like Happy Coder or HAPI, consider these risks: + +```text +┌─────────────────────────────────────────────────────────────┐ +│ │ +│ What can a relay server potentially see? │ +│ │ +│ • Data before encryption (if E2E is implemented poorly) │ +│ • Metadata (when you connect, how long sessions run) │ +│ • Your API key (if configured incorrectly) │ +│ │ +│ What can a relay server potentially do? 
│ +│ │ +│ • Record your code content │ +│ • Steal API credentials │ +│ • Inject malicious commands │ +│ • Abuse your device as an attack node │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Security best practices + +**1. Code sensitivity grading** + +```text +┌─────────────────────────────────────────────────────────────┐ +│ │ +│ Public projects/learning code -> any approach is acceptable│ +│ │ +│ Private projects -> prefer SSH+Tailscale or self-hosted │ +│ │ +│ Commercial code -> use SSH+Tailscale only, disable all │ +│ third-party relay paths │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +**2. Key management** + +```bash +# do not hard-code keys in source +const apiKey = "sk-ant-xxxxx" + +# use environment variables +const apiKey = process.env.ANTHROPIC_API_KEY + +# use .env files (add to .gitignore) +ANTHROPIC_API_KEY=sk-ant-xxxxx +``` + +**3. Use sandbox mode** + +Claude Code supports sandbox mode to limit access scope: + +```bash +claude --sandbox /path/to/project +``` + +**4. Self-host relay** + +If using Happy Coder, consider self-hosting relay: + +```bash +# clone project (includes server implementation) +git clone https://github.com/slopus/happy.git +cd happy + +# deploy server to your VPS +# follow project documentation for details +``` + +**5. Use Headscale** + +Headscale is an open-source implementation of Tailscale and can be self-hosted: + +```bash +# one-command Docker deployment +docker run -d \ + --name headscale \ + -v /srv/headscale:/etc/headscale \ + -p 3478:3478/udp \ + -p 8080:8080 \ + headscale/headscale:latest +``` + +--- + +## Frequently Asked Questions + +### Do I need NAT traversal? 
+ +Most modern approaches **do not** require manual NAT traversal: + +| Approach | Principle | +|------|------| +| Happy Coder | Relay mode, both sides actively connect to server | +| HAPI | Relay mode, WireGuard + TLS | +| Tailscale | NAT hole-punching or relay | +| iOS App | Cloud execution | +| Claude Code UI | Requires inbound access | + +### Why does relay mode not require traversal? + +```text +Outbound connection (NAT allows): +Computer ──► Relay Server yes + +Inbound connection (NAT blocks): +External ──► Computer no + +Relay trick: +Both sides make outbound connections to the relay, +so neither side needs inbound connectivity. +``` + +### Does mobile development affect battery life? + +Different approaches consume different power: + +| Approach | Power Usage | Reason | +|------|--------|------| +| SSH terminal | Low | Text-only rendering | +| iOS App | Medium | Cloud execution, phone controls only | +| Termux | High | Local CLI runtime | +| Browser | Medium | Web UI rendering load | + +For long sessions, keep your phone charging. + +### What happens when network disconnects? + +| Approach | Impact of Network Disconnect | +|------|-------------| +| SSH + Tmux | Claude keeps running; recover on reconnect | +| Happy Coder | Auto-reconnect | +| HAPI | Auto-reconnect | +| iOS App | Cloud continues; app shows disconnect | +| Termux | Session interruption | + +### Can I compile large projects on a phone? + +Not recommended. Phone CPU and memory are limited, and large builds can cause: + +- significant heating +- rapid battery drain +- very long compile times + +Run heavy build tasks on remote hosts or cloud environments. + +--- + +## Summary + +The core idea of Claude Code mobile development is: **the phone is the controller, and real development runs elsewhere**. + +Which approach you should choose depends on your specific needs. + +If you are in mainland China, **Happy Coder** is recommended, especially when paired with domestic API relay configuration. 
+ +If you want the most convenient setup, use **Happy Coder**. Scan to connect, get push notifications, and switch devices smoothly. + +If you need multi-model support or local-first architecture, use **HAPI**. It supports multiple assistants and self-hosted relay. + +If you want the most complete development experience, use **SSH + Tailscale**. Setup is more complex, but capability is closest to desktop. + +If you are an iOS user outside mainland China, the **official app** is the easiest way to begin. + +If you are an Android user, **Termux** enables fully local development on the phone. + +If you do not have an always-on computer, **cloud DevBox** is the ideal option. + +No matter which solution you choose, security matters: be cautious with third-party relay for sensitive code, manage API keys properly, and prefer self-hosted or private paths for important projects. + +--- + +## References + +### Official Resources + +- [Claude Code Official Docs](https://docs.anthropic.com/en/docs/claude-code) - complete official Claude Code documentation +- [Claude iOS App](https://apps.apple.com/app/claude/id6473753684) - official iOS app + +### Open Source Projects + +- [slopus/happy](https://github.com/slopus/happy) (2.5k stars) - Happy Coder mobile client +- [tiann/hapi](https://github.com/tiann/hapi) - HAPI local-first multi-model AI coding assistant +- [siteboon/claudecodeui](https://github.com/siteboon/claudecodeui) - Claude Code UI (CloudCLI) +- [juanfont/headscale](https://github.com/juanfont/headscale) (19k stars) - open-source Tailscale implementation + +### Chinese Tutorials + +- [Code Anytime Anywhere: Configure Claude Code on Phone](https://m.blog.csdn.net/haa_y/article/details/151156494) - Termux setup guide +- [AI Lab in Your Pocket: Always-Online Claude Code Mobile Workflow](https://www.cnblogs.com/swizard/p/19308983) - Tmux + Docker approach +- [I Took Claude Code Shopping with My Girlfriend](https://post.m.smzdm.com/p/a3r7d63d/) - Tailscale remote 
connection +- [Build Production Apps from Phone](https://m.toutiao.com/article/7611823834756301318/) - real mobile development case + +### English Resources + +- [The Definitive Guide to Using Claude Code on Your Phone | Sealos Blog](https://sealos.io/blog/claude-code-on-phone/) - most comprehensive mobile guide +- [SSH + Tailscale + Termius Complete Guide](https://m.blog.csdn.net/Lvyizhuo/article/details/157692953) - detailed remote connectivity guide + +### Tool Downloads + +- [Tailscale](https://tailscale.com/download) - peer-to-peer VPN tool +- [Termux (F-Droid)](https://f-droid.org/en/packages/com.termux/) - Android terminal emulator +- [Blink Shell](https://blink.sh/) - iOS SSH client (MOSH support) +- [Termius](https://termius.com/) - cross-platform SSH client diff --git a/docs/en/stage-3/core-skills/skills/index.md b/docs/en/stage-3/core-skills/skills/index.md new file mode 100644 index 0000000..cc6c5a4 --- /dev/null +++ b/docs/en/stage-3/core-skills/skills/index.md @@ -0,0 +1,1309 @@ +# Claude Code Skills Complete Guide + +## Introduction to Skills + +**Claude Code Skills** is a feature that packages specialized knowledge, workflows, and best practices into reusable "skill packs." + +You can imagine Skills as "skill books" equipped for Claude. When you need it to complete a specific task, you no longer have to explain the requirements over and over again. Instead, it can directly carry out the work according to the standards defined in advance by the Skill. + +### Why do we need Skills? 
+ +Before Skills existed, using Claude Code had several problems: + +- **Repeated instructions**: every time, you had to explain things like "what coding style to follow" and "how commit messages should be written" +- **Knowledge could not accumulate**: team members' individual experience using Claude could not be shared +- **Inconsistent standards**: different people using Claude could get completely different results +- **Low efficiency**: common tasks had to be explained from scratch every time + +Skills solve these problems and turn Claude into an "experienced team member" - it knows your project conventions, workflows, and best practices. + +--- + +## Why learn Skills now? + +**Skills are becoming a must-have capability for AI engineers**: + +- **High community interest**: related GitHub repositories are gaining stars rapidly. For example, the OpenSkills project has already reached 7.2k stars, and Obsidian Skills gained 6.6k stars in just 9 days +- **Official support**: Anthropic maintains an official Skills repository, and Vercel has launched Agent Skills and the find-skills tool +- **Highly practical**: from code review and Git operations to video creation and PPT generation, Skills cover many scenarios. The skills.sh platform already has popular skills with 60K+ subscriptions +- **Efficiency gains**: configure once, reuse repeatedly, and let Claude truly become your "digital employee" +- **Developer recognition**: recommended by multiple technical communities and widely considered a key tool for improving AI programming efficiency + +--- + +## Quick Start + +Now that you understand the value of Skills, let's try them right away. This section will take you through installing your first Skill and completing a few interesting hands-on tasks so you can quickly build intuition. + +### Step 1: Install `find-skills` (strongly recommended) + +Before you start using Skills, it is strongly recommended that you install `find-skills` first. 
It is the "ultimate skill search tool" in the AI Agent world and already has 60K+ subscriptions. + +**What is `find-skills`?** + +Simply put, `find-skills` is like an "app store search engine" for AI Agents. When you need to complete a task but do not have a suitable local Skill, it will automatically search for and recommend the most appropriate one. + +**Install `find-skills`:** + +```bash +npx skills add vercel-labs/skills@find-skills -g -y +``` + +After installation, you can directly tell Claude what you need, and it will use `find-skills` to search for relevant skills automatically. + +**Example usage:** + +```text +I need to optimize the performance of a React component. Help me find what skills I can use. +``` + +Claude will search through `find-skills`, then tell you which relevant skills it found so that you can choose one to install. + +**Why install `find-skills` first?** + +Before `find-skills`: +- manually search GitHub for related skills +- copy, install, and configure them one by one +- repeatedly debug and adapt them + +After `find-skills`: +- describe the requirement in one sentence +- AI automatically searches for the best matching skill +- install with one click and use it immediately + +**Note for Windows users**: the official version has limited Windows support. The community has made a Windows-compatible version that supports CMD and PowerShell and adds Chinese-language search. + +Download the Windows version: [github.com/tongbei821/customize-skills](https://github.com/tongbei821/customize-skills/blob/main/findskills/SKILL.md) + +Installation steps: +1. Download the Windows version of `SKILL.md` +2. Replace the file in `C:/Users/your-username/.agents/skills/find-skills` +3. 
Restart Claude Code and it will take effect + +**Related links**: +- [Skills official website](https://skills.sh/) - browse all available skills +- [find-skills repository](https://github.com/vercel-labs/agent-skills) - official source code + +### Install and Try Your First Skill + +After installing `find-skills`, let's use it to search for and install a fun first Skill: the Remotion video creation tool. + +#### Step 1: Use `find-skills` to search for Remotion + +Type this in Claude Code: + +```text +Help me find skills related to Remotion. I want to make videos. +``` + +Claude will search via `find-skills` and recommend `remotion-dev/skills`. + +#### Step 2: Install Remotion Skills + +```bash +npx skills add remotion-dev/skills -g +``` + +#### Step 3: Use it to build something fun + +Remotion is a framework for making videos with React code. After installing this Skill, you can ask Claude in natural language to help you write video code. + +**Task 1: Make a cool animated text video** + +```text +Use Remotion to make a video: +- 1920x1080, 5 seconds +- A line of text "Hello World" flies in from the left +- With rotation and scaling effects at the same time +- The background is a gradient +``` + +Claude will generate complete Remotion code, and you can run it to see the animation. + +**Task 2: Make a data visualization video** + +```text +Make a 10-second video showing data growth: +- Start with a bar chart +- The bars grow one by one with animation +- Numbers count upward +- At the end, show large text saying "300% growth" +``` + +**Task 3: Make a multi-scene product demo video** + +```text +Make a product demo video with three scenes: +Scene 1: Logo fades in, 2 seconds +Scene 2: Product features appear one by one, 3 seconds +Scene 3: CTA button pops up, 2 seconds +Use smooth transitions between each scene +``` + +**Run the code**: + +The code Claude generates is a complete Remotion project. You can: + +1. Create a new project: `npx create-video my-video` +2. 
Copy Claude's generated code into it +3. Run a preview: `npm start` +4. Render the video: `npm run build` + +--- + +### The Second Skill: Use `find-skills` to solve "the frontend looks ugly and feels slow" + +#### Step 1: Describe your problem in natural language + +Directly tell Claude your high-level need: + +```text +My website looks outdated and loads slowly. Help me find what skills I can use. +``` + +Or make it a bit more specific: + +```text +I want the frontend to look better and stop being so laggy. +``` + +#### Step 2: Claude will search with `find-skills` + +Claude will search the skills.sh database via `find-skills` and recommend related skills. For a requirement like "make it look better + reduce lag," it will recommend: + +**anthropics/skills/frontend-design** (official skill) + +This skill is specifically designed to solve the problem of AI-generated interfaces that "look plain and generic," helping Claude design: + +- unique visual styles that avoid the same old "AI template look" +- professional color schemes and typography +- smooth animation effects +- production-grade code quality, with clean code and naturally better performance + +#### Step 3: Install and use it + +**Install**: + +```bash +npx skills add anthropics/skills/frontend-design -g +``` + +**Tasks you can complete with it**: + +```text +Help me redesign this page. I want it to look very professional and not like it was generated by AI. +``` + +```text +This UI is too ugly. Rewrite it in a more modern design style. +``` + +```text +Make a dark-theme dashboard with a strong tech feel. +``` + +Claude will follow this skill's conventions and help you design: +- a unique visual direction such as minimalism, retro-futurism, or brutalism +- carefully chosen colors and fonts +- reasonable spacing and layout +- smooth interactive animation + +--- + +### Comparing the Two Skills + +| Skills | What problem does it solve? 
| Fun factor | +|--------|-------------|---------| +| **remotion-dev/skills** | Make videos with code | ⭐⭐⭐⭐⭐ | +| **anthropics/skills/frontend-design** | Make the frontend look better | ⭐⭐⭐⭐ | + +--- + +### The Third Skill: Use `frontend-slides` to quickly make beautiful PPT presentations + +#### Introduction + +**frontend-slides** is a Skill that lets you create beautiful HTML presentations with natural language - even if you do not know any CSS or JavaScript. + +Its core idea is "**show, don't tell**." If you cannot clearly describe the design style you want, it will generate 3 visual previews for you to choose from, rather than forcing you to describe abstract requirements like "blue background, large font." + +#### Install `frontend-slides` + +**Method 1: Install manually** + +```bash +# Create the skill directory +mkdir -p ~/.claude/skills/frontend-slides + +# Download files (or copy from GitHub) +# 1. Visit https://github.com/zarazhangrui/frontend-slides +# 2. Download SKILL.md and STYLE_PRESETS.md +# 3. Put them into ~/.claude/skills/frontend-slides/ +``` + +**Method 2: Install with `find-skills`** + +```text +Help me find a skill for making PPT presentations +``` + +Claude will search through `find-skills` and recommend `frontend-slides`. + +#### Usage scenarios + +**Scenario 1: Create a presentation from scratch** + +```text +/frontend-slides + +I want to create a fundraising pitch deck for an AI startup project, around 10 slides +``` + +Claude will guide you to: +1. fill in the content of each slide such as titles, bullet points, and images +2. describe the feeling you want such as stunning, professional, or warm +3. choose from 3 visual style previews +4. create the complete HTML presentation +5. open a preview in the browser + +**Scenario 2: Convert a PowerPoint file** + +```text +/frontend-slides + +Convert my presentation.pptx into a web presentation +``` + +Claude will: +1. extract all text, images, and notes from the PPT +2. 
show the extracted content for you to confirm +3. let you choose a visual style +4. generate an HTML presentation that preserves all original content + +**Scenario 3: Quickly generate style previews** + +```text +/frontend-slides + +I want to make a PPT for a technical talk. Show me the available visual styles first. +``` + +Claude will directly generate 3 preview pages in different styles: +- **Dark themes**: Neon Cyber, Terminal Green, Deep Space +- **Light themes**: Paper & Ink, Swiss Modern, Soft Pastel +- **Special styles**: Brutalist, Gradient Wave + +#### Built-in visual styles + +| Style name | Characteristics | Suitable scenarios | +|---------|------|---------| +| **Neon Cyber** | Futuristic tech feel, particle effects | Technical talks, AI products | +| **Midnight Executive** | High-end business, trustworthy | Business reports, fundraising pitches | +| **Paper & Ink** | Editorial style, literary atmosphere | Content creation, educational sharing | +| **Swiss Modern** | Clean geometry, Bauhaus style | Design portfolios, minimalism | +| **Brutalist** | Raw, bold, attention-grabbing | Art showcase, personal expression | + +#### Output result + +The generated presentation is a **single-file HTML** document that includes: + +- complete styling and interaction code +- keyboard navigation with arrow keys and space +- touch and swipe support +- mouse wheel slide turning +- progress bars and navigation dots +- scroll-triggered animation +- responsive design + +```html + + + + + + +
+

Your Title

+
+ + + +``` + +#### Why recommend it? + +1. **Zero dependency**: a single HTML file that will still open 10 years from now +2. **Visual discovery**: no need to describe the design, just pick what you like +3. **PPT conversion**: keep your existing content and give it a better visual skin +4. **Production-grade code**: accessible, clearly commented, and easy to customize + +**Related links**: +- [frontend-slides GitHub repository](https://github.com/zarazhangrui/frontend-slides) - 6.1k+ stars +- [Online preview example](https://github.com/zarazhangrui/frontend-slides#output-example) + +--- + +### Comparing the Three Skills + +| Skills | What problem does it solve? | Fun factor | Practicality | +|--------|-------------|---------|---------| +| **remotion-dev/skills** | Make videos with code | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | +| **anthropics/skills/frontend-design** | Make the frontend look better | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | +| **frontend-slides** | Quickly make beautiful PPTs | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | + +--- + +### How to use them after installation + +After installation, you do not need any extra configuration. When you ask Claude to perform a related task, it will automatically call the corresponding Skill. + +View installed Skills: + +```bash +npx skills list +``` + +--- + +## What are Skills? + +### Core concept + +**Skills are "skill packs" stored in the file system** and can include: + +- **SKILL.md**: the definition file for the skill, required +- **scripts/**: helper scripts, optional +- **templates/**: output templates, optional +- **references/**: reference docs, optional + +### Skills vs. prompts + +You may wonder: what is the difference between Skills and directly sending prompts to Claude? 
+ +| Prompts | Skills | +|--------|--------| +| Temporary, you have to repeat them every time | Persistent, write once and reuse many times | +| Live in conversation history and consume tokens | Loaded on demand and save tokens | +| Cannot be shared across sessions | Can be shared within a team | +| Hard to version-control | Can be managed with Git | + +### Two types of Skills + +**Global Skills (personal)**: +- storage location: `~/.claude/skills/` +- scope: all projects +- suitable scenarios: general-purpose personal skills + +**Project Skills (team)**: +- storage location: `project-directory/.claude/skills/` +- scope: the current project +- suitable scenarios: team sharing and project-specific conventions + +### How Skills work + +When Claude Code starts, it will: + +1. scan the Skills directories +2. parse each `SKILL.md` file +3. extract YAML frontmatter metadata +4. add the skill content into its "knowledge base" +5. automatically match triggers based on the description + +--- + +## `SKILL.md` File Structure + +### Basic structure + +A complete Skill directory looks like this: + +```text +my-skill/ +├── SKILL.md # Required: skill definition file +├── scripts/ # Optional: helper scripts +├── templates/ # Optional: output templates +├── references/ # Optional: reference documents +└── examples/ # Optional: example files +``` + +### `SKILL.md` template + +The `SKILL.md` file has two parts: + +**Part 1: YAML Frontmatter (metadata)** + +```yaml +--- +name: skill-name # Skill name, becomes the /skill-name command +description: short description # Used for Claude's automatic trigger matching +category: development # Category +tags: # Tags + - code + - automation +--- +``` + +**Part 2: Markdown content (instructions)** + +```markdown +# Skill Title + +## Use cases +When to use this skill + +## Execution steps +1. Step one +2. 
Step two + +## Notes +- Note 1 +- Note 2 +``` + +### Explanation of key fields + +| Field | Required | Explanation | +|------|------|------| +| `name` | Yes | The skill name. Only lowercase letters, numbers, and hyphens are allowed | +| `description` | Yes | The skill description. The more specific it is, the easier it is for Claude to match automatically | +| `category` | No | Category label | +| `tags` | No | Additional category labels | +| `allowed-tools` | No | Tools that may be used without extra permission | + +--- + +## Skills vs. MCP: What is the difference? + +Many beginners confuse Skills and MCP, but they are completely different things. + +### Core differences + +| Dimension | Skills | MCP | +|------|--------|-----| +| **Nature** | Knowledge and workflow | Tools and interfaces | +| **What it provides** | Tells AI "how to do it" | Gives AI "what it can use" | +| **Storage location** | `skills/` directory | MCP server | +| **Configuration format** | Markdown files | JSON config files | +| **Trigger method** | `/skill-name` or automatic recognition | Automatically loaded through configuration | + +### An intuitive analogy + +If Claude were a "worker": + +- **MCP** would be the "tools" given to the worker, such as a wrench, a computer, and access permissions +- **Skills** would be the "operating manual" given to the worker, such as how to do code review or how to submit code + +### Their relationship + +Skills and MCP are not competing with each other. They are complementary: + +```text +User task -> Claude recognizes the requirement + ↓ + Load relevant Skills (know how to do it) + ↓ + Call tools through MCP (have tools available) + ↓ + Complete the task +``` + +### Example + +**Scenario: code review** + +- **Skills** define the review steps, checklist, and output format +- **MCP** provides the ability to access GitHub PRs and fetch code diffs + +Working together: Skills tell Claude "how to review," and MCP gives Claude "the ability to access the code." 
+ +### Recommendation for choosing + +| Your need | Recommended solution | +|----------|----------| +| Need to define a workflow | Use Skills | +| Need to access external data | Use MCP | +| Need both | Use them together | + +--- + +## Common Resources for Getting Skills + +### Official resources + +- [Anthropic official Skills repository](https://github.com/anthropics/skills) - an officially maintained collection of skills +- [Claude Code official docs - Skills](https://docs.anthropic.com/en/docs/claude-code/configuration/skills) - official documentation + +### GitHub community resources + +| Repository | Description | +|------|------| +| [shanraisshan/claude-code-best-practice](https://github.com/shanraisshan/claude-code-best-practice) | Maintained by Boris Cherny, head of Claude Code, including Skills, Agents, Hooks, and more | +| [affaan-m/everything-claude-code](https://github.com/affaan-m/everything-claude-code) | Comprehensive toolkit including preconfigured Skills | +| [JackyST0/awesome-agent-skills](https://github.com/JackyST0/awesome-agent-skills) | Curated Skills resource list | +| [jeffallan/claude-skills](https://github.com/jeffallan/claude-skills) | 66 professional skills and 300+ reference documents | +| [GitCode/awesome-claude-skills](https://gitcode.com/GitHub_Trending/aw/awesome-claude-skills) | Selected open-source collection | + +### How to install community Skills + +Using `find-skills`, you only need to tell Claude what you need, and it will automatically search and recommend: + +```text +Help me find a skill related to React performance optimization +``` + +Claude will search the skills.sh database through `find-skills`, then list the most relevant skills, and you can choose one to install. 
+ +**Search tips**: + +- use specific keywords: `"react testing"` is better than `"testing"` +- combine "domain + action": `"nextjs deploy"`, `"typescript lint"` +- prioritize skills with high install counts, since 10K+ usually means battle-tested +- watch the trending list to discover emerging skills + +--- + +## How to Create Your Own Skills + +There are two ways to create Skills: directly ask Claude to create one for you, or use the dedicated `skill-creator` tool. + +### Method 1: Directly ask Claude to help you create one + +This is the simplest approach. Just tell Claude your requirement in natural language. + +**Example**: + +```text +Please help me create a skill named "format-code" to automatically format code. + +Requirements: +1. Automatically detect the programming language +2. Apply the corresponding formatting rules +3. Return the diff before and after formatting +``` + +Claude will automatically: +1. create the directory structure +2. generate the `SKILL.md` file +3. fill in the YAML frontmatter +4. write the skill content + +**Suitable scenarios**: +- quickly creating simple skills +- you know what you want but are not familiar with the `SKILL.md` format +- you want to iterate and modify quickly + +### Method 2: Use `skill-creator` + +`skill-creator` is a dedicated tool for creating Skills. It guides you step by step through the process. + +**Install**: + +```bash +npx skills add anthropics/skills@skill-creator -g +``` + +Or install the entire official skills repository: + +```bash +npx skills add anthropics/skills -g +``` + +**Use**: + +```text +/skill-creator +``` + +Then fill in the prompts: +- skill name +- feature description +- usage scenarios +- execution steps + +`skill-creator` will: +1. guide you to clarify the purpose of the skill +2. generate a draft `SKILL.md` +3. create test cases +4. 
run evaluation and optimize it + +**Suitable scenarios**: +- creating complex skills +- needing a more standard creation process +- wanting to test and verify the skill + +### Comparison of the two methods + +| Method 1: Direct creation | Method 2: `skill-creator` | +|-----------------|---------------------| +| Fast and simple | Guided steps | +| Suitable for simple skills | Suitable for complex skills | +| Completed directly in conversation | Standardized process | +| Flexible modification | Includes testing and verification | + +### Tip: how to write a good requirement + +**A good requirement description**: + +```text +Create a "git-commit" skill that automatically commits code. + +Execution steps: +1. Check which files were modified +2. Generate a commit message that follows Conventional Commits +3. Run git commit +4. Ask whether to push + +Notes: +- Check for sensitive information before committing +- Do not commit directories like dist/ or node_modules/ +``` + +**A bad requirement description**: + +```text +Help me write a skill for committing code +``` + +That is too vague. Claude will not know exactly what it needs to do. + +--- + +## Common Skill Examples + +### Example 1: Code Review Skill + +Create the directory and file: + +```bash +mkdir -p ~/.claude/skills/review-pr +``` + +```bash +cat > ~/.claude/skills/review-pr/SKILL.md << 'EOF' +--- +name: review-pr +description: Review Pull Requests for code quality, security, and test coverage +--- + +You are a senior code reviewer. + +## Review workflow + +1. **Code style check** + - Does the code follow team conventions? + - Are names clear? + - Are comments sufficient? + +2. **Security check** + - Are there security vulnerabilities? + - Is sensitive information exposed? + - Is input validation complete? + +3. **Testing check** + - Are there enough tests? + - Do test cases cover edge conditions? + - Are the tests runnable? + +4. **Overall evaluation** + - What are the strengths? + - What needs improvement? 
+ - Do you recommend approving the merge? + +## Output format + +Please output the review results in a clear structure using a list format. +EOF +``` + +How to use it: + +```text +/review-pr +Please review the PR for the current branch +``` + +### Example 2: Git Auto-Commit Skill + +```bash +mkdir -p ~/.claude/skills/git-commit +``` + +```bash +cat > ~/.claude/skills/git-commit/SKILL.md << 'EOF' +--- +name: git-commit +description: Automatically detect changes, generate a commit message, and commit the code +--- + +You are a skilled Git user. + +## Execution workflow + +1. **Check changes** + Run `git status` to view modified files + Run `git diff` to view detailed changes + +2. **Generate commit message** + Analyze the nature of the changes + Generate a commit message that follows Conventional Commits + Format: `type(scope): description` + +3. **Security check** + Check whether there is sensitive information such as keys, passwords, or tokens + Check whether directories that should not be committed are included + +4. **Execute after confirmation** + Show the commit message for confirmation + Run `git add` and `git commit` + Ask whether a push is needed + +## Notes + +- Do not commit directories such as node_modules/, dist/, or .next/ +- Run tests before committing to ensure the code works +- The commit message should clearly explain the change +EOF +``` + +How to use it: + +```text +/git-commit +``` + +### Example 3: Test Generation Skill + +```bash +mkdir -p ~/.claude/skills/gen-test +``` + +```bash +cat > ~/.claude/skills/gen-test/SKILL.md << 'EOF' +--- +name: gen-test +description: Automatically generate unit tests for code to ensure correctness +--- + +You are a test engineer. + +## Workflow + +1. **Analyze the code** + - Understand the function or class + - Identify inputs and outputs + - Find edge cases + +2. **Generate tests** + - Use an appropriate test framework + - Cover normal cases + - Cover edge cases + - Cover exceptional cases + +3. 
**Validate the tests** + - Make sure the tests can run + - Make sure the tests can catch problems + - Do not over-mock the implementation + +## Test frameworks + +- JavaScript/TypeScript: Jest or Vitest +- Python: pytest +- Go: testing package + +## Output format + +Output the test code first, then explain how to run the tests. +EOF +``` + +How to use it: + +```text +/gen-test +Generate unit tests for src/utils.ts +``` + +### Example 4: Documentation Generation Skill + +```bash +mkdir -p ~/.claude/skills/gen-readme +``` + +```bash +cat > ~/.claude/skills/gen-readme/SKILL.md << 'EOF' +--- +name: gen-readme +description: Automatically generate a README document for a project +--- + +You are a technical documentation expert. + +## Workflow + +1. **Analyze the project** + - Scan the project directory structure + - Check package.json or other configuration files + - Read the existing code + +2. **Generate content** + - Project introduction + - Installation steps + - Usage instructions + - API documentation + - Development guide + +3. **Formatting** + - Use a clear section structure + - Add code examples + - Add appropriate badges + - Add license information + +## Standard README structure + +- Project title and introduction +- Features +- Installation +- Quick start +- Usage instructions +- API documentation +- Development guide +- Contribution guide +- License +EOF +``` + +How to use it: + +```text +/gen-readme +Generate a README document for the current project +``` + +--- + +## Advanced Tips + +### Combine Skills with Hooks + +Hooks can automatically perform actions on specific events. Combined with Skills, they enable more powerful automation. 
+ +For example, automatically format code after saving: + +```json +// .claude/hooks.json +{ + "hooks": { + "PostToolUse": [{ + "matcher": { + "tool_name": "Edit" + }, + "hook": { + "type": "command", + "command": "/format-code" // Call the format-code skill + } + }] + } +} +``` + +### Combine Skills with Commands + +Commands are simple shortcut commands. Skills are complex workflows. They can be used together. + +### Team collaboration + +**Share project Skills**: + +1. put the Skills under `.claude/skills/` +2. commit them to Git +3. team members can use them after cloning the project + +**Version control**: + +- Skills can be version-controlled just like code +- each commit can record changes to Skills +- you can roll back to older versions + +--- + +## Frequently Asked Questions + +### Q1: Why was the Skill not triggered? + +Possible reasons: +- YAML frontmatter format is wrong +- the description is not specific enough +- Claude Code was not restarted + +How to solve it: +- check whether the YAML format is correct +- improve the description and include specific usage scenarios +- restart Claude Code + +### Q2: How do I write an accurate description? + +A good description includes: +- the specific function of the skill +- the usage scenario, such as "when the user mentions..." +- trigger keywords + +**Bad example**: +```text +description: Review code +``` + +**Good example**: +```text +description: Review Pull Request code. Trigger when the user mentions PR, review, or code review. +``` + +### Q3: What is the difference between Skills and Commands? + +| Commands | Skills | +|----------|--------| +| Simple shortcut commands | Complete workflows | +| A single `.md` file | A directory structure (`SKILL.md` + optional files) | +| Manually triggered | Can be automatically triggered | +| Suitable for simple operations | Suitable for complex processes | + +### Q4: How do I debug a Skill? + +1. Use `/skills` to check whether the skill was recognized +2. 
Directly enter the skill name to trigger it manually +3. Check whether the `SKILL.md` content is correct +4. Review the Claude Code logs + +--- + +## References + +### Official resources + +- [Claude Code official docs - Skills](https://docs.anthropic.com/en/docs/claude-code/configuration/skills) +- [Agent Skills standard](https://agentskills.io/) +- [Anthropic engineering article (practical ideas behind Agent Skills)](https://www.anthropic.com/engineering/equipping-agents-for-the-real-world-with-agent-skills) +- [Anthropic official Skills GitHub repository](https://github.com/anthropics/skills) +- [VS Code Copilot Agent Skills documentation](https://code.visualstudio.com/docs/copilot/customization/agent-skills) + +### Resource directories + +- [skills.sh](https://skills.sh/) - Vercel's Agent Skills app store with a 48,000+ skill library +- [find-skills](https://github.com/vercel-labs/agent-skills) - intelligent skill search tool with 60K+ subscriptions +- [Skills marketplace (Chinese interface)](https://skillsmp.com/zh) - discover and install community Skills + +### GitHub community projects + +- [vercel-labs/agent-skills](https://github.com/vercel-labs/agent-skills) - Vercel Labs official Agent Skills collection, including find-skills +- [claude-code-best-practice](https://github.com/shanraisshan/claude-code-best-practice) - official best practices maintained by Boris Cherny +- [everything-claude-code](https://github.com/affaan-m/everything-claude-code) - comprehensive toolkit including preconfigured Skills +- [awesome-claude-skills](https://github.com/ComposioHQ/awesome-claude-skills) - curated list of selected Skills resources +- [superpowers](https://github.com/obra/superpowers) - collection of Skills for software development automation workflows +- [jeffallan/claude-skills](https://github.com/jeffallan/claude-skills) - 66 professional skills and 300+ reference documents +- [awesome-agent-skills](https://github.com/JackyST0/awesome-agent-skills) - curated 
resource list + +### Official Skill examples + +- [skill-creator](https://github.com/anthropics/skills/tree/main/skills/skill-creator) - a skill for creating new skills +- [mcp-builder](https://github.com/anthropics/skills/tree/main/skills/mcp-builder) - a skill for building MCP servers +- [slack-gif-creator](https://github.com/anthropics/skills/tree/main/skills/slack-gif-creator) - a skill for creating Slack GIFs + +### Chinese tutorials + +- [Complete guide to advanced Claude Code configuration and usage tips](https://blog.csdn.net/2601_95335870/article/details/158460599) +- [Vibe Coding - full-chain practice with CLAUDE.md, Skills, and Subagents](https://blog.csdn.net/yangshangwei/article/details/158319117) +- [A step-by-step guide to customizing Claude Code Skills](https://m.blog.csdn.net/u010028049/article/details/157979705) + +## Further Reading: The Internal Mechanism of Claude Skills + +Next, we will go deeper into how Claude Skills work internally, so you not only know how to use them, but also understand why they are designed this way. + +### First-principles view: prompt-based dynamic context injection + +First, understand one key fact: **Skills are not executable code**. + +Skills are essentially advanced instructions, or prompts, that are "injected" into Claude's context when needed. This design is called "**Prompt-based Dynamic Context Injection & Meta-Tool Architecture**." 
+ +```text +┌─────────────┐ ┌─────────────┐ ┌──────────────┐ +│ User Request│ ───> │ LLM Matches │ ───> │ Trigger Skill│ +└─────────────┘ │Description │ └──────────────┘ + └─────────────┘ │ + ▼ + ┌──────────────┐ + │ Inject Full │ + │ Instructions │ + └──────────────┘ + │ + ▼ + ┌──────────────┐ + │ Execute Task │ + └──────────────┘ +``` + +### Three-layer progressive loading architecture (token optimization) + +To handle a large number of Skills without consuming too many tokens, Claude uses a smart three-layer loading mechanism: + +| Layer | Content | When loaded | Token cost | +|------|------|----------|-----------| +| **Layer 1: Metadata** | YAML frontmatter (`name + description`) | When Claude starts | ~30-50 tokens/skill | +| **Layer 2: Instructions** | Full `SKILL.md` content | When the Skill is triggered | ~5,000 tokens | +| **Layer 3: Resources** | Scripts, templates, references | Accessed from the file system on demand | Not added to context | + +**Advantages of this design**: + +- Suppose you have 100 Skills. At startup, only about 3,000-5,000 tokens are consumed for metadata +- Only the triggered Skill loads its full content +- Resource files such as reference documents are never fully loaded into the context + +**Compared with no Skills**: + +```text +Without Skills: every conversation needs 50,000+ tokens to describe all capabilities +With Skills: startup ~100 tokens/skill + 5,000 tokens loaded on demand +Savings: on average 40,000+ tokens saved per conversation +``` + +### Dual context injection mechanism + +When a Skill is activated, the system makes two modifications at the same time: + +**1. Conversation context injection** + +```javascript +// What the user sees (visible message) +The "pdf" skill is loading + +// What the AI actually receives (hidden meta-message) +{ + isMeta: true, // marked as a meta-message, not shown in the UI + content: ` + # PDF Analysis Expert Instructions + + You are a professional PDF analysis expert. Workflow: + 1. 
Use pdftotext to extract text + 2. Analyze the document structure + 3. Generate a summary report + ... + ` // full SKILL.md content, possibly thousands of words +} +``` + +**2. Execution context modification** + +Besides injecting instructions, a Skill can also dynamically modify Claude's environment: + +| Modification type | Example | Explanation | +|---------|------|------| +| **Tool permissions** | `allowed-tools: "Bash(pdftotext:*)"` | Temporarily grant access to a specific tool | +| **Model switching** | Switch from Sonnet to Opus | Some complex tasks require stronger reasoning | +| **Context isolation** | Create a child session space | Avoid polluting the main conversation context | + +### A routing mechanism based entirely on LLM reasoning + +This is a very important design decision: **Claude Skills do not use hardcoded routing**. + +| Traditional approach | Claude Skills | +|---------|--------------| +| ❌ Embedding vector matching | ✅ Pure LLM reasoning | +| ❌ Classifier | ✅ Transformer forward pass | +| ❌ Regex or keyword matching | ✅ Natural language understanding | +| ❌ Separate routing algorithm | ✅ Unified model decision-making | + +**Workflow**: + +```text +1. The name and description of every Skill are formatted into the Skill tool description + +2. Claude receives: + - the user message + - the list of available tools, including the Skill meta-tool + - the Skill list, with name + description + +3. Claude's natural language understanding matches the user's intent to a Skill description + +4. 
When the match succeeds, it calls: command: "skill-name" +``` + +**Why design it this way?** + +**Hardcoded routing requires**: +- extra maintenance cost +- no ability to understand complex semantic relationships +- difficulty handling multiple languages +- no support for fuzzy matching + +**Pure LLM reasoning**: +- leverages Claude's own language understanding +- automatically handles multiple languages, synonyms, and fuzzy descriptions +- requires no extra maintenance +- makes routing decisions more intelligent + +### File parsing mechanism + +**`SKILL.md` file structure**: + +```bash +my-custom-skill/ +├── SKILL.md # Required: core definition file +├── config.json # Optional: metadata config +├── README.md # Recommended: usage documentation +├── scripts/ # Optional: executable scripts +├── templates/ # Optional: template folder +└── references/ # Optional: reference documents +``` + +**Parsing flow**: + +```text +┌─────────────────────────────────────────────────────────────┐ +│ Claude Code startup │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Scan ~/.claude/skills/ and .claude/skills/ directories │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Use the gray-matter library to parse each SKILL.md │ +│ YAML frontmatter │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Validate required fields (name and description) │ +│ - name: max 64 characters, only lowercase letters, │ +│ numbers, and hyphens │ +│ - description: used for LLM automatic matching │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Extract metadata and build the Skill list │ +│ (only load name + description, not the full body) │ 
+└─────────────────────────────────────────────────────────────┘ +``` + +### Example of the full execution flow + +Let's look at the entire flow through a concrete example: + +```text +User: "Help me analyze this PDF file" + +═══════════════════════════════════════════════════════════════ + +Step 1: LLM decision +──────────────── +Claude finds the description of the "pdf" skill in the Skill list: + description: "Analyze PDF document content, extract text, generate a summary" + +═══════════════════════════════════════════════════════════════ + +Step 2: System intervention +──────────────── +Claude Code executes: + 1. Read ~/.claude/skills/pdf/SKILL.md + 2. Generate a visible message: "The pdf skill is loading" + 3. Generate a hidden meta-message: the full SKILL.md content + 4. Modify session permissions: allowed-tools = ["Bash(pdftotext:*)"] + +═══════════════════════════════════════════════════════════════ + +Step 3: LLM execution +──────────────── +Now Claude's context contains: + - the original user request + - the PDF expert workflow instructions + - access permission to the pdftotext tool + +Claude executes: + 1. Use pdftotext to extract the PDF text + 2. Analyze the content structure + 3. Generate a summary report + 4. 
Present the result to the user + +═══════════════════════════════════════════════════════════════ + +Step 4: Dispose after use +──────────────── +After the task is completed, the full Skill content is removed from context +(only the conversation history remains, not the full Skill instruction) +``` + +### Core design innovations + +| Innovation | Traditional approach | Skills approach | Advantage | +|--------|---------|------------|------| +| **Source of capability** | Fixed in model weights | Dynamically loaded prompts | Extensible and updatable | +| **Token efficiency** | All capabilities always stay in memory | Load on demand | Save 80%+ tokens | +| **Knowledge management** | Scattered in conversation history | Modular file system | Version-controllable and shareable | +| **Lifecycle** | Continuously occupies space | Dispose after use | Cleaner context | + +### Academic foundations + +The design of Claude Skills draws on the following research: + +| Research field | Representative work | Applied here as | +|---------|---------|---------| +| **Reinforcement learning** | Voyager (2023) | The idea of accumulating a skill library | +| **Cognitive architecture** | ACT-R, Soar | Separation of procedural memory and declarative memory | +| **Hierarchical policy** | Options Framework | Three-layer progressive loading | + +**Core shift in thinking**: + +```text +Traditional: AI needs to remember everything + ↓ +Skills: AI knows where to find specialized knowledge + ↓ +Result: more like the thinking pattern of a human expert +``` + +### Relationship to the Agent Skills standard + +Claude Skills follows the [Agent Skills open standard](https://agentskills.io/), which means: + +- ✅ Cross-platform compatibility: tools such as Cursor, Windsurf, and Aider also support it +- ✅ Unified file format: standardized `SKILL.md` structure +- ✅ Interoperability: Skills can be shared across different tools + +```text +Agent Skills standard defines: +├── Required: SKILL.md file (metadata + 
instructions) +├── Optional: scripts/ (executable code) +├── Optional: references/ (knowledge base documents) +└── Optional: assets/ (templates and resources) +``` + +### Summary: why is this design brilliant? + +1. **Decouples capability from the model**: specialized knowledge no longer depends on model training and can be updated at any time through Markdown files + +2. **Extreme token efficiency**: the three-layer loading mechanism ensures only necessary content is loaded + +3. **Uses the LLM's own strengths**: routing and matching rely entirely on Claude's language understanding, with no extra algorithm required + +4. **Developer-friendly**: creating a Skill only requires writing Markdown, not programming + +5. **Composable**: Skills can reference and combine with each other to form complex workflows + +6. **Dispose after use**: automatically cleans up after completion and keeps context fresh + +--- + +### Summary + +Skills are the key to turning Claude Code from a "general assistant" into a "team expert." + +Through Skills, you can: +- standardize workflows +- reuse team knowledge +- improve collaboration efficiency +- reduce repeated explanation + +Remember: **if you find yourself repeating the same instruction twice, you should consider creating a Skill**. + +Now go create your first Skill. diff --git a/docs/en/stage-3/core-skills/spec-coding/index.md b/docs/en/stage-3/core-skills/spec-coding/index.md new file mode 100644 index 0000000..80cf13a --- /dev/null +++ b/docs/en/stage-3/core-skills/spec-coding/index.md @@ -0,0 +1,613 @@ +# From Vibe Coding to Spec Coding: The Evolution of AI Programming + +> "Code is a lossy projection of intent." +> Code is a lossy projection of intent. +> - Sean Grove, OpenAI, AI Engineer World's Fair 2025 + +## The Core Idea of Spec Coding: Everything Is Markdown + +Before going deeper into Spec Coding, first understand the underlying philosophy of Claude Code: **everything is Markdown**. 
+ +In Claude Code's design philosophy, process records, information transfer, and even conversations with the model can all be Markdown: + +- **CLAUDE.md**: a Markdown document for project conventions +- **.claude/rules/**: a collection of layered Markdown rule files +- **specs/**: Markdown descriptions of feature requirements +- **Conversation history**: Claude Code's chat records are themselves in Markdown format +- **AGENTS.md**: Markdown instructions that define agent behavior + +This is exactly the core of Spec Coding: **the specification itself is code**. When you write requirements, design decisions, and acceptance criteria in Markdown, you are already writing "code" - AI will read that Markdown and then generate the real implementation. + +Josh Beckman's summary of Grove's talk captures it perfectly: + +> "Software engineering (and lawmaking and legal review) is specification repair." +> Software engineering (and lawmaking and legal review) is specification repair. + +In Claude Code, this "specification repair" process is: **modify Markdown -> AI reads Markdown -> generate/modify code -> verify the result**. The entire workflow is Markdown-driven. + +--- + +## Introduction: Why "The New Code" Matters + +In 2025, OpenAI researcher **Sean Grove** gave a talk titled **"The New Code"** at AI Engineer World's Fair, and it shook the entire developer community. He proposed a disruptive idea: **for 70 years we have been writing code to solve problems, but code is only a lossy projection of intent - specifications are the real "new code."** + +That talk gave rise to a new development paradigm: **Spec Coding** - making specification documents, rather than code, the core artifact of development, and letting AI generate code from the specification.
+ +Starting from Grove's talk, this article will help you understand the core ideas of Spec Coding, review the limits of Vibe Coding, and show how to apply this methodology in real development with Claude Code. + +::: info 📚 What you will learn + +1. Understand the key ideas in Sean Grove's "The New Code" talk +2. Master the core concepts and methodology of Spec Coding +3. Recognize both the value and the ceiling of Vibe Coding +4. Learn how to practice a Spec Coding workflow in Claude Code +5. Master a gradual transition strategy from Vibe Coding to Spec Coding + +::: + +--- + +## 1. Sean Grove's "The New Code": A Talk That Changes How You Think + +In 2025, OpenAI researcher Sean Grove gave a talk titled **"The New Code"** at AI Engineer World's Fair. This talk is widely seen as the intellectual starting point of the Spec Coding movement. + +Grove previously founded OneGraph, a GraphQL developer tools company later acquired by Netlify, and now works on alignment reasoning at OpenAI - helping turn high-level intent into executable specifications and evaluation standards. + +### 1.1 Core Argument: Code Is a Lossy Projection of Intent + +The core concept of Grove's talk can be summarized in one sentence: + +> **Code is a lossy projection of intent.** +> Code is a lossy projection of intent. + +What does that mean? When you have an idea in your head and turn it into code, a huge amount of context gets lost along the way - **why** you chose this approach, **what trade-offs** you considered, and **which constraints** mattered. The final code only preserves "how to do it," while losing "why it should be done this way." + +It is like compressing a book into a tweet - the information density drops sharply, and the original intent is heavily degraded. + +### 1.2 The Essence of Programming Is Communication + +Grove proposed a simple but profound idea: + +> "If you can communicate effectively, you can program." +> If you can communicate effectively, you can program. 
+ +He argues that actual coding work only accounts for **10-20%** of development. The other 80-90% is **structured communication** around requirements and goals - understanding what users want, aligning with the team on solutions, defining acceptance criteria, and handling edge cases. + +That means the core of programming ability is not mastery of syntax in a particular language, but the ability to **turn vague intent into precise descriptions**. + +### 1.3 Whoever Writes the Spec Is the Programmer + +This is Grove's most disruptive idea: + +> "Whoever writes the spec - be it a PM, a lawmaker, an engineer, a marketer - is now the programmer." +> Whoever writes the spec - be it a PM, a lawmaker, an engineer, a marketer - is now the programmer. + +As AI becomes increasingly good at turning specifications into code, the **real programming work** shifts from "writing code" to "writing specifications." Whoever can express intent most precisely becomes the most valuable "programmer." + +### 1.4 Specifications Can Have a Code-Like Toolchain + +Grove pointed out that specifications can have a complete toolchain just like code: + +> "Specs actually give us a very similar toolchain, but it's targeted at intentions rather than syntax." + +- **Composition**: specifications can be modular and composable, like code modules +- **Testing**: specifications can embed unit tests to verify that behavior matches expectations +- **Linting**: ambiguous language in specifications can be detected, just like a linter catches syntax issues +- **Consistency checks**: specifications across departments can be checked for consistency, similar to a type checker + +### 1.5 OpenAI Model Spec: Living Proof + +Grove used OpenAI's own **Model Spec** document as evidence. + +When OpenAI discovered a sycophancy problem, they did not retrain the model. Instead, they **modified the specification document**. The change propagated automatically across the system, and the issue was corrected.
+ +This proves a crucial point: **the specification itself can act like executable code**. Changing the specification is equivalent to changing behavior, without touching a single line of traditional code. + +Josh Beckman's summary of Grove's talk captures it perfectly: + +> "Software engineering (and lawmaking and legal review) is specification repair." +> Software engineering (and lawmaking and legal review) is specification repair. + +--- + +## 2. Spec Coding: Specification as Code + +### 2.1 What Is Spec Coding + +Spec Coding, also called Spec-Driven Development (SDD), is a methodology that treats **specification documents as the core artifact of development**. + +The core idea is: **write the specification clearly first, then let AI generate code from that specification. The specification is the source of truth, and code is only the implementation artifact derived from it.** + +Robert C. Martin's classic statement from *Clean Code* becomes newly relevant in the AI era: + +> "Specifying requirements so precisely that a machine can execute them is programming." +> Specifying requirements so precisely that a machine can execute them is programming. 
+ +### 2.2 Comparing Vibe Coding and Spec Coding + +| Dimension | Vibe Coding | Spec Coding | +|------|------------|-------------| +| **Approach** | Improvised prompts, iterative back-and-forth | Write a complete specification first, then generate code | +| **Best for** | Prototypes, hackathons, exploration | Production systems, team collaboration, enterprise work | +| **Code quality** | Fast but fragile | Structured, testable, auditable | +| **First-pass success rate** | Unstable | Targets 95%+ | +| **Reusability** | One-off prompts | Specifications can be reused across projects | +| **Security** | Easy to overlook things | Built in at the specification layer | +| **Documentation** | Missing or always lagging behind | The specification is the documentation and stays maintained | +| **Team collaboration** | Depends on personal prompting skill | Shared specifications, shared standards | + +The two are not opposites. As Brad Jolicoeur points out: + +> "Clever engineers will even use vibe coding as a first step to generate the initial draft of a specification." +> Clever engineers will even use vibe coding as a first step to generate the initial draft of a specification. 
+ +### 2.3 The Three-Layer Specification Structure of Spec Coding + +Engineers at Red Hat summarized a practical three-layer specification model: + +**Layer 1: Functional Specification (What)** + +Describe the expected result in natural language and answer "what should it do": + +```markdown +## User Authentication Feature + +### User Stories +- As a new user, I want to register with my email +- As a registered user, I want to log in with email and password +- As a user who forgot my password, I want to reset it by email + +### Acceptance Criteria +- Validate email format and password strength during registration +- Lock the account for 15 minutes after 5 failed login attempts +- Password reset links are valid for 30 minutes +``` + +**Layer 2: Language-Agnostic Specification (How - Architecture Layer)** + +Define data structures, architectural patterns, and security requirements: + +```markdown +## Technical Design + +### Data Model +- users table: id, email, password_hash, created_at, locked_until +- sessions table: id, user_id, token, expires_at + +### API Design +- POST /api/auth/register -> 201 Created +- POST /api/auth/login -> 200 OK + JWT +- POST /api/auth/reset-password -> 202 Accepted + +### Security Requirements +- Passwords use bcrypt with cost factor >= 12 +- JWT expires in 15 minutes, refresh token in 7 days +- Enable rate limiting on all endpoints +``` + +**Layer 3: Language-Specific Specification (How - Implementation Layer)** + +Version requirements, test framework, and documentation standards: + +```markdown +## Implementation Constraints + +### Tech Stack +- Runtime: Node.js 20+ +- Framework: Express 5 +- ORM: Prisma +- Testing: Vitest + +### Code Conventions +- Use TypeScript strict mode +- Use a custom AppError class for error handling +- All API endpoints require JSDoc comments +``` + +--- + +## 3. Practicing Spec Coding in Claude Code + +Once you understand the theory, the next question is how to apply it in Claude Code. 
Claude Code's design philosophy naturally fits Spec Coding - its `CLAUDE.md`, Rules directory, and `/plan` command are all forms of specification-driven development. + +When OpenAI itself builds projects with Codex, it uses a similar pattern: using an `AGENTS.md` file as a specification to guide the AI agent. Their core lesson is this: **when the agent struggles, treat that as a signal - identify what is missing, whether it is tools, guardrails, or documentation, and then add it to the repository**. That aligns perfectly with Spec Coding: specifications are living artifacts and should keep evolving. + +Research from Augment Code supports the same conclusion: **executable specifications stay accurate because AI agents generate code directly from them, creating a forcing function - outdated specifications produce broken implementations**. That means specifications do not rot the way traditional documentation does. + +### 3.1 Step One: Use `CLAUDE.md` to Establish Project Specifications + +`CLAUDE.md` is the "living specification" of your project. Every time Claude Code starts, it reads this file, which is equivalent to giving AI a persistent project handbook. + +In the earlier chapter [Claude Code Quick Start Core Guide](../basics/), we already learned how to create `CLAUDE.md`. In the context of Spec Coding, its role becomes even more important - **it is not just a config file, but the entry point to the project specification**. + +Engineers at LogRocket emphasize that **solid context is crucial for AI agents because it prevents hallucinations and inefficiency**. Without specifications, an AI agent may make large, uncontrolled changes to a project. `CLAUDE.md` is the first line of defense that provides that "solid context." + +```markdown +# E-commerce Project Specification + +## Project Positioning +A SaaS e-commerce platform for small and medium-sized merchants, supporting multiple stores and multiple payment channels. 
+ +## Architectural Decisions +- Frontend-backend separation with an API-first design +- Microservice backend architecture, with services communicating through a message queue +- Read-write database separation + +## Core Constraints +- Store all monetary amounts as integers in cents to avoid floating-point precision issues +- The order state machine must strictly follow: pending payment -> paid -> shipped -> completed +- Payment-related endpoints must be idempotent +``` + +Aviator's team summarized the key information that specifications should capture - and that is exactly what your `CLAUDE.md` should cover: + +- input and output formats and data types +- business rules and edge cases +- system dependencies and constraints +- performance and scalability requirements +- error handling and security requirements + +### 3.2 Step Two: Use the Rules Directory to Manage Layered Specifications + +As your project grows, a single `CLAUDE.md` will not be enough. At that point, use the `.claude/rules/` directory to organize layered specifications. + +This is exactly what Augment Code calls the idea of "executable specifications": **specifications are not static documents, but living instructions consumed directly by AI agents**. When you split rules into the Rules directory, each rule file is loaded only when related files are being edited, which both saves tokens and preserves precision. + +Engineers at Tessl found that breaking requirements into structured documents - with a PRD defining "what and why," and technical specifications defining "how" - helps prevent AI from accumulating confusion in long conversations and significantly improves output consistency. 
+ +```text +.claude/rules/ +├── 00-architecture.md # Architecture rules (global) +├── 01-security.md # Security rules (global) +├── 10-api-design.md # API design rules +├── 11-frontend-patterns.md # Frontend pattern rules +├── 12-database.md # Database rules +└── 20-testing.md # Testing rules +``` + +Each rule file can specify its scope through frontmatter: + +```markdown +--- +globs: + - "src/api/**/*.ts" + - "src/services/**/*.ts" +--- + +# API Design Rules + +## Route Design +- RESTful style, use plural nouns: /api/v1/orders +- Nested resources can go at most two levels deep: /api/v1/users/123/orders + +## Response Format +- Success: { data, pagination? } +- Error: { error: { code, message, details? } } + +## Must Follow +- All write operations require authentication +- All list endpoints must support pagination +- Sensitive operations must write audit logs +``` + +That way, when Claude Code edits API-related files, it will automatically load this specification and make sure the generated code follows the standard. + +### 3.3 Step Three: Use `/plan` to Implement Specify -> Plan -> Tasks -> Implement + +The standard Spec Coding workflow is a four-stage loop. GitHub Spec Kit standardizes it as Specify -> Plan -> Tasks -> Implement, and Claude Code's `/plan` command naturally supports this flow. + +The SpecThis team emphasized one key principle: **define boundaries before the agent runs - know what should change before any code changes happen**. That is exactly the value of `/plan`. + +**Stage 1: Specify** + +First write clearly what you want to build. Do not rush into code: + +```text +/plan +I need to implement an order refund feature. 
The specification is: + +Functional requirements: +- Users can request a full refund before shipment +- Within 7 days after shipment, users can request a return and refund +- Refunds require administrator approval + +Acceptance criteria: +- The refund amount cannot exceed the amount actually paid for the order +- Refund state machine: requested -> approved -> refunding -> refunded +- Inventory is restored after the refund is completed +- Log every operation throughout the process +``` + +**Stage 2: Plan** + +Claude will generate a technical plan based on your specification: + +```text +📋 Refund Feature Implementation Plan + +1. Data model design + - Create a refunds table + - Add refund-related states to the order state machine + +2. API design + - POST /api/orders/:id/refund - request a refund + - PUT /api/refunds/:id/approve - approve a refund + - GET /api/refunds - refund list + +3. Business logic + - Refund eligibility checks + - Refund amount calculation + - Inventory restoration logic + +4. Integrations + - Connect to the payment provider's refund API + - Send refund notifications +``` + +**Stage 3: Tasks** + +Break the plan into small tasks that can be executed independently, and give each task a clear completion standard. + +**Stage 4: Implement** + +Implement one task at a time, validating after each one is completed. + +### 3.4 Real Example: Building a User Notification System with Spec Coding + +Let's use a full example to compare Vibe Coding and Spec Coding. Data from Orchestrator.dev shows that in the 2025 Stack Overflow survey, 84% of developers use or plan to use AI tools, but only 22% are satisfied with the results, and 46% believe accuracy is a problem. Spec Coding is exactly the key to closing that satisfaction gap. 
+ +**Vibe Coding approach:** + +```text +You: Build a notification feature +AI: [Immediately starts writing code and generates a simple notification list] + +You: It should support read and unread +AI: [Modifies the code and adds a read field] + +You: It also needs multiple notification types +AI: [Changes it again and adds a type field] + +You: It should push notifications to phones too +AI: [Makes a big rewrite, and the previous structure no longer fits very well...] +``` + +Result: after four rounds of changes, the architecture has been overturned again and again, and the code gets messier over time. + +**Spec Coding approach:** + +First write a specification document `specs/notification.md`: + +```markdown +# User Notification System Specification + +## Functional Requirements +1. Support three channels: in-app notifications, email notifications, and push notifications +2. Notification types: system announcements, order status, promotional campaigns, security alerts +3. Users can configure notification preferences by channel and type +4. Support read/unread state and bulk mark-as-read + +## Data Model +- notifications table: id, user_id, type, channel, title, content, + is_read, created_at +- notification_preferences table: user_id, type, channel, enabled + +## API Design +- GET /api/notifications?type=&is_read= - get notification list (paginated) +- PUT /api/notifications/:id/read - mark as read +- PUT /api/notifications/read-all - mark all as read +- GET /api/notification-preferences - get preference settings +- PUT /api/notification-preferences - update preference settings + +## Acceptance Criteria +- The unread notification count updates in real time +- The notification list supports infinite scrolling +- Push notification latency < 3 seconds +- Preference changes take effect immediately +``` + +Then in Claude Code: + +```text +@specs/notification.md +Implement the user notification system according to this specification. 
+Start with the data model, then implement the API, and finally build the frontend components. +Pause after each module is complete, and I will confirm before you continue. +``` + +Result: it lands cleanly in one go, with a clear architecture and no need to repeatedly tear things down and rebuild them. + +### 3.5 Strengthening Spec Coding with Superpowers + +In the earlier chapter [Superpowers for Engineering-Grade Development](../superpowers/), we learned about the Superpowers skill system. Spec Coding and Superpowers are natural companions: + +| Spec Coding Stage | Matching Superpowers Skill | +|------------------|---------------------| +| Define the specification | `brainstorming` - use Socratic questioning to clarify requirements | +| Technical planning | `writing-plans` - break the specification into small tasks | +| Incremental implementation | `test-driven-development` - TDD red-green-refactor | +| Quality verification | `code-review` + `verification-before-completion` | + +**Example of combined usage:** + +```text +@specs/notification.md +Implement the notification system according to this specification using TDD, +and help me review the code after it is done +``` + +This single instruction activates both the Spec Coding workflow and Superpowers skills like TDD and Code Review, forming a complete engineering-grade development process. + +### 3.6 Version Control and Continuous Evolution of Specifications + +The Vibe Coding Substack proposed an important viewpoint: **Specs are now code**. 
If specifications are code, then they should be managed like code: + +- **Version control**: keep specification files in Git and commit them together with the code +- **Change tracking**: every change to the specification has a commit record so you know who changed what and why +- **Code review**: changes to specifications should also go through PR review so the team stays aligned +- **CI integration**: specification changes trigger automated tests to verify whether the implementation still conforms to the specification + +In Claude Code, that means your `CLAUDE.md`, `.claude/rules/`, and `specs/` directory should all be version-controlled. Robomotion's experience is that **versioning specifications together with implementations prevents drift and keeps everything auditable**. + +OpenAI's Harness Engineering practice also confirms this: their `AGENTS.md` file is itself written by Codex and is continuously updated as the project evolves. When the agent encounters difficulties, the fix is not to change the code directly, but to **have Codex update the specification itself** - forming a self-healing loop for specifications. + +--- + +## 4. A Hybrid Strategy: Gradually Moving from Vibe to Spec + +The industry consensus is not "abandon Vibe Coding," but rather **choose the right approach for the right scenario**. + +### 4.1 When to Use Vibe Coding + +- Validate whether an idea is feasible, with a prototype built within 30 minutes +- Explore unfamiliar technologies or frameworks +- Hackathons or internal demos +- One-off scripts or tools + +### 4.2 When to Use Spec Coding + +- Production feature development +- Multi-person collaborative projects +- Code that will need long-term maintenance +- Sensitive domains such as security, payments, or data +- API design and system integration + +### 4.3 A Recommended Gradual Workflow + +**Stage 1: Vibe Exploration** + +Use Vibe Coding to validate the idea quickly. 
Do not write specifications yet, and do not worry about code quality: + +```text +Build a simple notification popup so we can see how it feels +``` + +**Stage 2: Refine the Specification** + +Once feasibility is confirmed, organize what you learned during exploration into a specification. You can even ask AI to help: + +```text +Based on the notification feature prototype we just built, +help me organize a formal functional specification document, +including the data model, API design, and acceptance criteria +``` + +**Stage 3: Rebuild with Spec** + +Based on that specification, re-implement the production-grade version using Spec Coding: + +```text +@specs/notification.md +Implement this from scratch according to the specification, and do not refer to the previous prototype code +``` + +The advantage of this workflow is clear: **use the speed of Vibe Coding to validate direction, and the quality of Spec Coding to deliver the product**. + +Robomotion summarized it well: + +> "The spec is the source of truth. The AI generated output is the draft implementation. Validation is not optional." +> The spec is the source of truth. The AI generated output is the draft implementation. Validation is not optional. + +--- + +## 5. Frequently Asked Questions + +### Q1: Doesn't Spec Coding feel too slow? + +Writing specifications does require up-front investment. But Greg Ceccarelli's team used Spec Coding to deliver a complete macOS product with **three people in four weeks** - something that would be nearly impossible in traditional development. + +The time spent writing specifications early will be recovered later through less rework, fewer bugs, and lower communication cost. + +### Q2: How detailed should a specification be? + +Robomotion's suggestion is: **a high-quality specification can be only one page**. What matters is whether it answers these eight questions: + +1. What are we automating? +2. What is the input? +3. What is the output? +4. What are the constraints? +5. 
What are the failure modes? +6. What are the security requirements? +7. What are the performance requirements? +8. What tests prove that it works? + +### Q3: What if AI only does exactly what the specification says and misses "obvious" features? + +This really is one limitation of Spec Coding. Feedback from GitHub Spec Kit users is that AI will do **"exactly and only"** what is written in the specification. + +The solution is to add a "non-functional requirements" section to the specification and list common expectations there, such as error handling, logging, and accessibility. Or set global rules in `CLAUDE.md`. + +### Q4: Do small projects also need Spec Coding? + +No. Spec Coding is best suited to: + +- production-grade projects +- collaborative team projects +- projects that need long-term maintenance + +For quick prototypes, one-off scripts, and learning experiments, Vibe Coding is more suitable. + +### Q5: How do you get a team to accept Spec Coding? + +Start with a small feature as a pilot. Let the team see how Spec Coding reduces rework and improves first-pass success. The Stack Overflow 2025 survey shows that 84% of developers use or plan to use AI tools, but only 22% are satisfied with the results - Spec Coding is exactly the key to improving that satisfaction. + +--- + +## 6. Summary + +Moving from Vibe Coding to Spec Coding is not a revolution. It is an evolution. + +Sean Grove made it very clear in "The New Code": **for 70 years, we have been writing code to solve problems; now we should be writing specifications to generate code**. Code is a lossy projection of intent, while specifications can fully capture intent, context, and constraints. 
+ +For developers using Claude Code, this shift is already happening: + +- the `CLAUDE.md` you write is your project specification +- the Rules directory you configure is your layered specification system +- the planning you do with `/plan` is the Specify -> Plan -> Tasks flow +- combining TDD and Code Review from Superpowers gives you a complete Spec Coding workflow + +**Key takeaways:** + +- Vibe Coding is suitable for exploration and prototypes, while Spec Coding is suitable for production and collaboration +- The specification is the source of truth, and code is an implementation artifact produced from it +- The ability to write specifications = programming ability, and communication ability matters more than syntax ability +- Start small: just by writing `CLAUDE.md` well, you have already taken the first step into Spec Coding + +::: tip 💡 Next step +In the next chapter, we will learn how to use Claude Code's Agent Teams capability so multiple AI instances can collaborate like a real development team. +::: + +--- + +## References + +### Related to Sean Grove's "The New Code" Talk + +- [Code is just a lossy projection of intent — The Decoder](https://the-decoder.com/code-is-just-a-lossy-projection-of-intent-according-to-openai-researcher-sean-grove/) +- [The End of Coding? 
How Specifications Are Becoming the New Source Code — Implicator](https://www.implicator.ai/the-end-of-coding-how-specifications-are-becoming-the-new-source-code/) +- [OpenAI: Intent, Not Code, Drives Future Software Development — AI Tech Suite](https://www.aitechsuite.com/ai-news/openai-intent-not-code-drives-future-software-development) +- [Note on The New Code — Josh Beckman](https://www.joshbeckman.org/notes/914234100) +- [Full Transcript of "The New Code"](https://lawwu.github.io/transcripts/8rABwKRsec4.html) + +### Spec Coding Methodology + +- [How spec-driven development improves AI coding quality — Red Hat](https://developers.redhat.com/articles/2025/10/22/how-spec-driven-development-improves-ai-coding-quality) +- [Spec-Driven Development with AI: Complete 2025 Guide — Dplooy](https://www.dplooy.com/blog/spec-driven-development-with-ai-complete-2025-guide) +- [Spec-Driven Development: Building Production-Ready Software with AI — Orchestrator.dev](https://orchestrator.dev/blog/2025-12-16-spec_driven_dev_article) +- [Agents Code but the Problem of Clear Specification Remains — Greg Ceccarelli](https://www.gregceccarelli.com/writing/beyond-code-centric) + +### Vibe Coding vs. Spec Coding + +- [Vibe Coding vs Spec Driven — Cosmo Edge](https://cosmo-edge.com/vibe-coding-vs-spec-driven-ai-development/) +- [Master AI in Software Engineering: Vibe vs. 
Spec Coding — Brad Jolicoeur](https://bradjolicoeur.com/article/ai-software-engineering-vibe-spec-prompting) +- [From Vibe Coding to Spec-Driven Development — Tessl](https://tessl.io/blog/from-vibe-coding-to-spec-driven-development/) +- [Spec First Approach for Enterprise — Robomotion](https://robomotion.io/blog/spec-first-approach-the-way-to-adapt-vibe-coding-for-enterprise-work) + +### Tools and Practices + +- [GitHub Spec Kit vs Vibe Coding — Ossels](https://ossels.ai/github-spec-kit-spec-driven-development/) +- [A Spec-First Workflow for Agentic AI — LogRocket](https://blog.logrocket.com/spec-first-workflow-agentic-ai/) +- [Specs Are Now Code — The Vibe Coding Substack](https://thevibecoding.substack.com/p/specs-are-now-code) +- [Harness Engineering — Martin Fowler](https://martinfowler.com/articles/exploring-gen-ai/harness-engineering.html) +- [Spec-Driven Development & AI Agents Explained — Augment Code](https://www.augmentcode.com/guides/spec-driven-development-ai-agents-explained) +- [Spec-Driven Development: The Key to Scalable AI Agents — Aviator](https://www.aviator.co/blog/spec-driven-development/) diff --git a/docs/en/stage-3/core-skills/superpowers/index.md b/docs/en/stage-3/core-skills/superpowers/index.md new file mode 100644 index 0000000..09860b7 --- /dev/null +++ b/docs/en/stage-3/core-skills/superpowers/index.md @@ -0,0 +1,633 @@ +# Claude Code Superpowers for Engineering-Grade Development + +## Introduction to Superpowers + +**Superpowers** is an open-source agent skills framework created by Jesse Vincent (online handle: obra), specifically designed to solve a core problem in AI programming: how to make AI produce "engineering-grade" code instead of "toy-grade" code. + +Imagine a normal AI coding assistant as a "smart intern." It can write runnable code, but it may have no tests, no documentation, and no best-practice discipline. 
Superpowers is like assigning a "senior engineer mentor" to that intern, forcing it to follow a complete software development process. + +### Why Do We Need Superpowers? + +Before Superpowers, there were several issues when using Claude Code: + +- **Chaos in vibe coding**: AI starts coding directly without planning, causing frequent rework +- **Lack of TDD discipline**: AI tends to write code first and add tests later, or skip tests entirely +- **Coding with vague requirements**: user says "build a login feature," AI starts immediately, and the result is not what was wanted +- **Unstable code quality**: no code-review mechanism, so quality depends on AI "mood" + +Superpowers solves these issues and turns Claude into a "disciplined development team." It helps clarify requirements first, then creates a plan, then develops with TDD, and finally ensures quality through code review. + +--- + +## Quick Start + +### Step 1: Install Superpowers + +Run in Claude Code: + +```bash +# Add marketplace +/plugin marketplace add obra/superpowers-marketplace + +# Install superpowers +/plugin install superpowers@superpowers-marketplace +``` + +Or clone manually: + +```bash +git clone https://github.com/obra/superpowers.git ~/.claude/skills/superpowers +``` + +### Step 2: Try Your First Skill + +Let's use Superpowers' **brainstorming** skill to experience its value. + +In Claude Code, type: + +```text +Build me a user login feature +``` + +**Before Superpowers**: Claude starts writing code directly and may produce something you do not really want. + +**With Superpowers**: Claude uses Socratic questions to help clarify requirements: + +> Is this login feature for a Web app or a mobile app? +> +> Which login methods are required? Email/password? Third-party login (Google, GitHub)? +> +> Do you need a "remember me" feature? +> +> Should password reset be via email or SMS? +> +> ... 
+ +These questions force you to clarify what you actually need before coding, preventing a lot of unused code. + +### Step 3: Understand Skill Trigger Mechanisms + +Superpowers is not a "magic switch." It is a **set of skills**. Understanding how skills are triggered is important. + +**Three trigger methods:** + +1. **Keyword trigger** + - When you mention "TDD," "test-driven development," or "write tests first" + - The `test-driven-development` skill is activated + +2. **Scenario trigger** + - When requirements are unclear, `brainstorming` asks proactive questions + - When bugs appear, `systematic-debugging` is activated + +3. **Manual invocation** + - Use skill names directly, such as: `/test-driven-development` + +#### 💡 Important Clarification: What Happens If You Do Not Specify TDD? + +This is a common misunderstanding. Let's clarify: + +```text +# Case A: TDD not mentioned +"Implement a calculator" +-> Claude may write tests, or may not +-> Depends on the model's own habits + +# Case B: TDD explicitly requested +"Implement a calculator with TDD" +-> test-driven-development skill is activated +-> RED-GREEN-REFACTOR is enforced +``` + +**The real value of Superpowers**: not creating abilities from nothing, but strengthening discipline. + +- Without the TDD skill: Claude writing tests is "maybe" +- With the TDD skill: Claude is forced to follow TDD flow + +### Understanding the Value of Superpowers + +From the explanation above, the core value of Superpowers is clear: + +1. **Requirements first**: `brainstorming` asks actively when requirements are vague +2. **Process discipline**: `test-driven-development` enforces the TDD red-green-refactor cycle +3. **Task decomposition**: `writing-plans` breaks large projects into small tasks +4. **Quality control**: `code-review` skills ensure code quality + +--- + +## Superpowers Core Skills in Detail + +Superpowers includes **20+ composable skills** covering the full software lifecycle. Let's go through them by category. 
+ +### 🧪 Testing Skills + +#### test-driven-development + +**How to trigger**: mention keywords like "TDD," "test-driven development," or "write tests first." + +**What this skill does**: forces Claude to follow the TDD red-green-refactor cycle instead of "maybe writing tests later." + +**Traditional approach** (common problems): +1. Write code directly +2. Do a quick manual test +3. Find bugs and patch code +4. Repeat... (tests? maybe next time) + +**TDD approach** (after skill activation): +1. 🔴 **RED**: write a failing test first +2. 🟢 **GREEN**: write the minimal code to pass the test +3. 🔵 **REFACTOR**: refactor while keeping tests passing +4. Repeat + +**Example usage**: + +```text +Implement a user authentication module using TDD +``` + +Claude will: +1. Write tests first (username/password validation, token generation, etc.) +2. Run tests and confirm all fail (RED) +3. Write minimal implementation code +4. Run tests and confirm pass (GREEN) +5. Refactor code and extract shared logic +6. Run tests again and confirm pass (REFACTOR) + +> **Note**: if you do not mention "TDD," Claude may or may not write tests. The skill's role is to **enforce process discipline** so tests are not "forgotten." + +### 🐛 Debugging Skills + +#### systematic-debugging + +When bugs appear, human developers often try random fixes. Superpowers enforces four-stage root-cause analysis: + +**Stage 1: Reproduce the issue** +- Confirm the bug can be reproduced consistently +- Record exact reproduction steps + +**Stage 2: Isolate root cause** +- Narrow the scope using binary search style reduction +- Add logs to locate problematic code + +**Stage 3: Validate hypotheses** +- Propose root-cause hypotheses +- Design validation experiments + +**Stage 4: Fix and verify** +- Implement the fix +- Confirm the bug is resolved +- Add regression tests + +#### verification-before-completion + +This skill prevents Claude from stopping when things only "seem close enough." 
Before claiming completion, it requires Claude to: + +1. Run all tests +2. Manually test key features +3. Check code quality (lint) +4. Confirm documentation is updated + +### 🤝 Collaboration Skills + +#### brainstorming + +This is one of the most interesting Superpowers skills. It uses Socratic questioning to clarify requirements. + +**How it works**: when you provide a vague requirement, Claude does not start coding directly. It asks questions: + +```text +You: Build a blog system + +Claude: +- Who is this blog for? Technical readers or general readers? +- Should Markdown editing be supported? +- Do you need comments? +- Do you need search? +- Single-user or multi-author? +- ... +``` + +These questions force you to think about what functionality is truly needed, avoiding lots of unused features. + +#### writing-plans + +This skill decomposes large tasks into small tasks that can each be completed in 2-5 minutes. + +**Example**: + +```text +Use writing-plans to plan development of a todo API +``` + +Claude will generate a detailed plan: + +```markdown +# Implementation Plan + +## Task 1: Design database schema (estimated 5 minutes) +- Create todos table +- Define fields: id, title, completed, createdAt + +## Task 2: Create Express routes (estimated 10 minutes) +- POST /todos - create task +- GET /todos - list tasks +- GET /todos/:id - get one task +- PUT /todos/:id - update +- DELETE /todos/:id - delete + +## Task 3: Add input validation (estimated 10 minutes) +- title cannot be empty +- completed must be boolean + +## Task 4: Write tests (estimated 15 minutes) +- Write tests for each endpoint +- Cover edge cases + +## Task 5: Start server and verify (estimated 5 minutes) +- Run tests +- Manually test API + +Acceptance criteria: +- All tests pass +- curl test passes for every endpoint +``` + +#### executing-plans + +This skill executes a plan in batches and pauses at each checkpoint for confirmation. 
+ +**Usage example**: + +```text +Execute the plan above, and pause after each completed task +``` + +Claude will: +1. Finish task 1, then pause: `✅ Database schema done. Continue?` +2. After your confirmation, finish task 2 and pause again +3. And so on + +This lets you verify direction at every stage, avoiding late discovery that things drifted off track. + +#### dispatching-parallel-agents + +This skill can launch multiple sub-agents in parallel. + +**Use case**: when you need to process multiple independent tasks simultaneously. + +```text +Use parallel agents to complete: +- Agent A: write backend APIs +- Agent B: write frontend components +- Agent C: write tests +``` + +Each agent works in its own isolated environment without interference. + +#### subagent-driven-development + +This skill launches an independent sub-agent for each small task. + +**Advantages**: +- Each sub-agent has an isolated context +- Failure of one task does not affect others +- Multiple tasks can run in parallel + +#### using-git-worktrees + +This skill uses Git worktree to create isolated development environments. + +**Benefits**: +- Multiple features can be developed in parallel +- Each worktree is independent +- No mutual conflicts + +### 👀 Code Review Skills + +#### requesting-code-review + +After code is completed, this skill automatically requests code review. + +```text +Automatically trigger code review after finishing the feature +``` + +#### receiving-code-review + +This skill defines how to receive and process review feedback. + +**Review workflow**: +1. Submit code +2. Trigger review automatically +3. Reviewer checks code quality, security, and test coverage +4. Reviewer provides improvement suggestions +5. Fix issues +6. Re-review until approval + +--- + +## Full Superpowers Workflow + +The real power of Superpowers comes from combining multiple skills into a full development workflow. + +### Standard Development Workflow + +```text +1. 
Brainstorming + ↓ Clarify real requirements through Q&A + +2. Design Document + ↓ Present design in chunks and wait for confirmation + +3. Writing Plans + ↓ Break down into 2-5 minute tasks + +4. Subagent Development + ↓ Launch isolated sub-agent per task + ↓ Two-stage review: spec conformance + code quality + +5. TDD + ↓ Enforce RED-GREEN-REFACTOR + +6. Code Review + ↓ Quality gate + +7. Done +``` + +### Practical Example: Build a User Authentication System with Superpowers + +Let's experience the Superpowers workflow through a complete example. + +#### Step 1: Clarify Requirements with Brainstorming + +```text +I need a user authentication system +``` + +Claude will ask: +- Is it a Web app or mobile app? +- Which login methods are required? +- Is password reset needed? +- Is "remember me" needed? +- ... + +#### Step 2: Generate a Design Document + +After requirements are confirmed, Claude will produce a design document: + +```markdown +# User Authentication System Design + +## Functional Requirements +1. User registration (email + password) +2. User login +3. Password reset (email) +4. Remember me (JWT + Refresh Token) + +## Technology Choices +- Backend: Node.js + Express +- Database: SQLite +- Auth: JWT +- Encryption: bcrypt + +## API Design +- POST /api/auth/register +- POST /api/auth/login +- POST /api/auth/refresh +- POST /api/auth/reset-password +``` + +#### Step 3: Write the Implementation Plan + +```text +Use writing-plans to create the implementation plan +``` + +Claude will generate a detailed task list, each task completable in 2-5 minutes. + +#### Step 4: Execute Development + +```text +Execute the plan above with TDD +``` + +Claude will: +1. Write tests first +2. Confirm tests fail (RED) +3. Write implementation code +4. Confirm tests pass (GREEN) +5. Refactor code (REFACTOR) + +#### Step 5: Code Review + +After completion, code review is triggered automatically to check: +- code quality +- security (SQL injection, XSS, etc.) 
+- test coverage +- documentation completeness + +--- + +## Superpowers vs Direct Claude Code Use + +| Dimension | Direct Claude Code Use | Using Superpowers | +|------|---------------------|-----------------| +| **Requirement clarification** | AI starts coding directly | Socratic questions clarify requirements first | +| **Development process** | Free-form depending on AI | TDD red-green-refactor enforced | +| **Task management** | One-shot completion | Broken into small tasks with checkpoints | +| **Code quality** | Depends on AI judgment | Code review enforced | +| **Predictability** | Unstable outcomes | Repeatable process | +| **Best for** | Simple tasks, prototype validation | Complex projects, production code | + +### Visual Metaphor + +If Claude Code is a "smart intern": + +- **Direct use**: tell the intern "build a login feature," and they start coding right away, possibly producing something you find off-target +- **With Superpowers**: assign the intern a senior mentor who clarifies requirements, creates plans, and checks code quality + +--- + +## Installation and Configuration in Detail + +### Method 1: Via Marketplace (Recommended) + +```bash +# Add marketplace +/plugin marketplace add obra/superpowers-marketplace + +# Install +/plugin install superpowers@superpowers-marketplace + +# Verify installation +/skills +``` + +### Method 2: Manual Clone + +```bash +# Create directory +mkdir -p ~/.claude/skills + +# Clone repository +git clone https://github.com/obra/superpowers.git ~/.claude/skills/superpowers +``` + +### Method 3: Project-Level Installation + +If you want to use Superpowers in a specific project: + +```bash +# In project root +mkdir -p .claude/skills + +# Clone or copy superpowers +cp -r ~/.claude/skills/superpowers .claude/skills/ +``` + +This allows team members to share the same Superpowers configuration. 
+ +--- + +## Common Skills Quick Reference + +| Skill Name | Function | Use Case | +|---------|------|---------| +| `brainstorming` | Clarify requirements through Socratic questioning | When requirements are unclear | +| `writing-plans` | Break tasks into small steps | Before starting large projects | +| `executing-plans` | Execute plan with checkpoints | During plan-driven development | +| `test-driven-development` | TDD red-green-refactor loop | For all feature development | +| `systematic-debugging` | Four-stage root-cause analysis | When bugs appear | +| `verification-before-completion` | Pre-completion verification | At task completion | +| `requesting-code-review` | Request code review | Before code submission | +| `subagent-driven-development` | Sub-agent-driven development | Parallel tasks | +| `using-git-worktrees` | Git worktree isolation | Parallel feature development | + +--- + +## Best Practices + +### 1. Use Clear Trigger Keywords + +Superpowers skills are keyword-triggered. Learn common trigger words: + +| Skill | Trigger Keywords | +|------|-----------| +| `test-driven-development` | "TDD", "test-driven", "write tests first" | +| `brainstorming` | Auto-triggered when requirements are unclear | +| `systematic-debugging` | "debug", "bug", "not working" | +| `writing-plans` | "make a plan", "planning" | + +### 2. Use Superpowers When Process Discipline Is Needed + +- Production-grade code development -> mention "TDD" +- Requirements are unclear -> let `brainstorming` clarify +- Complex project -> use `writing-plans` to decompose tasks + +### 3. Do Not Force It for Simple Tasks + +If it is a rapid prototype or one-off script, you do not need the full process. Superpowers is most suitable for code requiring long-term maintenance. + +### 4. Skills Can Be Combined + +```text +Implement user authentication with TDD, and after completion, help me do a code review +``` + +This triggers both `test-driven-development` and `code-review` skills. 
+ +--- + +## Frequently Asked Questions + +### Q1: Do I have to specify "TDD" when using Superpowers? + +**Not required**. + +Superpowers is a skill set, and each skill has its own trigger conditions: +- Say "use TDD" -> triggers `test-driven-development` +- Do not say TDD -> Claude may write tests or not (depends on model behavior) + +Superpowers exists to **enforce process discipline**, not to create capability from nothing. + +### Q2: Does Superpowers make development slower? + +At first, it may feel slower because: +- requirement clarification takes time +- tests are written before code +- code review is required + +But in the long run, overall efficiency improves due to reduced rework and fewer bugs. + +### Q3: Do small projects also need Superpowers? + +For prototype validation or very simple tasks, you can use Claude Code directly. Superpowers is better suited for: +- production-grade projects +- multi-person collaboration +- long-term maintainability + +### Q4: What is the difference between Superpowers and Skills? + +| Dimension | Superpowers | Skills | +|------|-------------|--------| +| **Nature** | Complete development methodology framework | Reusable skill packages | +| **Scope** | Covers the full development process | Focuses on specific functions | +| **Relationship** | Superpowers uses Skills internally | Superpowers is a collection of Skills | + +### Q5: Can I customize Superpowers skills? + +Yes. Superpowers is open-source, and you can: +1. Fork the repository +2. Modify existing skills +3. Add new skills +4. 
Contribute back to the community + +--- + +## References + +### Official Resources + +- [obra/superpowers GitHub](https://github.com/obra/superpowers) - official repository (50,000+ ⭐) +- [Detailed Superpowers Usage Tutorial](https://www.cnblogs.com/gyc567/p/19510203) - detailed Chinese tutorial +- [Superpowers Environment Setup Guide](https://m.blog.csdn.net/gitblog_00683/article/details/144768992) - setup guide + +### Community Resources + +| Repository | Description | +|------|------| +| [affaan-m/everything-claude-code](https://github.com/affaan-m/everything-claude-code) | comprehensive toolkit including TDD workflows | +| [shanraisshan/claude-code-best-practice](https://github.com/shanraisshan/claude-code-best-practice) | official best practices | + +### Related Articles + +- [Goodbye Vibe Coding! Use Superpowers to Make Claude Code Write Engineering-Grade Code](https://juejin.cn/post/7593573617648123956) +- [How I Use Superpowers MCP to Force Claude Code to Plan Before Coding](https://juejin.cn/post/7570341520551673871) +- [Claude Code + Superpowers Beginner Tutorial](https://juejin.cn/post/7594832320030638123) + +--- + +## Summary + +Superpowers is a set of **engineering-grade development skills** that upgrades Claude Code from a "smart intern" to a "disciplined development team." + +### Core Takeaways + +1. **Superpowers is a skill set, not magic** + - After installation, skills are available in the background + - Triggered via keywords or scenarios + - You can manually invoke specific skills + +2. **Remember key trigger phrases** + - Want TDD -> say "use TDD" + - Vague requirements -> `brainstorming` asks proactively + - Bug appears -> mention "debug" to trigger `systematic-debugging` + +3. 
**Best-fit scenarios** + - ✅ Production-grade code development + - ✅ Long-term maintainable projects + - ✅ Team collaboration projects + - ❌ Rapid prototypes (optional) + - ❌ One-off scripts (optional) + +Remember: **Superpowers does not make AI smarter; it makes AI more disciplined.** diff --git a/docs/en/stage-3/core-skills/workflow/index.md b/docs/en/stage-3/core-skills/workflow/index.md new file mode 100644 index 0000000..738e08e --- /dev/null +++ b/docs/en/stage-3/core-skills/workflow/index.md @@ -0,0 +1,943 @@ +# AI-Assisted Development Workflow + +In the previous chapters, we learned how to use AI IDEs to write code, how to manage code versions with Git, and how to design and implement API interfaces. But when you face a real development task, you may run into questions like these: + +- "This project has thousands of files. Where should I start?" +- "My boss asked me to add a new feature, but I'm not familiar with this part of the codebase." +- "I have no idea where this bug is. There is just too much code." +- "I need to refactor this pile of code, but I'm afraid of breaking something." + +The essence of these questions is: **how do you use AI tools efficiently in real development scenarios to get work done?** + +In this lesson, we will learn how to build a systematic AI-assisted development workflow so that you can use AI efficiently across different development scenarios. Through concrete examples, we will show how to use AI in new feature development, bug fixing, code refactoring, and more. 
+ +> 💡 **Prerequisites** +> +> Before studying this section, it is recommended that you first understand: +> - [AI IDE Basics](../../stage-1/ai-ide/) - master the basic use of AI IDEs +> - [Git and GitHub Workflow](../../stage-2/backend/2.4-git-workflow/) - understand code version management +> - [Using Large Models to Help Write API Code](../../stage-2/backend/2.3-ai-interface-code/) - understand the basic concept of AI-assisted development + +::: info 📚 What you will learn + +1. Understand AI's role in the development process and its capability boundaries +2. Master AI-assisted development strategies for different project types +3. Learn how to use Claude Code in scenarios such as new feature development, bug fixing, and code refactoring +4. Build a project knowledge base to improve collaboration efficiency with Claude Code +5. Master practical techniques for improving AI collaboration efficiency + +::: + +# 1. Understand AI's Capability Boundaries + +Before we start using AI to assist development, we first need to understand what AI can and cannot do. Only then can we build the right collaboration model. + +## 1.1 What AI Is Good At + +Think of AI as a very smart assistant that still needs clear instructions. It can quickly generate a code skeleton based on your description, and it can also read thousands of lines of code in seconds to find the part you need. If there are obvious syntax errors or common security vulnerabilities, it can help you discover them too. Repetitive tasks such as batch-renaming variables, formatting code, and generating documentation comments are especially suitable to hand over to AI. + +Put simply, AI is good at work that has clear rules and can be automated. + +## 1.2 What AI Is Not Good At + +But AI also has its limitations. It does not understand your business logic. Unless you tell it in detail, it will not know how your company's order flow works. 
It also cannot make decisions that require weighing trade-offs, such as technology selection or architecture design, because those depend on your experience and understanding of the project.
Recommend a suitable tech stack +2. Design the project directory structure +3. Plan the database schema +``` + +**Step 2: Build the basic framework** + +Based on the plan, ask AI to create the basic project structure: + +```text +Based on the plan we just discussed, help me: +1. Create the project directory structure +2. Initialize config files (package.json, .env, etc.) +3. Create the basic server code +``` + +**Step 3: Implement features one by one** + +Implement feature modules one at a time by priority: + +```text +Now implement the user registration feature with these requirements: +- Register with email and password +- Store passwords in encrypted form +- Email verification +``` + +**Key points:** +- Establish code conventions early so AI generates code that follows them +- Test and verify every feature module as soon as it is completed +- Keep project documentation updated in time + +## 2.2 Mature Projects (Large Existing Codebases) + +**Project characteristics:** +- Large codebase with historical conventions +- You need to keep coding style consistent +- Changes must consider the scope of impact + +**Recommended workflow:** + +**Step 1: Understand the project structure** + +Before changing code, first ask AI to help you understand the project: + +```text +This is an e-commerce project, and I need to add a coupon feature. +Please help me: +1. Analyze the overall project structure +2. Find the order-related code +3. See how other similar features are implemented +``` + +**Step 2: Find reference code** + +Ask AI to find similar implementations in the project as references: + +```text +Find how other promotional features in the project, such as full reduction and discounts, are implemented +``` + +**Step 3: Follow the existing style** + +Ask AI to implement the new feature in the style of the existing code: + +```text +Please implement the coupon feature by referring to how the full-reduction promotion is implemented. 
+Keep the same code style and directory structure. +``` + +**Key points:** +- Understand first, then change things, so you do not damage the existing architecture +- Keep coding style consistent +- Test related functionality after the change + +## 2.3 Rapid Prototypes (Validating Ideas) + +**Project characteristics:** +- Speed matters most, code quality matters less +- Used to validate product ideas or technical approaches +- May later be discarded or rewritten + +**Recommended workflow:** + +**Describe the requirement directly and implement quickly:** + +```text +Build a simple todo app with these requirements: +- Add, delete, and mark tasks as completed +- Store data locally +- Keep the UI simple, as long as it works +``` + +**Iterate quickly:** + +```text +Add search +Switch it to a dark theme +Add task categories +``` + +**Key points:** +- Do not worry too much about code quality or conventions +- Validate ideas quickly and adjust direction in time +- If the prototype succeeds, it will need refactoring later + +## 2.4 Maintenance Projects (Mostly Bug Fixes) + +**Project characteristics:** +- The code is already stable, and the main task is fixing issues +- You need to locate problems quickly +- Changes must be made carefully to avoid introducing new issues + +**Recommended workflow:** + +**Step 1: Locate the problem** + +```text +User feedback: after clicking the "Submit Order" button, the page freezes +Console error: TypeError: Cannot read property 'id' of undefined + +Please help me: +1. Analyze possible causes +2. Find the relevant code +``` + +**Step 2: Analyze the root cause** + +```text +Check in what situations this error occurs +Inspect the data flow +``` + +**Step 3: Apply the fix** + +```text +Fix this problem, and: +1. Add defensive code to avoid similar issues +2. 
Add error messages to improve user experience +``` + +**Key points:** +- Test thoroughly after the fix to ensure it does not affect other functionality +- Add defensive code to improve system robustness +- Record the problem and solution for future reference + +# 3. Workflows for Common Development Tasks + +In day-to-day development, we encounter many different types of tasks. Below are several of the most common AI-assisted workflows. + +## 3.1 Developing a New Feature + +**Scenario:** the product manager gives you a new requirement, and you need to implement a new feature. + +**Complete workflow:** + +**Step 1: Understand the requirement** (done by you) + +Before you start coding, first clarify: +- What feature needs to be implemented? +- What are the inputs and outputs? +- What are the edge conditions and exceptional cases? +- What are the performance and security requirements? + +**Step 2: Understand the existing code** (ask AI to help) + +```text +I need to implement a user comment feature. Please help me: +1. Check whether there is anything similar in the project +2. Find how user data and article data are stored +3. Understand the database operation conventions in this project +``` + +**Step 3: Make an implementation plan** (with AI) + +```text +Based on the analysis we just did, implementing the comment feature requires: +1. Creating a comment data model +2. Implementing CRUD APIs for comments +3. Adding comment display and posting functionality to the frontend + +Please help me confirm whether this plan is reasonable and whether anything is missing +``` + +**Step 4: Implement the feature** (AI generates, you review) + +```text +According to the plan, first implement the comment data model. +Requirements: +- Include comment content, author, article ID, and creation time +- Support nested comment replies +- Add necessary indexes +``` + +```text +Now implement the comment API endpoints. 
+Requirements: +- Post a comment +- Delete a comment (only your own comments can be deleted) +- Get all comments for an article +- Support pagination +``` + +```text +Finally implement the frontend comment component. +Requirements: +- Show the comment list +- Comment posting form +- Support replying to comments +``` + +**Step 5: Test and verify** (done by you) + +- Manually test the main flow +- Test edge cases such as empty content and overly long content +- Test permission control such as being unable to delete someone else's comment +- Ask AI to help you write automated tests + +**Real conversation example:** + +```text +You: I want to add a user comment feature so users can post comments on articles + +AI: Okay, let me first understand the project structure. + [AI analyzes the project code] + I see that the project already has a user system (models/user.js) + and an article system (models/article.js). + + Implementing the comment feature requires: + 1. Creating a comment data model (models/comment.js) + 2. Adding comment-related APIs (routes/comment.js) + 3. Adding comment display and posting functionality to the article detail page + + Should I start implementing it? + +You: Yes, let's start. Create the data model first + +AI: [Generates models/comment.js] + I have created the comment data model with these fields: + - id: comment ID + - content: comment content + - userId: author ID + - articleId: article ID + - parentId: parent comment ID (for nested replies) + - createdAt: creation time + + Should I implement the API next? + +You: Yes, implement the API +``` + +## 3.2 Fixing Bugs + +**Scenario:** a user reports a problem, and you need to locate and fix it quickly. + +**Complete workflow:** + +**Step 1: Gather information** (done by you) + +Record the following information: +- What is the error message? +- How can this problem be reproduced? +- When did it start happening? +- How large is the impact scope? 
+ +**Step 2: Locate the problem** (ask AI to help) + +```text +User feedback: after clicking the "Submit Order" button, the page freezes +Browser console error: TypeError: Cannot read property 'id' of undefined +The error occurs on line 45 of checkout.js + +Please help me: +1. Analyze possible causes +2. Find the relevant code +3. Check the data flow +``` + +**Step 3: Analyze the root cause** (with AI) + +```text +Check where this user object comes from +Inspect under what circumstances user could be undefined +``` + +**Step 4: Implement the fix** (AI generates, you review) + +```text +Fix this problem with these requirements: +1. Add defensive code to check whether user exists +2. If user does not exist, redirect to the login page +3. Add a friendly error message +``` + +**Step 5: Verify the fix** (done by you) + +- Reproduce the issue using the original steps and confirm the problem is solved +- Test related functionality to ensure no new problems were introduced +- Think about whether similar problems exist elsewhere + +**Real conversation example:** + +```text +You: A user reported that after logging in, refreshing the page logs them out + +AI: This is usually a token storage or validation issue. Let me check: + 1. Where is the token stored, localStorage or cookies? + 2. Is the token reloaded when the page refreshes? + + Please tell me which file contains the token-related code + +You: utils/auth.js + +AI: [Analyzes utils/auth.js] + I found the issue: the token is stored in sessionStorage, + and sessionStorage is cleared when the page is refreshed. + + Solution: use localStorage instead + + Would you like me to change it? + +You: Yes, change it + +AI: [Modifies the code] + The change is complete. The token is now stored in localStorage, + so it will not be lost after a page refresh. + + I recommend that you test: + 1. Log in and refresh the page to confirm the user stays logged in + 2. 
Close and reopen the browser to confirm the user stays logged in +``` + +## 3.3 Refactoring Code + +**Scenario:** the code has become hard to maintain, and it needs refactoring to improve code quality. + +**Complete workflow:** + +**Step 1: Define the refactoring goal** (done by you) + +Clarify the following: +- What problem are you trying to solve? (duplicate code? messy structure? performance issues?) +- How large is the scope of the refactor? +- What level of risk is acceptable? + +**Step 2: Make a refactoring plan** (with AI) + +```text +These three functions (getUserList, getArticleList, getCommentList) +contain a lot of duplicated code for pagination, sorting, and filtering. + +Please help me: +1. Analyze the duplicated parts +2. Propose a refactoring plan +3. Evaluate the risk of the refactor +``` + +**Step 3: Prepare a safety net** (ask AI to help) + +```text +Before refactoring, please write test cases for me first +so that the behavior stays the same after the refactor +``` + +**Step 4: Refactor step by step** (AI executes, you verify) + +```text +According to the plan we just discussed, first extract the shared function +``` + +```text +Now optimize the data structure +``` + +```text +Finally update the call sites +``` + +**Step 5: Verify the result** (done by you) + +- Run all tests and make sure they pass +- Manually test key functionality +- Check whether code quality has improved + +**Real conversation example:** + +```text +You: These three functions have a lot of duplicate code. Can you help me refactor them? + [Paste the code of the three functions] + +AI: I analyzed them, and the main differences are: + - getUserList: gets the user list + - getArticleList: gets the article list + - getCommentList: gets the comment list + + Their common logic is pagination, sorting, and filtering. + + Refactoring plan: + 1. Extract a generic getList function + 2. Pass the different parts as parameters + + This can reduce 80% of the duplicated code. 
+ + Should I implement it? + +You: Yes, but first help me write test cases + +AI: [Generates tests] + I have written tests for all three functions, + covering the main use cases. + + Can I start refactoring now? + +You: Yes, start refactoring + +AI: [Refactors the code] + Refactoring is complete. I created a generic getList function, + and the original three functions now all call that shared function. + + Please run the tests to confirm everything works: npm test +``` + +## 3.4 Code Review + +**Scenario:** a teammate submitted code, and you need to review it. + +**Complete workflow:** + +**Step 1: Quickly understand the change** (ask AI to help) + +```text +Summarize what this PR changed +Which files were mainly modified +``` + +**Step 2: Check common issues** (ask AI to help) + +```text +Check this change for: +1. Security issues (SQL injection, XSS, etc.) +2. Performance problems (N+1 queries, memory leaks, etc.) +3. Obvious bugs +4. Whether the coding style follows the project conventions +``` + +**Step 3: Deep review** (done by you) + +- Whether the business logic is correct +- Whether edge cases were considered +- Whether there is a better implementation approach +- Whether tests should be added + +**Step 4: Give feedback** (done by you) + +- Point out the places that need changes +- Explain why they should be changed +- Provide improvement suggestions + +**Real conversation example:** + +```text +You: Help me review this PR + [Provide the PR link or code] + +AI: I reviewed the change and found these issues: + + 1. Security issues: + - Line 23: user input is not escaped, which may lead to XSS risk + - Line 45: the SQL query is built by string concatenation, which risks injection + + 2. Performance problems: + - Line 67: the database is queried inside a loop, which creates an N+1 problem + + 3. 
Code quality: + - The handleSubmit function is too long (200 lines); consider splitting it + - Error handling is missing + + Would you like me to explain any of these in more detail? + +You: Explain the SQL injection issue in detail + +AI: On line 45: + ```javascript + const query = `SELECT * FROM users WHERE name = '${userName}'`; + ``` + + This directly concatenates user input into the SQL statement. + If a user enters `' OR '1'='1`, it becomes: + ```sql + SELECT * FROM users WHERE name = '' OR '1'='1' + ``` + This would return all user data. + + The correct approach is to use a parameterized query: + ```javascript + const query = 'SELECT * FROM users WHERE name = ?'; + db.query(query, [userName]); + ``` + + Would you like me to fix it? +``` + +# 4. Build a Project Knowledge Base + +To help AI understand your project better, it is recommended to build a knowledge base inside the project. That way AI can work according to your conventions and habits. + +## 4.1 Create a Project Description File + +Create a `CLAUDE.md` or `AGENTS.md` file in the project root to record key project information: + +````markdown +# Project Overview + +## Project Summary +This is an online learning platform that provides course management, user learning, assignment submission, and other features. + +## Tech Stack +- Frontend: React 18 + TypeScript + Vite +- Backend: Node.js + Express + PostgreSQL +- Deployment: Vercel (frontend) + Railway (backend) + +## Project Structure +``` +src/ +├── components/ # React components +├── pages/ # Page components +├── api/ # API calls +├── utils/ # Utility functions +└── types/ # TypeScript type definitions +``` + +## Code Conventions +- Use ESLint and Prettier to format code +- Component files use PascalCase (such as UserProfile.tsx) +- Utility functions use camelCase (such as formatDate.ts) +- Constants use UPPER_SNAKE_CASE (such as API_BASE_URL) + +## Development Flow +1. Create a feature branch from main +2.
Submit a PR after development is complete +3. Merge after code review passes + +## Common Tasks +- Start the development server: `npm run dev` +- Run tests: `npm test` +- Build for production: `npm run build` +- Format code: `npm run format` + +## Notes +- All API calls must include error handling +- User input must be validated and escaped +- Use parameterized queries for database operations to avoid SQL injection +- Sensitive information (passwords, tokens) must not be written to logs + +## Database Schema +- users: user table (id, email, password_hash, created_at) +- courses: course table (id, title, description, teacher_id) +- enrollments: enrollment table (id, user_id, course_id, enrolled_at) +```` + +## 4.2 Record Common Problems and Solutions + +Create `docs/troubleshooting.md` in the project to record common problems: + +```markdown +# Common Problems + +## Development Environment Problems + +### Problem: npm install fails +**Cause:** Node version is incompatible +**Solution:** Use Node.js 18 or higher + +### Problem: database connection fails +**Cause:** environment variables are not configured +**Solution:** Copy .env.example to .env and fill in the database connection info + +## Feature Problems + +### Problem: after users log in, refreshing the page logs them out +**Cause:** the token is only kept in memory and is lost on page refresh +**Solution:** persist the token in localStorage + +### Problem: image upload fails +**Cause:** file size exceeds the limit +**Solution:** add a file size check on the frontend and limit it to 5MB +``` + +## 4.3 Maintain Technical Decision Records + +Create a `docs/decisions/` directory to record important technical decisions: + +```markdown +# ADR-001: Choosing PostgreSQL as the Database + +## Status +Accepted + +## Background +The project needs to choose a relational database. The candidates are MySQL and PostgreSQL. + +## Decision +Choose PostgreSQL + +## Rationale +1. Better JSON support, suitable for storing course content +2.
Stronger full-text search +3. The team is more familiar with PostgreSQL + +## Consequences +- We need to learn PostgreSQL-specific features +- Deployment requires a PostgreSQL environment +``` + +# 5. Techniques for Improving AI Collaboration Efficiency + +By mastering some practical techniques, you can make your collaboration with AI more efficient. + +## 5.1 Be Clear and Specific When Describing Problems + +**Bad description:** +```text +This feature has a problem +Help me optimize it +``` + +**Good description:** +```text +After the user clicks the "Submit" button, the form is not submitted +The browser console reports: Uncaught TypeError: Cannot read property 'value' of null +The error occurs on line 23 of form.js + +This list loads very slowly and has 1000 items +Please help me add pagination with 20 items per page +``` + +**Key points:** +- Provide specific error information +- Explain the expected result +- Give relevant context + +## 5.2 Do Only One Thing at a Time + +**Bad approach:** +```text +Help me implement login, registration, password recovery, profile center, +password change, and email verification +``` + +**Good approach:** +```text +Implement the login feature first, with these requirements: +- Email and password login +- Remember login state +- Error messages + +(After it is done) Now implement the registration feature + +(After it is done) Now implement the password recovery feature +``` + +**Key points:** +- Break large tasks into small tasks +- Test and verify after every completed task +- Confirm there are no issues before moving to the next one + +## 5.3 Verify Results Promptly + +**Bad approach:** +- Let AI modify 10 files in a row +- Only discover at the end that the first change was already wrong +- Waste a lot of time + +**Good approach:** +- Modify one file and test immediately +- Confirm there is no problem, then continue +- Correct issues as soon as they are found + +**Key points:** +- Move in small steps and get fast feedback +- Do 
not blindly trust AI +- Stay in control of the code + +## 5.4 Make Good Use of Context + +**Technique 1: refer to previous conversation** +```text +Implement according to the plan we just discussed +Refer to the previous getUserList function +``` + +**Technique 2: provide related code** +```text +This is the existing user model code: +[paste code] + +Please implement the article model in the same style +``` + +**Technique 3: explain project background** +```text +This is an e-commerce project using React + Node.js +It already has a user system and a product system +Now we need to add a shopping cart feature +``` + +## 5.5 Save Useful Conversations + +**Scenario:** you solved a complex problem + +**How to do it:** +1. Record the solution in project documentation +2. Refer to it the next time a similar issue appears +3. Share it with other team members + +**Example:** + +Create a document under `docs/solutions/`: + +```markdown +# Solving the N+1 Query Problem + +## Problem Description +When fetching the article list, the system queries the author information once per article, +which causes a performance problem. + +## Solution +Use a JOIN query to fetch all the data in one go: + +```sql +SELECT articles.*, users.name as author_name +FROM articles +LEFT JOIN users ON articles.author_id = users.id +``` + +**Result:** query time dropped from 2000ms to 50ms + +## 5.6 Learn the Art of Asking Questions + +**Technique 1: ask "why" first** +```text +Why does this code cause a memory leak? +Why should we use useCallback instead of a normal function? +``` + +**Technique 2: ask for multiple options** +```text +What are the different ways to implement user authentication? +What are the pros and cons of each? +``` + +**Technique 3: ask for explanations** +```text +How does this code work? +Can you explain this algorithm in detail? +``` + +# 6. Frequently Asked Questions + +## Q1: Can I use AI-generated code directly? + +**A:** No, not directly. It needs review and testing. 
+ +AI-generated code may have the following problems: +- logical errors or poor handling of edge cases +- failure to match the project's coding conventions +- security risks +- insufficient performance optimization + +You need to: +- carefully read the generated code +- understand its logic +- test different scenarios +- confirm that it follows the project conventions + +## Q2: What if AI misunderstands what I mean? + +**A:** Correct it in time and describe the requirement again. + +```text +That's not what I meant. What I mean is... +This understanding is incorrect. It should be... +Let me describe the requirement again... +``` + +If it is still wrong after several corrections, you can: +- provide more context +- give specific code examples +- split the task into smaller pieces + +## Q3: What if I run into something AI cannot solve? + +**A:** AI is not all-powerful. Some problems still need you to solve them yourself. + +Problems AI may not be able to solve: +- very new technologies (AI knowledge has a cutoff date) +- business logic unique to your team +- problems that require access to external systems +- complex performance optimization issues + +At that point, you need to: +- read the official documentation +- search for related solutions +- ask experienced teammates +- ask in the community + +## Q4: How do I judge whether AI's suggestion is reasonable? + +**A:** Use your own experience and knowledge to judge it. + +Evaluation criteria: +- whether it follows best practices +- whether it considers edge cases +- whether there are potential security risks +- whether it fits the project's tech stack +- whether performance is acceptable + +If you are not sure, you can: +- ask AI to explain why it suggests that approach +- ask for alternative solutions +- consult team members + +## Q5: How should a team use AI in collaboration? + +**A:** Establish shared conventions and a shared knowledge base. 
+ +Recommendations for team collaboration: +- share the project's `CLAUDE.md` configuration +- unify code conventions and style +- record solutions to common problems +- regularly share useful prompts +- check AI-generated code during code review + +## Q6: How do I avoid becoming overly dependent on AI? + +**A:** Keep learning and thinking. AI is an assistant, not a replacement. + +Recommendations: +- understand AI-generated code instead of copying it blindly +- actively learn concepts you do not understand +- regularly review foundational knowledge +- try solving problems yourself first, then use AI to verify +- participate in code review to learn from others' experience + +# 7. Summary + +Through this chapter, you have now mastered: + +1. **AI's capability boundaries**: understand what AI is good at and not good at, and build the right collaboration model +2. **Project-type strategies**: different development strategies for brand-new projects, mature projects, rapid prototypes, and maintenance projects +3. **Common task workflows**: complete workflows for new feature development, bug fixing, code refactoring, and code review +4. **Project knowledge base**: learn how to build project documentation so AI can understand your project better +5. **Collaboration techniques**: practical ways to improve AI collaboration efficiency + +**Key takeaways:** + +- **Clear division of roles**: you make decisions and ensure quality, AI handles execution and assistance +- **Clear communication**: be specific and do one thing at a time +- **Verify promptly**: do not trust blindly, test and verify +- **Keep learning**: understand AI's capability boundaries and continuously improve the collaboration model + +Remember: AI is a tool, not a replacement. It can make you more efficient, but the final code quality still depends on your judgment. Start with simple tasks and gradually build trust. You will find that AI can save you a lot of time and let you focus on more valuable work. 
+ +::: tip 💡 Next step +In the next chapter, we will learn how to use AI for code review and quality assurance to ensure code maintainability and security. +::: diff --git a/docs/en/stage-3/cross-platform/3.1-choose-platform/index.md b/docs/en/stage-3/cross-platform/3.1-choose-platform/index.md new file mode 100644 index 0000000..222539f --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.1-choose-platform/index.md @@ -0,0 +1,535 @@ +# How to Choose the Right Platform for Your Application + +You have an idea and want to turn it into a real product. But with so many platform options - WeChat Mini Programs, iOS apps, Android apps, websites, browser extensions, desktop applications - where should you start? + +::: tip 💡 Quick Navigation +If you already know the characteristics of each platform, you can jump directly to [Section 2](#2-ask-yourself-three-questions-first) for the decision process, or see [the decision flowchart in Section 7](#7-summary-platform-selection-decision-flow). +::: + +This article will help you sort out your thinking and find the most suitable development platform based on your specific scenario. + +## 1 Know These Platforms First + +Before discussing "which one to choose," first understand "which ones exist." Below are the mainstream platform categories right now: + +### 1.1 Mobile Platforms + +#### iOS Native App + +The apps you download from the App Store on your iPhone are iOS native apps. Their features are: fast launch, smooth experience, and full access to phone capabilities (camera, location, health data, etc.). But development requires a Mac, and App Store release requires Apple's review. + +**Common examples**: WeChat, Douyin (TikTok China), Xiaohongshu, Keep, Meituan, Alipay + +#### Android Native App + +Apps downloaded from Android app stores, or installed from APK files sent by friends, are Android native apps. They are similar to iOS apps, but Android has more users and more distribution channels. 
The downside is device fragmentation: developers must adapt to many screen sizes and system versions. + +**Common examples**: Tasker (automation), MX Player (video player), AirDroid (phone manager), Greenify (battery optimization), Xposed Framework (system customization) + +#### WeChat Mini Program + +The "small apps" you can use directly inside WeChat by scanning a code or searching by name, with no installation needed. The advantage is low user friction: everyone already has WeChat, so users can start instantly. The downside is limited capabilities, and it only runs inside WeChat. + +**Common examples**: Pinduoduo (group-buy e-commerce), Meituan Waimai (local services), Mobike (bike sharing), Jump Jump (mini game), Zhouheiya (ordering/shopping) + +#### PWA (Progressive Web App) + +It sounds technical, but it's basically "a web page that can be installed like an app." When users open a site in a mobile browser, they may see "Add to Home Screen." After one tap, an icon appears on the home screen and behaves like an app. The advantage is one codebase for mobile and desktop. The downside is many users do not know this usage pattern. + +**Common examples**: Twitter Lite, Starbucks, Pinterest, Uber, Spotify Web Player + +### 1.2 Desktop Platforms + +#### Electron Desktop App + +You might use them every day: VS Code, Slack, Discord, Notion, Figma - all built with Electron. The key feature is: build desktop software using web technologies (HTML, CSS, JavaScript), and run one codebase across Windows, Mac, and Linux. The downside is larger installers and higher runtime memory usage. + +**Common examples**: VS Code, Slack, Discord, Notion, Figma, WeChat Developer Tools + +#### Qt Desktop Application + +If you have used WPS, VirtualBox, or OBS, they may have been built with Qt. Qt uses C++, with good performance and stability, especially suitable for industrial scenarios. But the learning curve is higher, and C++ knowledge is required. 
+ +**Common examples**: WPS Office, VirtualBox, Autodesk Maya, Telegram Desktop, OBS Studio + +#### Native Desktop Application + +These "heavyweight" applications are usually built with native technologies. Windows often uses C# or C++; macOS uses Swift. They provide the best performance and smoothest experience, but Windows and macOS versions must be developed separately, which is expensive. + +**Common examples**: Microsoft Office, Adobe Photoshop, Final Cut Pro, WeChat (Windows/Mac), QQ Music + +### 1.3 Web-Related Platforms + +#### Website + +These are pages opened by entering URLs in a browser. Advantages: accessible on any device (phone, computer, tablet), no installation required, and searchable by search engines. Downside: internet connection is required, so offline usage is unavailable. + +**Common examples**: Taobao, Zhihu, GitHub, Bilibili, Juejin, CSDN + +#### Browser Extension + +Have you used ad blockers, translation tools, or password managers? These are browser extensions. They run inside browsers and can read/modify web page content. For example, install a translation extension and translate English pages with one click. Advantage: lightweight and starts with browser. Downside: works only in browsers, and extensions are not always cross-compatible across Chrome, Edge, and Firefox. + +**Common examples**: AdBlock Plus, Immersive Translate, 1Password, Grammarly, Tampermonkey, Dark Reader + +### 1.4 Other Platforms + +#### VS Code Extension + +If you are a developer, you likely use VS Code. VS Code extensions are small programs that "add features" to the editor. Advantage: highly targeted developer audience. Downside: only useful for developer users. + +**Common examples**: Prettier, GitLens, GitHub Copilot, ESLint, Live Server, Chinese Language Pack + +#### NFT Smart Contract + +You may have heard about NFTs - those "digital avatars" sold for millions. NFTs are essentially blockchain-based ownership certificates proving a digital item belongs to you. 
Smart contracts are programs running on blockchain to create and manage NFTs. Advantage: tamper-resistant and tradable. Downside: high technical barrier and volatile market. + +**Common examples**: BAYC, CryptoPunks, NBA Top Shot, Azuki, Moonbirds + +### 1.5 Are There More Options? + +Beyond the platforms above, there are also "middle paths" and more possibilities: + +#### Cross-platform Frameworks + +::: details Click to view cross-platform framework details + +**React Native / Flutter**: want both iOS and Android without writing two codebases? These frameworks let you write once and generate apps for both platforms. Many companies use them, such as Airbnb and Instagram. + +**Tauri**: a "lightweight alternative" to Electron. It also uses web tech to build desktop apps but with smaller installers and faster runtime. Downside: ecosystem is less mature. + +**uni-app**: very popular in China. One codebase can target WeChat Mini Program, iOS app, Android app, and H5 website. Suitable for teams that want "build once, run everywhere." + +**Capacitor / Ionic**: already have a website and want to quickly turn it into an app? These tools can "wrap" your website into an installable app for app stores. + +These frameworks are essentially trade-offs between native and web development: higher development efficiency, but some compromises on performance and experience. +::: + +#### China Mini Program Ecosystem + +::: details Click to view mini program options in China + +**Alipay Mini Program**: finance and local service scenarios. If your users pay bills, order food, or use transit in Alipay, then Alipay Mini Program is a fit. Capabilities like Zhima credit and trust identity are unique to Alipay. + +**Douyin Mini Program**: content commerce and livestream sales. If you sell on Douyin, mini programs can be attached under videos for instant conversion. + +**Kuaishou Mini Program**: lower-tier markets and strong community economy. 
Kuaishou users are highly engaged, suitable for community group buying and local services. + +**Baidu Mini Program**: search traffic entry. If users search "nearby restaurants" on Baidu, your mini program can appear directly in results. +::: + +#### HarmonyOS Ecosystem + +**HarmonyOS apps**: can run on Huawei phones, tablets, watches, and smart home devices. Developed with ArkTS (similar to TypeScript), one codebase can support multiple devices. If your audience is in Huawei ecosystem or your product involves IoT linkage, HarmonyOS is a key option. + +#### More Developer Tools + +::: details Click to view more developer tool options + +**Command Line Tools (CLI)**: developers use terminal daily. CLI tools can automate repetitive work, generate code templates, and deploy projects. Examples include `create-react-app`, `git`, and `npm`. Suitable for developer productivity and DevOps automation. + +**JetBrains plugins**: besides VS Code, many developers use IntelliJ IDEA, PyCharm, and WebStorm. If your tool targets Java, Python, or frontend developers, JetBrains Marketplace is also worth considering. + +**Cursor / Windsurf plugins**: emerging ecosystems for AI coding tools. If you are building AI-assisted coding features, these IDE plugin ecosystems are growing quickly. +::: + +#### Community Bots + +::: details Click to view community bot options + +**Telegram Bot**: large overseas user base and developer-friendly APIs. Suitable for notifications, automation tasks, and community management. Many crypto projects and dev communities use Telegram. + +**Discord Bot**: core platform for gaming and developer communities. Useful for music playback, game data queries, and server management. If your users are gamers or overseas developers, Discord bots are often essential. +::: + +#### Design and Productivity Tools + +::: details Click to view design tool options + +**Figma plugins**: designers use Figma every day. 
Plugins can automate design workflows, generate code, and manage design systems. Suitable for design tooling and frontend assistance. + +**Notion integrations**: with Notion API you can automate workflows, sync data, and generate reports. Suitable for knowledge management and project management tools. +::: + +#### Spatial Computing + +**visionOS apps (Apple Vision Pro)**: the new era of spatial computing. Suitable for 3D content display, immersive experiences, education/training, and virtual collaboration. Technical barrier is high, but for frontier exploration this is a future direction. + +--- + +## 2 Ask Yourself Three Questions First + +Before choosing a platform, answer these three core questions: + + + +
+
+- Do users need to use it anytime, anywhere? (mobile first)
+- Are users used to completing tasks inside WeChat? (mini program)
+- Will users spend long sessions in office scenarios? (desktop app)
+- Do users need to find you via search engines? (website)
+
+
+ + + +
+
+- Does it need access to camera, microphone, GPS, or other hardware?
+- Does it need offline support?
+- Does it need push notifications?
+- Does it need to process large amounts of local data?
+
+
+ + + +
+
+- What is your development time budget?
+- Do you have a Mac device (required for iOS development)?
+- Do you need to cover multiple platforms at once?
+
+
+ +--- + +## 3 Platform Selection Decision Table + +Use this table to quickly identify your fit: + +| Your scenario | Recommended platform | Why | +|---------|---------|------| +| Users are in WeChat ecosystem and you want fast user growth | WeChat Mini Program | No download needed, easy WeChat sharing, low acquisition cost | +| Need continuous GPS tracking in background and health data access | iOS / Android Native | Direct system API access, best performance | +| Want one codebase for multiple platforms | PWA / Electron | High efficiency, low maintenance cost | +| Users need long sessions on computers | Desktop App (Electron / Qt) | Separate window, offline support, strong system integration | +| Need auto summary/translation/password management while browsing | Browser Extension | Can read/modify webpage content, launches with browser | +| Want technical articles/project showcase indexed by Google | Website / Personal Blog | SEO-friendly, searchable content | +| Want to issue tradable digital membership cards or collectibles | NFT Smart Contract | On-chain ownership, transferable/tradable | + +--- + +## 4 Practical Scenario Examples + +### Scenario 1: I want to build a community group-buy tool + +**💡 Recommended: WeChat Mini Program** + +Why mini program? + +- **Users are already in WeChat**: community users are active in WeChat groups; mini programs can be shared directly in groups +- **Use-and-go behavior**: nobody wants to install a dedicated app just to order vegetables +- **Seamless payment**: one-tap WeChat Pay, no context switching +- **Low acquisition cost**: one group-sharing flow can bring dozens of users + +::: tip 💡 Applicable scenarios +If your product is similar - group buying, booking, surveys, event signup - mini programs are usually the first choice. +::: + +--- + +### Scenario 2: I want to build a running tracker app + +**⚡ Recommended: iOS / Android Native** + +Why native app? 
+ +- **Background running**: app must keep tracking route during running, which mini programs and websites cannot reliably do +- **GPS precision**: native apps can access high-precision location with small error range +- **Health data access**: step count and heart rate access needs Apple HealthKit / Google Fit +- **Reliable push reminders**: daily "time to run" reminders are best done via native push + +::: warning ⚠️ Important note +Any app that requires **long-term background execution** or **deep hardware access** should choose native development. +::: + +--- + +### Scenario 3: I want to build a bookkeeping app + +**📝 Recommended: PWA or Mini Program** + +Why? + +- **High frequency but short sessions**: one record per day, done in 30 seconds +- **No complex hardware needs**: mostly data entry and display +- **Strong cross-platform requirement**: users may record on phone and review reports on desktop +- **Offline scenario**: users may want to log expenses in subway with no signal + +PWA can be installed on home screen and feels like an app, while development cost is about one-third of native. Mini programs are often better for China users. + +--- + +### Scenario 4: I want to build an online education platform + +**📚 Recommended: Website + Mini Program combination** + +Why? + +- **Website handles acquisition**: course pages, instructor profiles, SEO optimization +- **Mini program handles conversion**: trial class, enrollment payment, group join via QR +- **Website handles delivery**: video playback is better on larger web screens +- **Mini program handles touchpoints**: class reminders and homework notifications + +::: tip 💡 Combination strategy +Complex business often needs a **multi-platform combination**, not a single platform. +::: + +--- + +### Scenario 5: I want to build a team collaboration tool + +**🤝 Recommended: Electron desktop app + web version** + +Why? 
+ +- **Desktop side**: users keep computers on at work; desktop apps can stay resident and receive messages +- **Web side**: temporary use on other computers without installation +- **System integration**: desktop app can access local files, system notifications, and shortcuts +- **One codebase**: Electron uses web stack, and desktop/web can reuse about 80% code + +Slack, Notion, and Discord all follow this pattern. + +--- + +### Scenario 6: I want to build a password manager + +**🔐 Recommended: Desktop app + browser extension** + +Why? + +- **Desktop app**: secure local password database storage, supports biometric unlock +- **Browser extension**: autofill on login pages without switching windows +- **Offline availability**: password data stored locally, independent of network +- **Security control**: users know where their data is, reducing cloud leakage concerns + +1Password and Bitwarden both use this combination. + +--- + +### Scenario 7: I want to build a content creation platform + +**✍️ Recommended: Website + personal blog** + +Why? + +- **SEO is the lifeline**: search is your largest long-term traffic source +- **Content is product**: articles, tutorials, and videos are core value +- **Long-term asset**: websites can operate for years, while social accounts can be suspended anytime +- **Flexible monetization**: ads, paid subscriptions, and knowledge commerce can all run on websites + +Medium, Zhihu columns, and personal tech blogs are all essentially content platforms. + +--- + +### Scenario 8: I want to build a developer productivity tool + +**🛠️ Recommended: VS Code extension or CLI tool** + +Why? 
+ +- **Users are already inside the editor**: developers dislike context switching +- **Context awareness**: tools can read current code and provide precise suggestions +- **Easy distribution**: publish to extension marketplace and users install with one click +- **Fast iteration**: no app store review delays, same-day release/update + +Prettier, ESLint, and GitHub Copilot are all VS Code extensions. + +--- + +### Scenario 9: I want to build an industrial monitoring dashboard + +**🏭 Recommended: Qt desktop application** + +Why? + +- **Stability above all**: factories run 24/7 and software cannot crash +- **Hardware communication**: needs serial/Modbus communication with sensors +- **Real-time charting**: pressure/temperature/flow often need millisecond refresh +- **Industrial environment**: industrial computers commonly run Windows, and Qt compatibility is strong + +::: warning ⚠️ Industrial scenarios +Industrial scenarios require stability and hardware interfaces that web technologies usually cannot satisfy. +::: + +--- + +### Scenario 10: I want to issue a digital membership card + +**🎫 Recommended: NFT smart contract** + +Why? + +- **Unforgeable**: on-chain records cannot be tampered with +- **Transferable**: memberships can be gifted or traded on secondary markets +- **Programmable**: smart contracts can automate benefits (for example auto-upgrade after one year) +- **Global reach**: no national boundaries, global participation possible + +Starbucks Odyssey and NBA Top Shot both use NFTs in membership systems. 
+ +--- + +## 5 Quick Platform Capability Comparison + +### 5.1 Mobile Solution Comparison + +| Capability | WeChat Mini Program | iOS Native | Android Native | PWA | +|-----|----------|---------|-------------|-----| +| User acquisition cost | Low (WeChat sharing) | High (app store) | High (app store) | Medium (search engines) | +| Offline usage | Limited | Full | Full | Supported | +| Push notifications | Supported | Supported | Supported | Partial | +| Hardware access | Restricted | Full access | Full access | Restricted | +| Background running | Restricted | Supported | Supported | Restricted | +| Development cost | Low | High | High | Low | +| Review required | Yes | Yes | Yes | No | + +### 5.2 Desktop Solution Comparison + +| Capability | Electron | Qt | Browser Extension | +|-----|----------|-----|-----------| +| Cross-platform | Win/Mac/Linux | Win/Mac/Linux | Chrome/Edge/Firefox | +| System integration | Medium | High | Low | +| Offline usage | Supported | Supported | Partial | +| Hardware access | Via Node.js | Full access | Restricted | +| Installation | Installer package | Installer package | Browser extension store | +| Development stack | Web technologies | C++/QML | JavaScript | + +--- + +## 6 Common Misconceptions + + + + +
+ Not necessarily. If your app is lightweight and meant for quick, open-use-close sessions, a mini program or PWA may be a better choice. Native development is worth it only when you need deep system access or top-end performance.
+
+ + + +
+ The opposite is true. Websites are the only platform whose content is fully indexable by search engines. If you want content-driven user growth, websites and personal blogs are the top choices. Technical articles and project showcases can keep bringing in SEO traffic over time.
+
+ + + +
+ In office scenarios, desktop apps are still mainstream. VS Code, Slack, and Notion are all desktop apps. If your app needs long-session usage, heavy data handling, or system integration, desktop is often the best choice. +
+
+ + + +
+ Modern PWAs are already very close to the native experience. Starbucks, Pinterest, and Uber all have PWA versions. If your app does not require complex hardware integration, a PWA is often the most cost-effective cross-platform solution.
+
+
+ +--- + +## 7 Summary: Platform Selection Decision Flow + +```text +Start + │ + ├─ Are users in WeChat ecosystem? ───────────────────→ WeChat Mini Program + │ + ├─ Need best performance and deep hardware access? ──→ iOS / Android Native + │ + ├─ Need long usage sessions on computers? ───────────→ Desktop App + │ │ + │ ├─ Industrial scenario? ───────────────────────→ Qt + │ └─ General scenario? ──────────────────────────→ Electron + │ + ├─ Need to process browser page content? ────────────→ Browser Extension + │ + ├─ Lightweight + cross-platform + offline? ──────────→ PWA + │ + ├─ Need to be discoverable by search? ───────────────→ Website / Blog + │ + ├─ Developer tool? ───────────────────────────────────→ VS Code Extension + │ + └─ Blockchain asset? ────────────────────────────────→ NFT Smart Contract +``` + +--- + +## 8 Next Step + +::: tip 🎯 Start Taking Action +Based on the analysis above, you should now have a preliminary answer to "which platform to choose." Next, click the matching tutorial to start: +::: + + + + + + + + + + + diff --git a/docs/en/stage-3/cross-platform/3.10-electron-voice-to-text/index.md b/docs/en/stage-3/cross-platform/3.10-electron-voice-to-text/index.md new file mode 100644 index 0000000..b20fad3 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.10-electron-voice-to-text/index.md @@ -0,0 +1,500 @@ +# How to Build a Cross-Platform Electron Desktop App: A Speech-to-Text Application + +# Chapter 1: What Electron and Desktop App Development Are + +In this tutorial, we will complete a full closed loop: build a speech-to-text desktop app from scratch with Electron, support both cloud API and local model recognition modes, and finally package it into a real desktop application that can be installed and run on Windows, macOS, and Linux. 
+ +For this tutorial, you should at least have: + +- A computer (Windows or Mac, Mac is recommended because local models run very fast on Apple Silicon) +- A Node.js environment (version 18.0 or above) +- Your AI coding assistant (Cursor / Trae / Claude Code) +- (Optional) An OpenAI API Key (if you use cloud mode) +- A microphone (the built-in laptop microphone is fine) + +## 1.1 What Is Electron? + +Apps you use every day, such as **VS Code, Slack, Discord, and Notion**, have one thing in common: they are all desktop applications built with **Electron**. + +Electron is an open-source framework that lets you use **HTML + CSS + JavaScript** (the same stack used for web pages) to build desktop apps that run across **Windows, macOS, and Linux**. Its principle is simple: package Chromium and Node.js together, and your web page becomes a standalone desktop app. + +**One-sentence understanding**: Electron = an "invisible Chrome browser" + Node.js system capabilities. + + + +## 1.2 Core Electron Architecture + +An Electron app consists of two process types. 
Understanding them is the key to development: + +**Main Process** + +* The "general manager" of the app +* Responsible for creating windows, managing app lifecycle, and accessing native capabilities such as the file system +* Runs in the Node.js environment and can use all Node.js modules +* There is only one main process per app + +**Renderer Process** + +* The "front face" of the app +* Essentially a Chromium web page responsible for UI rendering +* Each window corresponds to one renderer process +* For security reasons, the renderer process cannot directly access Node.js APIs + +**Preload Script** + +* The "bridge" between the main process and renderer process +* Uses `contextBridge` to safely expose selected APIs to the renderer process + +They communicate through **IPC (Inter-Process Communication)**, like making a phone call: the renderer says "I want to start recording," and the main process receives that request and calls the system microphone. + + + +## 1.3 What Are We Building? + +In this tutorial, we will build a **Speech-to-Text** desktop app. Its functionality is straightforward: + +1. Click the "Start Recording" button, and the app starts listening to the microphone +2. After speaking, click "Stop," and the app sends audio to AI for recognition +3. 
The recognized text is displayed in the UI and can be copied with one click + +**Two recognition modes are available:** + +| Comparison Dimension | Cloud API Mode | Local Model Mode | +|---------|-------------|------------| +| Representative Solution | OpenAI Whisper API | whisper.cpp | +| Internet Required | Yes | No | +| Recognition Speed | Depends on network | Depends on hardware (very fast on Apple Silicon) | +| Chinese Recognition Quality | Excellent | Excellent (large-v3 model) | +| Cost | $0.006/minute | Free | +| Model Size | No download required | tiny model 75MB, large model 3GB | +| Best For | Fast onboarding, lightweight usage | Privacy-focused, offline usage, long-term high-frequency usage | + + + +## 1.4 Important Note: Web Speech API Is Not Available in Electron + +If you have searched for "Electron speech recognition," you may have seen recommendations to use the browser's built-in `Web Speech API`. **Please note: this does not work in Electron.** + +Google has discontinued speech API support for non-Chrome/Edge browser shells. Electron is Chromium-based, but it is not Chrome itself, so `window.SpeechRecognition` will fail directly. + +That is why we need independent solutions such as OpenAI Whisper API or whisper.cpp. + +## 1.5 Tutorial Roadmap + +We will complete the full flow in the following steps: + +1. **Create an Electron project**: Use Electron Forge to scaffold the project and understand inter-process communication +2. **Implement recording**: Capture microphone input in the renderer process and process audio data +3. **Cloud recognition (Option A)**: Use OpenAI Whisper API for speech-to-text +4. **Local recognition (Option B)**: Use whisper.cpp locally without internet access +5. 
**Packaging and distribution**: Package the app into an installable desktop program + +# Chapter 2: Create the Electron Project + +## 2.1 Initialize the Project with AI + +Open your AI coding assistant and enter this prompt: + +``` +Please help me create a new Electron project with Electron Forge using the Vite template. +The project name is voice-to-text. +Please run: npx create-electron-app voice-to-text --template=vite +After creation, enter the project directory and install dependencies. +``` + +Electron Forge is the official Electron-recommended scaffolding tool. It helps with project initialization, packaging, distribution, and other tedious setup tasks. + +After creation, the project structure is roughly: + +```text +voice-to-text/ +├── src/ +│ ├── main.js # Main process entry +│ ├── preload.js # Preload script (bridge) +│ ├── renderer.js # Renderer process entry +│ └── index.html # App HTML page +├── forge.config.js # Electron Forge config +├── vite.main.config.mjs # Main process Vite config +├── vite.preload.config.mjs # Preload script Vite config +├── vite.renderer.config.mjs # Renderer process Vite config +└── package.json +``` + +## 2.2 Start and Preview + +Ask AI to start the development server: + +``` +Please help me start the Electron development server by running npm start +``` + +After a few seconds, a desktop window appears. This is your Electron app. Even though it only shows a default welcome page now, it is already a real desktop program. + + + +## 2.3 Understand IPC (Inter-Process Communication) + +Before implementing speech features, we need to understand Electron's most important concept: **IPC (Inter-Process Communication)**. 
+ +Because the renderer process (UI) and main process (system capabilities) are isolated, they must use IPC "phone calls" to collaborate: + +```text +Renderer process (UI) Main process (system) + │ │ + │── "I want to start recording" ──────────→ │ + │ │── Call microphone + │ │── Process audio + │ ←──── "Here is the result" ─────────────│ + │ │ + │── Display text in UI │ +``` + +In code, this communication is bridged via `preload.js`: + +```javascript +// preload.js - safely expose APIs to renderer process +const { contextBridge, ipcRenderer } = require('electron') + +contextBridge.exposeInMainWorld('electronAPI', { + // Renderer -> Main + sendAudio: (audioData) => ipcRenderer.invoke('transcribe-audio', audioData), + // Main -> Renderer + onResult: (callback) => ipcRenderer.on('transcription-result', callback) +}) +``` + +```javascript +// main.js - main process listens for messages +const { ipcMain } = require('electron') + +ipcMain.handle('transcribe-audio', async (event, audioData) => { + // Call Whisper API or whisper.cpp here + const text = await transcribe(audioData) + return text +}) +``` + + + +# Chapter 3: Implement Recording + +## 3.1 Capture Microphone Input in the Renderer Process + +The browser (which is the Electron renderer process) provides `navigator.mediaDevices.getUserMedia` to access the microphone. Ask AI to help implement recording: + +``` +Please help me modify src/index.html and src/renderer.js to implement: + +UI: +1. A large circular "Start Recording" button, which turns into a red "Stop Recording" button when clicked +2. Show a simple pulse animation while recording +3. A text display area below for recognition results +4. Two buttons at the bottom: "Copy Text" and "Clear" +5. A settings icon at top-right to switch recognition mode (cloud/local) + +Recording logic (in renderer.js): +1. On button click, request microphone access via navigator.mediaDevices.getUserMedia +2. Use MediaRecorder to record audio in webm format +3. 
After stopping, convert audio Blob to ArrayBuffer +4. Send it to main process via window.electronAPI.sendAudio +5. Wait for recognition result from main process and display it +``` + +Core recording code: + +```javascript +// renderer.js +let mediaRecorder = null +let audioChunks = [] + +async function startRecording() { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: 16000, + echoCancellation: true, + noiseSuppression: true + } + }) + + mediaRecorder = new MediaRecorder(stream, { + mimeType: 'audio/webm;codecs=opus' + }) + + audioChunks = [] + mediaRecorder.ondataavailable = (e) => audioChunks.push(e.data) + + mediaRecorder.onstop = async () => { + const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }) + const arrayBuffer = await audioBlob.arrayBuffer() + + // Send to main process for transcription + const result = await window.electronAPI.sendAudio(arrayBuffer) + document.getElementById('result').textContent = result + } + + mediaRecorder.start() +} +``` + + + +## 3.2 Handle Microphone Permissions + +Electron blocks permission requests by default. We need to explicitly allow microphone access in the main process: + +``` +Please help me add microphone permission handling in main.js: +1. Use session.defaultSession.setPermissionRequestHandler to handle permission requests +2. Auto-allow when request type is 'media' +3. For macOS, ensure microphone usage description is declared in package.json or entitlements +``` + +```javascript +// Add to main.js +const { session } = require('electron') + +session.defaultSession.setPermissionRequestHandler( + (webContents, permission, callback) => { + if (permission === 'media') { + callback(true) + } else { + callback(false) + } + } +) +``` + +> **Note for macOS users**: macOS will show a system-level microphone permission dialog. This is normal. Click "Allow." + +# Chapter 4: Option A - Cloud Recognition (OpenAI Whisper API) + +This is the simplest option. 
You only need an API key and a few lines of code. + +## 4.1 Get an OpenAI API Key + +1. Visit [OpenAI Platform](https://platform.openai.com/), sign up, and log in +2. Go to the API Keys page and click **"Create new secret key"** +3. Copy the generated key (starts with `sk-`) and store it safely + +> **Cost reference**: Whisper API costs **$0.006/minute**. That means recognizing 1 hour of audio only costs $0.36, which is very affordable. + +## 4.2 Call Whisper API in the Main Process + +Ask AI to implement speech recognition in the main process: + +``` +Please help me implement OpenAI Whisper API in main.js: +1. Install node-fetch (if needed) or use built-in fetch in Node.js +2. Create transcribeWithWhisper function that accepts audio ArrayBuffer +3. Convert ArrayBuffer to Blob/File and build FormData +4. Call https://api.openai.com/v1/audio/transcriptions +5. Use model whisper-1 and set language to zh (Chinese) +6. Return the recognized text +7. Read API key from environment variables or config file +``` + +Core code: + +```javascript +// main.js +async function transcribeWithWhisper(audioBuffer, apiKey) { + const blob = new Blob([audioBuffer], { type: 'audio/webm' }) + const formData = new FormData() + formData.append('file', blob, 'audio.webm') + formData.append('model', 'whisper-1') + formData.append('language', 'zh') + + const response = await fetch( + 'https://api.openai.com/v1/audio/transcriptions', + { + method: 'POST', + headers: { Authorization: `Bearer ${apiKey}` }, + body: formData + } + ) + + const data = await response.json() + return data.text +} +``` + + + +## 4.3 Add a Settings UI + +Ask AI to add a simple settings panel in the renderer process to input API key and switch recognition mode: + +``` +Please help me add a settings panel in index.html: +1. Add a gear icon in the top-right corner; click to expand settings panel +2. 
The panel includes: + - Recognition mode switch (Cloud API / Local model) + - API Key input (only visible in cloud mode) + - Language dropdown (Chinese / English / Auto detect) +3. Save settings to localStorage +4. Close panel when clicking outside +``` + + + +# Chapter 5: Option B - Local Recognition (whisper.cpp) + +If you do not want to rely on cloud APIs, or if you need offline usage, whisper.cpp is the best choice. It is a C++ port of the OpenAI Whisper model and runs fully locally without internet. + +## 5.1 Install whisper.cpp Node.js Bindings + +Ask AI to install and configure: + +``` +Please help me install nodejs-whisper in the project: +npm install nodejs-whisper + +After installation, please help me download the whisper tiny model (small size, fast for testing). +nodejs-whisper will handle model download automatically. +``` + +> **Model selection guide**: +> * `tiny` (75MB): fastest, good for testing and lightweight usage, average accuracy +> * `base` (142MB): balance between speed and accuracy +> * `small` (466MB): clearly better Chinese recognition quality +> * `large-v3-turbo` (1.5GB): recommended; 5-8x faster than large, with only 1-2% lower accuracy +> * `large-v3` (3GB): highest accuracy, but slower and needs better hardware + +## 5.2 Integrate whisper.cpp in Main Process + +Ask AI to implement local recognition: + +``` +Please help me add whisper.cpp local recognition in main.js: +1. Import nodejs-whisper +2. Create transcribeWithLocal function +3. Accept audio ArrayBuffer and save it as a temporary WAV file first (16kHz mono) +4. Call nodejs-whisper for recognition +5. Return recognized text +6. 
Delete temporary file after recognition +``` + +Core code: + +```javascript +// main.js +const { nodewhisper } = require('nodejs-whisper') +const path = require('path') +const fs = require('fs') +const os = require('os') + +async function transcribeWithLocal(audioBuffer) { + // Save as temp file + const tempPath = path.join(os.tmpdir(), `recording-${Date.now()}.wav`) + fs.writeFileSync(tempPath, Buffer.from(audioBuffer)) + + try { + const result = await nodewhisper(tempPath, { + modelName: 'base', + autoDownloadModelName: 'base', + whisperOptions: { + language: 'zh', + word_timestamps: true + } + }) + return result.map(r => r.speech).join('') + } finally { + // Clean up temp file + fs.unlinkSync(tempPath) + } +} +``` + + + +## 5.3 Good News for Apple Silicon Users + +If you are using an M1/M2/M3/M4 Mac, whisper.cpp can automatically use **Metal GPU acceleration** and **Apple Neural Engine**. Recognition can run **faster than real-time**, which means 1 minute of audio may only take a few seconds to process. + +For NVIDIA GPU users, whisper.cpp also supports **CUDA acceleration**, which provides strong performance too. + +# Chapter 6: Packaging and Distribution + +After development is complete, we need to package the app into distributable installers. + +## 6.1 Package with Electron Forge + +Electron Forge is already included in our project, so packaging is simple: + +``` +Please help me run the Electron Forge packaging command: +npx electron-forge make +``` + +This command automatically generates installers for your current operating system: + +* **macOS**: `.dmg` installer image and `.zip` archive +* **Windows**: `.exe` installer (Squirrel format) +* **Linux**: `.deb` (Debian/Ubuntu) and `.rpm` (Fedora) packages + +Build outputs are in the `out/make/` directory. + + + +## 6.2 App Size Optimization + +One "pain point" of Electron apps is large package size (because Chromium is bundled). 
Optimization suggestions: + +* Ensure only packages in `dependencies` are bundled, and keep dev dependencies in `devDependencies` +* Use Vite tree-shaking to reduce JavaScript size +* If using local models, consider downloading models on first launch instead of bundling them into the installer + +| Configuration | Estimated Size | +|------|---------| +| Pure Electron app (no model) | ~150-200 MB | +| + whisper tiny model | ~250 MB | +| + whisper large-v3-turbo model | ~1.7 GB | + +## 6.3 Cross-Platform Notes + +**macOS:** +* Publishing to App Store or distributing to others requires **code signing** (Apple Developer ID, $99/year) +* Also requires Apple's **Notarization** process +* Microphone permissions must declare `NSMicrophoneUsageDescription` in `Info.plist` +* Recommend building a Universal Binary to support both Intel and Apple Silicon + +**Windows:** +* Code signing is recommended, otherwise Windows SmartScreen will show security warnings +* Users can still choose "Run anyway" for unsigned apps + +**Linux:** +* No code signing required +* Recommended to provide both `.deb` and `.AppImage` formats + +> **Tip**: For personal projects or small-scale distribution, you can temporarily skip code signing and directly share packaged files with friends. + +# Chapter 7: Final Notes + +Congratulations! You have built a cross-platform speech-to-text desktop app from scratch. Let's recap what we did: + +1. Used Electron Forge to scaffold a cross-platform desktop app +2. Understood main process, renderer process, and IPC communication +3. Implemented microphone recording and audio capture +4. Integrated two speech recognition options: cloud Whisper API and local whisper.cpp +5. Learned how to package and distribute an Electron app + +What makes Electron powerful is that you can build desktop apps at the level of VS Code or Slack using a web-tech stack. 
And with mature AI speech recognition, a feature like speech-to-text, once requiring a specialized team, can now be built by one person. + +**Advanced directions:** + +* **Real-time subtitles**: Use AudioWorklet for streaming audio and pair with streaming recognition APIs for live transcription +* **Meeting assistant**: Record full meetings, auto-generate timestamped transcripts, and summarize key points with AI +* **Multilingual translation**: Transcribe speech and call translation APIs for real-time language conversion +* **Voice notebook**: Combine with a local database (such as SQLite) to build searchable voice notes + +***Just speak, and let code record everything for you.*** + +# References + +* [Electron Official Docs](https://www.electronjs.org/docs/latest/) +* [Electron Forge Official Docs](https://www.electronforge.io/) +* [OpenAI Whisper API Docs](https://platform.openai.com/docs/guides/speech-to-text) +* [whisper.cpp GitHub Repository](https://github.com/ggml-org/whisper.cpp) +* [nodejs-whisper npm Package](https://www.npmjs.com/package/nodejs-whisper) +* [MDN MediaDevices.getUserMedia()](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia) diff --git a/docs/en/stage-3/cross-platform/3.11-nft-minting/index.md b/docs/en/stage-3/cross-platform/3.11-nft-minting/index.md new file mode 100644 index 0000000..45119a3 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.11-nft-minting/index.md @@ -0,0 +1,361 @@ +# How to Quickly Build and Mint an NFT: 10-Minute Starter Edition + +# Chapter 1: What NFTs and Smart Contracts Are + +In this tutorial, we will complete a full closed loop: write an NFT smart contract from scratch, deploy it to the Ethereum testnet, mint your own NFT, and view it on OpenSea. The whole process uses browser-based tools with no local environment setup required, and can be finished in 10 minutes. 
+ +For this tutorial, you should at least have: + +- Chrome browser (with MetaMask wallet extension installed) +- A MetaMask wallet account +- A small amount of Sepolia testnet ETH (free to claim, shown below) + +> **Zero cost, zero setup**: the entire process uses browser-based tools (Remix IDE), no Node.js / Hardhat installation needed; code uses OpenZeppelin official secure templates; after minting, you can view your NFT on OpenSea testnet. + +## 1.1 What Is an NFT? + +NFT (Non-Fungible Token) is a type of digital asset on blockchain. Unlike fungible tokens such as Bitcoin or Ether, every NFT is unique, like no two paintings in the world being exactly the same. + +You can understand an NFT as a **"certificate of collection in the digital world."** It can represent: + +* ownership of a digital artwork +* an event ticket +* a game item +* a learning certificate +* even a tweet + +The core value of NFTs is: **they use blockchain technology to prove "this digital item belongs to you," and that proof is public, transparent, and tamper-resistant.** + + + +## 1.2 What Is a Smart Contract? + +A smart contract is a piece of code that runs on blockchain. You can think of it as an **"automatically executed contract"**. Once deployed on-chain, it runs automatically according to code logic, and no one can tamper with it. + +NFTs are created and managed through smart contracts. When you "mint" an NFT, you are actually calling a function in the smart contract to write on-chain: "NFT #0 belongs to your wallet address." + +We will use **Solidity** to write the contract. Do not worry. With ready-made templates from OpenZeppelin, you only need to write fewer than 15 lines of code. + +## 1.3 What NFT Are We Minting? + +We will mint a **"Vibe Coder Learning Certificate"** NFT to prove you completed this tutorial and learned blockchain development basics. 
This NFT will: + +* have a unique token ID +* be recorded on Ethereum Sepolia testnet +* be viewable and displayable on OpenSea testnet +* (optional) include your custom image + +Of course, you can change it to any theme you like: AI-generated artwork, event souvenir card, pixel avatar, and more. The NFT content is fully up to you. + +## 1.4 Why Use a Testnet? + +Ethereum has "mainnet" and "testnet": + +| Comparison | Mainnet | Testnet (Sepolia) | +|------|----------------|------------------| +| ETH value | Real money | Free to claim, no real value | +| Deployment cost | Requires real gas fees | Completely free | +| Use case | Production release | Learning, testing, development | +| Functional difference | None | Same as mainnet | + +Testnet and mainnet are functionally the same. The only difference is that testnet ETH has no real value. So you can safely learn and experiment on testnet without worrying about spending money. + +## 1.5 Tutorial Roadmap + +We will complete the flow in these steps: + +1. **Prepare wallet and test ETH** (2 minutes): install MetaMask and claim free test ETH +2. **Write and deploy contract** (4 minutes): write NFT contract in Remix IDE and deploy to Sepolia +3. **Mint NFT and check result** (4 minutes): call contract to mint NFT and verify on OpenSea and Etherscan +4. **Advanced: add image to NFT** (optional): store image on IPFS to make NFT complete + +# Chapter 2: Prepare Wallet and Test ETH (2 Minutes) + +## 2.1 Install MetaMask Wallet + +MetaMask is the most popular Ethereum wallet. It is a browser extension that lets you interact with blockchain apps. + +1. Open Chrome and visit [MetaMask official site](https://metamask.io/) +2. Click **"Download"** and install the Chrome extension +3. After installation, click the MetaMask fox icon in the top-right corner +4. Choose **"Create a new wallet"** and set a password +5. **Important**: keep your recovery phrase (12 words) safe. 
Losing a test wallet is fine, but good habits matter + + + +## 2.2 Switch to Sepolia Testnet + +MetaMask connects to Ethereum mainnet by default. We need to switch to Sepolia testnet: + +1. Click the network dropdown at the top of MetaMask (default: "Ethereum Mainnet") +2. Click **"Show test networks"** +3. Select **"Sepolia test network"** + +If you do not see Sepolia, click **"Add network"** and add manually: + +| Config Item | Value | +|-------|-----| +| Network Name | Sepolia test network | +| RPC URL | `https://rpc.sepolia.org` | +| Chain ID | 11155111 | +| Currency Symbol | SepoliaETH | +| Block Explorer | `https://sepolia.etherscan.io` | + + + +## 2.3 Claim Free Test ETH + +Deploying contracts and minting NFTs requires gas fees. On testnet, gas is paid with test ETH, which is free. + +Visit any faucet below and input your wallet address to claim free Sepolia ETH: + +| Faucet | URL | Per-claim Amount | Login Required | +|--------|------|-----------|------------| +| QuickNode | `https://faucet.quicknode.com/ethereum/sepolia` | 0.1 ETH | Yes | +| Alchemy | `https://www.alchemy.com/faucets/ethereum-sepolia` | 0.1 ETH | Yes | +| Google Cloud | `https://cloud.google.com/application/web3/faucet/ethereum/sepolia` | 0.05 ETH | Yes (Google account) | + +> **Tip**: 0.1 test ETH is enough for deploying a contract and minting dozens of NFTs. If one faucet fails, try another. + +After claiming successfully, return to MetaMask and your balance should change from 0 to 0.1 ETH (it may take a few seconds). + + + +# Chapter 3: Write and Deploy NFT Smart Contract (4 Minutes) + +## 3.1 Open Remix IDE + +Remix is the official Ethereum-recommended online smart contract development environment. It runs fully in the browser and requires no installation. + +Open: **https://remix.ethereum.org/** + +You will see a VS Code-like interface: file explorer on the left, code editor in the middle, and compile/deploy panel on the right. + + + +## 3.2 Create Contract File + +1. 
In the left file explorer, click the **"contracts"** folder +2. Click the **"+"** button above to create a new file +3. Name it **`MySimpleNFT.sol`** +4. Paste the code below: + +```solidity +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.20; + +// Import OpenZeppelin official secure ERC721 template +import "@openzeppelin/contracts/token/ERC721/ERC721.sol"; + +// Simplest NFT contract: name, symbol, mint function only +contract MySimpleNFT is ERC721 { + uint256 private _tokenId; + + // Initialize collection name and symbol + constructor() ERC721("VibeCoder", "VIBE") {} + + // Mint NFT: call once to mint one token to caller + function mint() public { + _safeMint(msg.sender, _tokenId); + _tokenId++; + } +} +``` + +**Code walkthrough (fewer than 15 lines, and each line is understandable):** + +| Code | Meaning | +|------|------| +| `pragma solidity ^0.8.20` | Specify Solidity compiler version | +| `import "@openzeppelin/..."` | Import OpenZeppelin ERC721 standard implementation (security-audited template) | +| `contract MySimpleNFT is ERC721` | Create a contract inheriting ERC721 standard | +| `ERC721("VibeCoder", "VIBE")` | Set collection name "VibeCoder" and symbol "VIBE" | +| `_safeMint(msg.sender, _tokenId)` | Mint a new NFT to caller | +| `_tokenId++` | Increment token ID after each mint | + +> **What is ERC721?** It is the NFT standard on Ethereum, defining basic NFT capabilities (transfer, owner query, etc.). OpenZeppelin provides a security-audited implementation, so we can inherit directly instead of building from scratch. + + + +## 3.3 Compile the Contract + +1. Click **"Solidity Compiler"** in the left panel (hammer icon) +2. Select compiler version **0.8.20** (or higher in 0.8.x) +3. Click **"Compile MySimpleNFT.sol"** +4. A green check ✅ means compilation succeeded + +> If there is an error, check whether Solidity version matches and OpenZeppelin import path is correct. Remix automatically downloads OpenZeppelin dependencies from npm. 
+ + + +## 3.4 Deploy Contract to Sepolia Testnet + +1. Click **"Deploy & Run Transactions"** in the left panel (Ethereum icon) +2. Set **Environment** to **"Injected Provider - MetaMask"** + - This auto-connects your MetaMask wallet + - MetaMask will pop up a connection request, click **"Connect"** +3. Confirm network is **Sepolia (11155111)** +4. Select **MySimpleNFT** in Contract dropdown +5. Click **"Deploy"** +6. MetaMask pops up transaction confirmation, click **"Confirm"** (gas is very low; testnet is free) + +After a few seconds, when deployment succeeds, the **"Deployed Contracts"** section below will show your contract address. **Copy and save this address**; you will need it later. + + + +# Chapter 4: Mint NFT and Verify Result (4 Minutes) + +## 4.1 Mint Your First NFT + +After successful deployment, in the **"Deployed Contracts"** section in Remix, you will see the contract interaction panel. + +1. Expand the contract panel and find the **"mint"** button (orange) +2. Click **"mint"** directly (no input parameters required) +3. MetaMask pops up transaction confirmation, click **"Confirm"** +4. Wait a few seconds for completion + +Congratulations! You just minted NFT #0, and it now belongs to your wallet address. + +You can continue clicking "mint" to create more. Token IDs auto-increment each time (#1, #2, #3...). + + + +## 4.2 Verify Mint Result + +**Method 1: Verify in Remix** + +In the contract panel, find **"balanceOf"** (blue button), input your wallet address, and call it. If it returns `1` (or the number you minted), minting succeeded. + +You can also call **"ownerOf"**, input `0` (token ID), and it returns your wallet address, proving NFT #0 belongs to you. + +**Method 2: Verify on Etherscan (recommended)** + +1. Open [Sepolia Etherscan](https://sepolia.etherscan.io/) +2. Paste your **contract address** into search +3. You will see the contract details page with all transaction records +4. 
Click **"Token Tracker"** to view all NFTs minted by your contract + +On Etherscan, every mint transaction has complete records: who minted, when minted, and token ID. This is the charm of blockchain being "public, transparent, and tamper-resistant." + + + +# Chapter 5: Advanced - Add an Image to NFT (Optional) + +The NFTs minted so far only have IDs, without image or description. To make NFTs complete, we need **IPFS (InterPlanetary File System)** to store images and metadata. + +## 5.1 What Is IPFS? + +IPFS is a decentralized file storage network. Unlike regular cloud storage, files on IPFS do not depend on one server, but are distributed across global nodes. This means: + +* files are not lost if one server goes down +* file content is uniquely identified by hashes and cannot be tampered with +* it is ideal for storing NFT images and metadata + +## 5.2 Upload Image to Pinata + +[Pinata](https://pinata.cloud/) is the most popular IPFS storage service. The free tier provides 1GB storage, which is enough for us. + +1. Visit https://pinata.cloud/ and register a free account +2. After login, click **"Upload"** -> **"File"** +3. Select the image you want as NFT artwork (AI-generated image is fine, or any image) +4. After upload succeeds, copy the **CID** (a string like `QmXyz...`) + +Your image URI is: `ipfs://yourCID` + + + +## 5.3 Create Metadata JSON + +NFT metadata is a JSON file describing NFT name, description, and image URI. Create a `metadata.json`: + +```json +{ + "name": "Vibe Coder Certificate #0", + "description": "This NFT certifies that the holder has completed the NFT minting tutorial and entered the world of Web3.", + "image": "ipfs://your-image-cid", + "attributes": [ + { "trait_type": "Course", "value": "Easy Vibe" }, + { "trait_type": "Skill", "value": "Smart Contract" }, + { "trait_type": "Level", "value": "Beginner" } + ] +} +``` + +Upload `metadata.json` to Pinata too, and get a metadata CID. 
+ +## 5.4 Upgrade Contract to Support Images + +To include images in NFTs, we need to slightly upgrade the contract by adding `tokenURI`. Go back to Remix and create a new file `MyNFTWithImage.sol`: + +```solidity +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.20; + +import "@openzeppelin/contracts/token/ERC721/ERC721.sol"; +import "@openzeppelin/contracts/token/ERC721/extensions/ERC721URIStorage.sol"; + +contract MyNFTWithImage is ERC721, ERC721URIStorage { + uint256 private _tokenId; + + constructor() ERC721("VibeCoder", "VIBE") {} + + // Pass metadata URI when minting + function mint(string memory uri) public { + _safeMint(msg.sender, _tokenId); + _setTokenURI(_tokenId, uri); + _tokenId++; + } + + // Overrides required by Solidity + function tokenURI(uint256 tokenId) + public view override(ERC721, ERC721URIStorage) + returns (string memory) + { + return super.tokenURI(tokenId); + } + + function supportsInterface(bytes4 interfaceId) + public view override(ERC721, ERC721URIStorage) + returns (bool) + { + return super.supportsInterface(interfaceId); + } +} +``` + +After deployment, call `mint` and pass your metadata URI (for example `ipfs://QmAbc.../metadata.json`). Then your minted NFT will include image and description. + + + +# Chapter 6: Final Notes + +Congratulations! You have completed a full NFT development loop from scratch. Let's recap: + +1. Understood core concepts of NFTs and smart contracts +2. Installed MetaMask and switched to Sepolia testnet +3. Wrote an NFT smart contract with fewer than 15 lines in Remix IDE +4. Deployed the contract to Ethereum testnet +5. Minted your own NFT and verified it on Etherscan +6. (Optional) Learned how to add image and metadata with IPFS + +The whole process required no local environment installation, cost no money, and was completed fully in the browser. This is the appeal of blockchain development: the barrier is much lower than most people expect. 
+ +**Advanced directions:** + +* **Use Hardhat / Foundry for local development**: when contract logic becomes complex, Remix is not enough. Hardhat and Foundry are professional local frameworks with automated testing, script-based deployment, gas optimization, and more +* **Add whitelist and mint limits**: control who can mint, max mints per wallet, mint price, and similar rules +* **Build a mint frontend**: use React + ethers.js / viem to build a polished mint page for one-click web minting +* **Explore ERC1155 multi-edition NFTs**: ERC1155 allows multiple copies under one token ID, useful for game items and tickets +* **Deploy to mainnet**: when ready, deploy to Ethereum mainnet (or L2 chains like Polygon or Base with lower gas fees) + +***Your first NFT is already on-chain. The door to the blockchain world is now open.*** + +# References + +* [OpenZeppelin ERC721 Docs](https://docs.openzeppelin.com/contracts/5.x/erc721) +* [Remix IDE Official Docs](https://remix-ide.readthedocs.io/) +* [MetaMask Official Docs](https://docs.metamask.io/) +* [Solidity Official Docs](https://docs.soliditylang.org/) +* [Sepolia Etherscan](https://sepolia.etherscan.io/) +* [Pinata IPFS Storage Service](https://pinata.cloud/) +* [ERC721 Standard Spec (EIP-721)](https://eips.ethereum.org/EIPS/eip-721) diff --git a/docs/en/stage-3/cross-platform/3.12-vscode-extension/index.md b/docs/en/stage-3/cross-platform/3.12-vscode-extension/index.md new file mode 100644 index 0000000..6032a14 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.12-vscode-extension/index.md @@ -0,0 +1,891 @@ +# How to Build a VS Code Extension: Create Your AI Project Assistant + +# Chapter 1: What VS Code Extension Development Is + +In this tutorial, we will complete a full closed loop: build a VS Code extension from scratch that acts as your AI project assistant, with one-click project template generation, AI chat on selected files or code snippets, multi-file Q&A analysis, and custom shortcuts. 
You will complete development, debugging, and learn how to publish to the VS Code Marketplace. + +For this tutorial, you should at least have: + +- Node.js environment (version 18.0+) +- VS Code editor (version 1.90+) +- Your AI coding assistant (Cursor / Trae / Claude Code) +- (Optional) GitHub Copilot subscription (for Language Model API) + +> **Vibe Coding end-to-end**: we will use an AI coding assistant to generate most code. You only need to understand core concepts and architecture, then describe requirements in natural language. + +## 1.1 What Can VS Code Extensions Do? + +You already use VS Code extensions daily. Prettier formats your code, GitLens shows Git history, and GitHub Copilot helps you write code. These extensions are essentially programs written in TypeScript/JavaScript that extend the editor through VS Code APIs. + +VS Code extensions can do much more than many people expect: + +* **Add new UI elements**: sidebar panels, status bar info, custom Webview pages +* **Handle files and code**: read, modify, and create files; analyze code structure +* **Integrate external services**: call APIs, connect databases, integrate CI/CD +* **Extend editor capabilities**: custom language support, code completion, diagnostics +* **Add AI capabilities**: create AI assistants with Chat Participant API, call models with Language Model API + + + +## 1.2 Core Architecture of a VS Code Extension + +A VS Code extension runs in an isolated **Extension Host** process, separate from the editor main process. This means even if an extension crashes, the editor itself is not affected. + +A typical extension has these core parts: + +* **package.json (manifest)**: extension "ID card," declaring name, entry file, contribution points (`commands`, `menus`, `keybindings`, etc.) 
+* **extension.ts (entry file)**: extension "brain," exporting `activate()` and `deactivate()` +* **Contribution Points**: what your extension contributes to VS Code in package.json (commands, menu items, keybindings, views, etc.) +* **VS Code API**: the TypeScript API set used to operate editor capabilities + +```text +VS Code editor + │ + ├── Extension Host (extension process) + │ ├── Your extension + │ │ ├── package.json -> declares "what I can do" + │ │ ├── extension.ts -> implements "how to do it" + │ │ └── other modules -> concrete feature code + │ ├── Other extension A + │ └── Other extension B + │ + └── Editor main process (UI rendering) +``` + + + +## 1.3 What Extension Are We Building? + +We will build a VS Code extension named **"AI Project Bot"**, an AI project assistant with the following features: + +| Feature | Description | +|------|------| +| Project templates | Sidebar list of templates, one-click project scaffold generation | +| AI chat | `@project-bot` participant in VS Code Chat for project Q&A | +| File/snippet chat | Right-click selected code or file and send to AI for analysis/explanation/refactoring | +| Multi-file Q&A | Multi-select files in explorer and ask AI to analyze relationships and logic | +| Shortcuts | Custom keybindings to trigger common actions quickly | + + + +## 1.4 Tutorial Roadmap + +We will complete the flow in these steps: + +1. **Create extension project** (3 minutes): scaffold project and understand core files +2. **Implement project templates** (5 minutes): use TreeView to show templates in sidebar and generate projects +3. **Implement AI Chat participant** (5 minutes): create `@project-bot` via Chat Participant API +4. **Implement file/snippet chat and multi-file Q&A** (5 minutes): right-click menus + multi-select analysis +5. **Add shortcuts and UX polish** (3 minutes): keybindings and status bar hints +6. 
**Publish to marketplace** (optional): package and submit + +# Chapter 2: Create the Extension Project (3 Minutes) + +## 2.1 Generate Project with Scaffold + +VS Code officially provides a Yeoman scaffold tool. Ask AI to run: + +```text +Please help me install VS Code extension scaffolding tools and create a project: +1. Install Yeoman and generator-code: npm install -g yo generator-code +2. Run yo code and choose: + - Type: New Extension (TypeScript) + - Name: ai-project-bot + - Identifier: ai-project-bot + - Description: AI project assistant - template generation, intelligent chat, multi-file Q&A + - Package manager: npm +3. Enter project directory and install dependencies +``` + +Generated structure: + +```text +ai-project-bot/ +├── .vscode/ +│ ├── launch.json # Debug config (F5 starts debugging) +│ └── tasks.json # Build tasks +├── src/ +│ └── extension.ts # Extension entry file +├── package.json # Extension manifest (most important file) +├── tsconfig.json # TypeScript config +└── vsc-extension-quickstart.md # Quick start guide (can be removed) +``` + +## 2.2 Understand package.json: The Extension "ID Card" + +`package.json` is the core file of a VS Code extension. 
Besides normal npm fields, it has `contributes` to declare everything your extension contributes to VS Code: + +```json +{ + "name": "ai-project-bot", + "displayName": "AI Project Bot", + "description": "AI project assistant - template generation, intelligent chat, multi-file Q&A", + "version": "0.0.1", + "engines": { "vscode": "^1.90.0" }, + "activationEvents": [], + "main": "./out/extension.js", + "contributes": { + "commands": [], + "menus": {}, + "keybindings": [], + "viewsContainers": {}, + "views": {}, + "chatParticipants": [] + } +} +``` + +**Key fields:** + +| Field | Purpose | +|------|------| +| `engines.vscode` | Minimum supported VS Code version | +| `activationEvents` | When extension activates (empty means on-demand activation) | +| `main` | Path to compiled entry file | +| `contributes` | All contributed features (commands, menus, keybindings, views, etc.) | + + + +## 2.3 Understand extension.ts: The Extension "Brain" + +Open `src/extension.ts` and you will see two core functions: + +```typescript +import * as vscode from 'vscode' + +// Called when extension is activated (first command execution, opening specific files, etc.) +export function activate(context: vscode.ExtensionContext) { + console.log('AI Project Bot activated!') + + // Register commands, views, chat participants, etc. 
+ const disposable = vscode.commands.registerCommand( + 'ai-project-bot.helloWorld', + () => { + vscode.window.showInformationMessage('Hello from AI Project Bot!') + } + ) + + context.subscriptions.push(disposable) +} + +// Called when extension is deactivated (for example when VS Code closes) +export function deactivate() {} +``` + +**Core concepts:** + +* `activate(context)`: extension initialization, register all capabilities here +* `context.subscriptions`: an auto-cleanup list; VS Code disposes registered items on deactivation +* `vscode.commands.registerCommand`: register command callable from command palette (`Ctrl+Shift+P`) + +## 2.4 Start Debugging + +Press **F5**, and VS Code opens a new **Extension Development Host** window. This is a fresh VS Code instance with your extension loaded. + +In the new window, press **Ctrl+Shift+P**, type "Hello World," and you will see a message popup. This means your extension is running. + + + +> **Debug tip**: after code changes, in Extension Development Host press **Ctrl+Shift+P** -> **Developer: Reload Window** to reload extension quickly. + +# Chapter 3: Implement Project Templates (5 Minutes) + +## 3.1 Design Template System + +We want to add a "Project Templates" panel in VS Code sidebar where users can browse templates and generate project skeletons with one click. This uses VS Code **TreeView API**. + +Ask AI to implement: + +```text +Please help me implement project templates in ai-project-bot: + +1. Add contribution points in package.json: + - Add a new viewsContainers.activitybar item with id "project-bot", title "AI Project Bot" + - Add a view under it with id "projectTemplates", name "Project Templates" + - Add command "ai-project-bot.createFromTemplate", title "Create Project from Template" + +2. 
Create src/templates/templateProvider.ts: + - Implement TreeDataProvider with template categories and templates: + - Frontend: React + TypeScript, Vue 3 + TypeScript, Next.js App + - Backend: Express API, FastAPI Python + - Full-stack: T3 Stack (Next.js + tRPC + Prisma) + - Each template item shows name, description, and icon + +3. Create src/templates/scaffolder.ts: + - Implement createProjectFromTemplate function + - Let users choose target folder + - Generate project structure by template type +``` + +## 3.2 Declare View in package.json + +First add sidebar view contributions in `package.json`: + +```json +{ + "contributes": { + "viewsContainers": { + "activitybar": [ + { + "id": "project-bot", + "title": "AI Project Bot", + "icon": "resources/bot-icon.svg" + } + ] + }, + "views": { + "project-bot": [ + { + "id": "projectTemplates", + "name": "Project Templates" + } + ] + }, + "commands": [ + { + "command": "ai-project-bot.createFromTemplate", + "title": "Create Project from Template", + "icon": "$(add)" + } + ], + "menus": { + "view/title": [ + { + "command": "ai-project-bot.createFromTemplate", + "when": "view == projectTemplates", + "group": "navigation" + } + ] + } + } +} +``` + +This config does three things: + +1. Adds an "AI Project Bot" icon entry in the activity bar +2. Creates a "Project Templates" view under that entry +3. Adds a "+" button in the view title bar for project creation + + + +## 3.3 Implement TreeDataProvider + +TreeDataProvider is the interface VS Code uses to fill tree data. We need `getTreeItem` (display info for one node) and `getChildren` (child node list). 
+ +Core code: + +```typescript +// src/templates/templateProvider.ts +import * as vscode from 'vscode' + +interface Template { + name: string + description: string + category: string + command: string // command to generate project, for example "npx create-react-app" +} + +export const TEMPLATES: Template[] = [ + { name: 'React + TypeScript', description: 'React project built with Vite', category: 'Frontend', command: 'npm create vite@latest {{name}} -- --template react-ts' }, + { name: 'Vue 3 + TypeScript', description: 'Vue 3 project built with Vite', category: 'Frontend', command: 'npm create vite@latest {{name}} -- --template vue-ts' }, + { name: 'Next.js App', description: 'Next.js App Router full-stack project', category: 'Frontend', command: 'npx create-next-app@latest {{name}} --typescript --app' }, + { name: 'Express API', description: 'Express + TypeScript REST API', category: 'Backend', command: 'npx create-express-api {{name}}' }, + { name: 'FastAPI Python', description: 'Python FastAPI backend project', category: 'Backend', command: 'pip install fastapi uvicorn' }, +] + +// Tree node: category or template +class TemplateItem extends vscode.TreeItem { + constructor( + public readonly label: string, + public readonly collapsibleState: vscode.TreeItemCollapsibleState, + public readonly template?: Template + ) { + super(label, collapsibleState) + if (template) { + this.description = template.description + this.tooltip = `${template.name}\n${template.description}\nCommand: ${template.command}` + this.contextValue = 'template' + this.command = { + command: 'ai-project-bot.createFromTemplate', + title: 'Create Project', + arguments: [template] + } + } + } +} + +export class TemplateProvider implements vscode.TreeDataProvider<TemplateItem> { + getTreeItem(element: TemplateItem): vscode.TreeItem { + return element + } + + getChildren(element?: TemplateItem): TemplateItem[] { + if (!element) { + // Root: return category list + const categories = [...new Set(TEMPLATES.map(t => 
t.category))] + return categories.map( + cat => new TemplateItem(cat, vscode.TreeItemCollapsibleState.Expanded) + ) + } + // Children: templates in category + return TEMPLATES + .filter(t => t.category === element.label) + .map(t => new TemplateItem(t.name, vscode.TreeItemCollapsibleState.None, t)) + } +} +``` + +## 3.4 Register View and Create Command + +Register TreeView and project creation command in `extension.ts`: + +```typescript +// src/extension.ts +import { TemplateProvider, TEMPLATES } from './templates/templateProvider' + +export function activate(context: vscode.ExtensionContext) { + // Register template view + const templateProvider = new TemplateProvider() + vscode.window.registerTreeDataProvider('projectTemplates', templateProvider) + + // Register create project command + const createCmd = vscode.commands.registerCommand( + 'ai-project-bot.createFromTemplate', + async (template) => { + if (!template) { + // If no template passed (called from command palette), let user pick + const pick = await vscode.window.showQuickPick( + TEMPLATES.map(t => ({ label: t.name, description: t.description, template: t })), + { placeHolder: 'Choose a project template' } + ) + if (!pick) return + template = pick.template + } + + // Ask for project name + const name = await vscode.window.showInputBox({ + prompt: 'Enter project name', + placeHolder: 'my-awesome-project' + }) + if (!name) return + + // Ask for target folder + const folder = await vscode.window.showOpenDialog({ + canSelectFolders: true, + openLabel: 'Select target folder' + }) + if (!folder) return + + // Execute creation command + const terminal = vscode.window.createTerminal('AI Project Bot') + terminal.show() + const cmd = template.command.replace('{{name}}', name) + terminal.sendText(`cd "${folder[0].fsPath}" && ${cmd}`) + + vscode.window.showInformationMessage(`Creating ${template.name} project: ${name}`) + } + ) + + context.subscriptions.push(createCmd) +} +``` + +Now press F5 for debugging. 
You will see AI Project Bot in activity bar. Expand template list and click any template to create a project. + + + +# Chapter 4: Implement AI Chat Participant (5 Minutes) + +## 4.1 What Is Chat Participant API? + +Starting from VS Code 1.90, extensions can create their own AI assistant in Chat panel using **Chat Participant API**. If user inputs `@project-bot help me analyze this project architecture`, your extension receives the message and returns model-generated response. + +Core concepts: + +* **Participant**: your assistant identity in Chat panel, invoked with `@name` +* **Slash Commands**: quick commands supported by participant, such as `/explain`, `/refactor` +* **Language Model API**: call built-in models in VS Code (for example Copilot GPT-4o) +* **Stream**: progressively output responses through `stream.markdown()` + +## 4.2 Declare Chat Participant in package.json + +Add this in `contributes`: + +```json +{ + "contributes": { + "chatParticipants": [ + { + "id": "ai-project-bot.projectBot", + "name": "project-bot", + "fullName": "AI Project Bot", + "description": "Your AI project assistant for code analysis, architecture explanation, and solution generation", + "isSticky": true + } + ] + } +} +``` + +`isSticky: true` means once selected, follow-up messages go to this participant by default, without typing `@project-bot` each time. + +## 4.3 Implement Chat Participant Handler + +Ask AI to write core logic: + +```text +Please help me create src/chat/chatParticipant.ts and implement Chat Participant: +1. Register participant "ai-project-bot.projectBot" +2. Support three slash commands: + - /explain: explain selected code or current file + - /refactor: provide refactoring suggestions + - /template: recommend suitable tech stack templates +3. Use Language Model API with VS Code built-in model +4. 
Return response in streaming mode (stream.markdown) +``` + +Core code: + +```typescript +// src/chat/chatParticipant.ts +import * as vscode from 'vscode' + +export function registerChatParticipant(context: vscode.ExtensionContext) { + const participant = vscode.chat.createChatParticipant( + 'ai-project-bot.projectBot', + async (request, chatContext, stream, token) => { + // Select available model + const models = await vscode.lm.selectChatModels({ family: 'gpt-4o' }) + const model = models[0] + + if (!model) { + stream.markdown('No language model available. Please make sure GitHub Copilot is installed.') + return + } + + // Build system prompt by slash command + let systemPrompt = 'You are a professional project development assistant.' + + if (request.command === 'explain') { + systemPrompt = 'You are a code explanation expert. Please explain user code in concise Chinese, including purpose, logic flow, and key design decisions.' + } else if (request.command === 'refactor') { + systemPrompt = 'You are a code refactoring expert. Analyze user code and provide specific refactoring suggestions with improved code examples.' + } else if (request.command === 'template') { + systemPrompt = 'You are a tech stack selection expert. Recommend suitable tech stacks and project templates based on user requirements.' 
+ } + + // Build messages + const messages = [ + vscode.LanguageModelChatMessage.User(systemPrompt), + vscode.LanguageModelChatMessage.User(request.prompt) + ] + + // Stream output + const response = await model.sendRequest(messages, {}, token) + for await (const chunk of response.text) { + stream.markdown(chunk) + } + + return { metadata: { command: request.command || '' } } + } + ) + + // Register slash commands + participant.slashCommandProvider = { + provideSlashCommands: () => [ + { name: 'explain', description: 'Explain code function and logic' }, + { name: 'refactor', description: 'Provide refactoring suggestions and improvements' }, + { name: 'template', description: 'Recommend suitable project templates and tech stacks' } + ] + } + + // Register follow-up suggestions + participant.followupProvider = { + provideFollowups: (result) => { + if (result.metadata?.command === 'explain') { + return [ + { prompt: 'Can you draw a flowchart?', label: 'Generate flowchart' }, + { prompt: 'Any potential bugs here?', label: 'Check potential issues' } + ] + } + return [] + } + } + + context.subscriptions.push(participant) +} +``` + +Call registration in `extension.ts`: + +```typescript +import { registerChatParticipant } from './chat/chatParticipant' + +export function activate(context: vscode.ExtensionContext) { + // ... previous template registration code ... + registerChatParticipant(context) +} +``` + +Now input `@project-bot /explain what does this code do?` in Chat panel, and your extension will call model and generate explanation. + + + +# Chapter 5: File/Snippet Chat and Multi-file Q&A (5 Minutes) + +## 5.1 Right-click Menu: Send Selected Code to AI + +We want users to select code in editor and send it to AI from context menu. This uses VS Code **Context Menu** contribution points. 
+ +Add in `package.json`: + +```json +{ + "contributes": { + "commands": [ + { + "command": "ai-project-bot.explainSelection", + "title": "AI: Explain Selected Code" + }, + { + "command": "ai-project-bot.refactorSelection", + "title": "AI: Refactor Selected Code" + } + ], + "menus": { + "editor/context": [ + { + "command": "ai-project-bot.explainSelection", + "when": "editorHasSelection", + "group": "ai-project-bot@1" + }, + { + "command": "ai-project-bot.refactorSelection", + "when": "editorHasSelection", + "group": "ai-project-bot@2" + } + ] + } + } +} +``` + +**Key config notes:** + +* `when: "editorHasSelection"`: show menu only when text is selected +* `group: "ai-project-bot@1"`: menu grouping and order (`@1`, `@2`) + +## 5.2 Implement Selected-code Analysis + +```typescript +// src/commands/selectionCommands.ts +import * as vscode from 'vscode' + +export function registerSelectionCommands(context: vscode.ExtensionContext) { + // Explain selected code + const explainCmd = vscode.commands.registerCommand( + 'ai-project-bot.explainSelection', + async () => { + const editor = vscode.window.activeTextEditor + if (!editor) return + + const selection = editor.selection + const selectedText = editor.document.getText(selection) + const fileName = editor.document.fileName.split('/').pop() + const startLine = selection.start.line + 1 + const endLine = selection.end.line + 1 + + // Build prompt with context + const prompt = [ + `Please explain the following code (from ${fileName}, lines ${startLine}-${endLine}):`, + '```', + selectedText, + '```', + 'Please explain: 1) what this code does 2) core logic 3) possible improvements' + ].join('\n') + + // Call Language Model API + const models = await vscode.lm.selectChatModels({ family: 'gpt-4o' }) + if (!models.length) { + vscode.window.showErrorMessage('No language model available') + return + } + + // Show results in output panel + const outputChannel = vscode.window.createOutputChannel('AI Project Bot') + 
outputChannel.show() + outputChannel.appendLine(`\n--- Code Explanation (${fileName}:${startLine}-${endLine}) ---\n`) + + const messages = [ + vscode.LanguageModelChatMessage.User(prompt) + ] + const response = await models[0].sendRequest(messages, {}) + for await (const chunk of response.text) { + outputChannel.append(chunk) + } + } + ) + + context.subscriptions.push(explainCmd) +} +``` + + + +## 5.3 Multi-file Q&A: Batch Analyze File Relationships + +This is one of the most powerful features: multi-select files in explorer and let AI analyze relationship and logic in one click. + +Add explorer context menu in `package.json`: + +```json +{ + "contributes": { + "commands": [ + { + "command": "ai-project-bot.analyzeFiles", + "title": "AI: Analyze Relationships of Selected Files" + } + ], + "menus": { + "explorer/context": [ + { + "command": "ai-project-bot.analyzeFiles", + "when": "!explorerResourceIsFolder", + "group": "ai-project-bot" + } + ] + } + } +} +``` + +Implement multi-file analysis command: + +```typescript +// src/commands/multiFileAnalysis.ts +import * as vscode from 'vscode' + +export function registerMultiFileCommands(context: vscode.ExtensionContext) { + const analyzeCmd = vscode.commands.registerCommand( + 'ai-project-bot.analyzeFiles', + async (clickedFile: vscode.Uri, selectedFiles: vscode.Uri[]) => { + // selectedFiles contains all selected files + const files = selectedFiles || [clickedFile] + + if (files.length < 2) { + vscode.window.showWarningMessage('Please select at least 2 files for analysis') + return + } + + // Read all selected files + const fileContents: string[] = [] + for (const file of files) { + const content = await vscode.workspace.fs.readFile(file) + const fileName = vscode.workspace.asRelativePath(file) + fileContents.push( + `--- ${fileName} ---\n${Buffer.from(content).toString('utf8')}` + ) + } + + const prompt = [ + `Please analyze relationships among these ${files.length} files:`, + '', + ...fileContents, + '', + 'Please 
explain:', + '1. Responsibilities of each file', + '2. Dependency/call relationships among them', + '3. Data flow (if any)', + '4. Architectural suggestions or potential issues' + ].join('\n') + + // Call model and show result + const models = await vscode.lm.selectChatModels({ family: 'gpt-4o' }) + if (!models.length) { + vscode.window.showErrorMessage('No language model available') + return + } + + const outputChannel = vscode.window.createOutputChannel('AI Project Bot') + outputChannel.show() + outputChannel.appendLine(`\n--- Multi-file Analysis (${files.length} files) ---\n`) + + const messages = [ + vscode.LanguageModelChatMessage.User(prompt) + ] + const response = await models[0].sendRequest(messages, {}) + for await (const chunk of response.text) { + outputChannel.append(chunk) + } + } + ) + + context.subscriptions.push(analyzeCmd) +} +``` + +Usage: in explorer, hold `Ctrl` (`Cmd` on Mac) to multi-select files, right-click and choose "AI: Analyze Relationships of Selected Files." AI reads all selected files and returns analysis. + + + +# Chapter 6: Shortcuts and UX Optimization (3 Minutes) + +## 6.1 Custom Keybindings + +Shortcuts are key to efficiency. 
Add in `package.json`: + +```json +{ + "contributes": { + "keybindings": [ + { + "command": "ai-project-bot.explainSelection", + "key": "ctrl+shift+e", + "mac": "cmd+shift+e", + "when": "editorTextFocus && editorHasSelection" + }, + { + "command": "ai-project-bot.refactorSelection", + "key": "ctrl+shift+r", + "mac": "cmd+shift+r", + "when": "editorTextFocus && editorHasSelection" + }, + { + "command": "ai-project-bot.createFromTemplate", + "key": "ctrl+shift+n", + "mac": "cmd+shift+n", + "when": "" + } + ] + } +} +``` + +**`when` conditions:** + +| Condition | Meaning | +|------|------| +| `editorTextFocus` | Cursor is in editor | +| `editorHasSelection` | Some text is selected | +| `explorerViewletVisible` | Explorer panel is visible | +| `!editorReadonly` | File is not read-only | + +Multiple conditions connected by `&&` mean all must be satisfied. + +## 6.2 Status Bar Hint + +Add a quick status bar entry so users always know extension is running: + +```typescript +// src/statusBar.ts +import * as vscode from 'vscode' + +export function createStatusBarItem(context: vscode.ExtensionContext) { + const statusBar = vscode.window.createStatusBarItem( + vscode.StatusBarAlignment.Right, + 100 + ) + statusBar.text = '$(hubot) AI Bot' + statusBar.tooltip = 'Click to open AI Project Bot' + statusBar.command = 'ai-project-bot.createFromTemplate' + statusBar.show() + + context.subscriptions.push(statusBar) +} +``` + +`$(hubot)` is VS Code built-in icon syntax. You can find all icons in [Codicon library](https://microsoft.github.io/vscode-codicons/dist/codicon.html). + + + +# Chapter 7: Publish to Marketplace (Optional) + +## 7.1 Prepare for Publishing + +VS Code extensions are packaged and published with **vsce**: + +```text +Please help me install vsce: npm install -g @vscode/vsce +``` + +Before publishing, prepare: + +1. **Azure DevOps account**: register and create an organization at [dev.azure.com](https://dev.azure.com/) +2. 
**Personal Access Token (PAT)**: create one in Azure DevOps with the **Marketplace -> Manage** permission +3. **Publisher ID**: create a publisher identity in [VS Code Marketplace](https://marketplace.visualstudio.com/manage) + +## 7.2 Improve package.json Metadata + +Add metadata before publishing: + +```json +{ + "publisher": "your-publisher-id", + "repository": { + "type": "git", + "url": "https://github.com/yourname/ai-project-bot" + }, + "categories": ["AI", "Other"], + "keywords": ["ai", "project", "template", "chat"], + "icon": "resources/icon.png", + "galleryBanner": { + "color": "#1e1e2e", + "theme": "dark" + } +} +``` + +You also need a `README.md` for marketplace description and a `CHANGELOG.md` for version history. + +## 7.3 Package and Publish + +```bash +# Package to .vsix (manual install file) +vsce package + +# Publish to marketplace +vsce publish +``` + +After packaging, you get `ai-project-bot-0.0.1.vsix`. You can send this file to friends and they can install via VS Code "Install from VSIX." + +For official marketplace publishing, run `vsce publish`; the extension usually appears within minutes. + + + +> **Tip**: the first release may require review. Make sure the README is clear and screenshots are complete to speed up approval. + +# Chapter 8: Final Notes + +Congratulations! You have built a fully functional VS Code extension from scratch. Recap: + +1. Created extension project with Yeoman scaffold and understood roles of `package.json` and `extension.ts` +2. Implemented sidebar project template list with TreeView API and one-click project creation +3. Created `@project-bot` AI assistant with Chat Participant API, including slash commands and streaming responses +4. Implemented right-click code selection analysis +5. Implemented multi-file relationship analysis +6. Added custom shortcuts and status bar hint + +VS Code extension development leaves enormous room for imagination. The tech behind the useful extensions you use every day is exactly what you just learned.
+ +**Advanced directions:** + +* **Custom Webview panels**: build fully custom UI with HTML/CSS/JS, such as visual architecture graphs and interactive code review interfaces +* **Language Model Tools**: register custom tools callable by AI, such as querying database or executing API requests +* **Diagnostics and CodeLens**: show AI suggestions, performance hints, and security warnings inline +* **Custom language support**: provide syntax highlighting, completion, and diagnostics for DSLs or specific config formats +* **Remote development integration**: make extension work in SSH, containers, and WSL + +***Your editor, your rules.*** + +# References + +* [VS Code Extension API Docs](https://code.visualstudio.com/api) +* [Chat Participant API Guide](https://code.visualstudio.com/api/extension-guides/chat) +* [Language Model API Guide](https://code.visualstudio.com/api/extension-guides/language-model) +* [TreeView API Guide](https://code.visualstudio.com/api/extension-guides/tree-view) +* [Webview API Guide](https://code.visualstudio.com/api/extension-guides/webview) +* [VS Code Extension Publishing Guide](https://code.visualstudio.com/api/working-with-extensions/publishing-extension) +* [Codicon Icon Library](https://microsoft.github.io/vscode-codicons/dist/codicon.html) diff --git a/docs/en/stage-3/cross-platform/3.13-qt-industrial-hmi/index.md b/docs/en/stage-3/cross-platform/3.13-qt-industrial-hmi/index.md new file mode 100644 index 0000000..5304ff4 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.13-qt-industrial-hmi/index.md @@ -0,0 +1,693 @@ +# How to Build an Industrial Qt Desktop App: Pump Monitoring HMI System + +# Chapter 1: What Industrial HMI and Qt Development Are + +In this tutorial, we will complete a full closed loop: build an industrial-grade pump monitoring HMI (Human-Machine Interface) system from scratch with Qt. It can read sensor data in real time, draw pressure trend charts, trigger automatic over-threshold alarms, and record fault logs. 
The whole process uses free simulation software on a PC instead of real industrial hardware. + +For this tutorial, you should at least have: + +- A computer (Windows or Mac, Windows recommended for better industrial software compatibility) +- Qt 6.5 development environment (Qt Creator + Qt Serial Bus + Qt Charts modules) +- Modbus Slave simulation software (free download, works as a "virtual pump") +- Your AI coding assistant (Cursor / Trae / Claude Code) + +> **Zero hardware, zero cost**: use free PC simulation software (Modbus Slave) as the lower-level device; no need to buy hardware. Use official Qt `QModbusTcpClient` + Qt Charts modules directly, no manual protocol parsing needed. After running, you will see real-time pressure trends, over-threshold alarm popups, and fault logs, matching real factory workflow. + +## 1.1 What Are Upper Computer and Lower Computer? + +In industrial automation, there are two concepts you must understand: **upper computer** and **lower computer**. + +**Lower Computer**: the "hands and feet" on-site + +The lower computer is the controller that directly interacts with physical devices. In factories, it is usually a **PLC (Programmable Logic Controller)** or **sensor**, responsible for: + +* reading field data (temperature, pressure, flow, liquid level, etc.) +* controlling device actions (start pump, close valve, adjust speed, etc.) +* running predefined logic automatically (for example stop pump when pressure exceeds threshold) + +You can think of the lower computer as a "worker" on the factory floor. It does not need complex thinking, but must execute tasks reliably. + +**Upper Computer**: the "eyes and brain" in the control room + +The upper computer is monitoring software running on PC or industrial computer, which is the **HMI (Human-Machine Interface)** we will build today. 
It is responsible for: + +* displaying field data in real time (numbers, charts, animations) +* recording historical data and alarm logs +* enabling remote control for operators +* providing data analysis and reports + +You can think of the upper computer as the factory's "monitoring center." Operators can understand plant status from the screen. + +**How do they communicate?** + +Upper and lower computers exchange data through **industrial communication protocols**. The most common one is **Modbus**, a "veteran" protocol born in 1979. It is still widely used because it is simple, reliable, and supported by almost all industrial devices. + +```text +Control room Factory site +┌──────────┐ Modbus protocol ┌──────────┐ +│ Upper │ ◄──────────────────► │ Lower │ +│ computer │ "Tell me pressure" │ computer │ +│ (Qt HMI) │ "Pressure is 1.20MPa"│ (PLC/Sensor) +│ Display │ │ Read data│ +│ Log data │ │ Control │ +│ Alarms │ │ Protect │ +└──────────┘ └──────────┘ +``` + + + +## 1.2 What Is Modbus Protocol? + +Modbus is the "common language" of industrial communication. It defines how upper and lower computers "talk." + +**Only two core concepts:** + +* **Register**: data "cells" in the lower computer. Each has an address (`0`, `1`, `2`, ...), storing a number. For example, address `0` stores pressure and address `1` stores temperature. +* **Read/Write operations**: upper computer can read registers (get data) or write registers (send control commands). + +**Two common Modbus variants:** + +| Variant | Transport | Typical Scenario | +|------|---------|---------| +| Modbus RTU | Serial (RS-485/RS-232) | Short distance, direct device connection | +| Modbus TCP | Ethernet (TCP/IP) | Long distance, network communication | + +This tutorial uses **Modbus TCP**. Since it is network-based, upper-computer app and lower-computer simulator can run on the same machine with no physical wiring. + +## 1.3 Why Choose Qt? + +Qt is a top framework choice for industrial software. 
Many monitoring interfaces in factories, hospitals, and transportation systems are built with Qt. The reasons are simple: + +| Advantage | Explanation | +|------|------| +| Cross-platform | One codebase compiles to Windows, Linux, and embedded devices | +| Built-in industrial protocol support | Qt Serial Bus supports Modbus natively, no third-party library required | +| Powerful charting | Qt Charts provides professional real-time charts | +| High performance | C++ foundation suitable for real-time data refresh | +| Mature and stable | 30-year history, proven in industrial domain | + +## 1.4 What Are We Building? + +We will build a **Pump Monitoring HMI System** simulating real factory pump pressure monitoring: + +| Function | Description | +|------|------| +| Real-time data reading | Read pressure from lower computer every second | +| Pressure trend chart | Line chart for last 60 seconds of pressure | +| Over-threshold alarm | Popup warning and red UI when pressure exceeds threshold | +| Fault log | Record all alarm events in database for history queries | +| Manual control | One-click start/stop pump (write lower-computer register) | + + + +## 1.5 Tutorial Roadmap + +We will complete the flow in these steps: + +1. **Prepare environment and simulated lower computer** (2 minutes): install Qt 6.5 and Modbus Slave simulator +2. **Create Qt project and connect Modbus** (3 minutes): establish communication between upper app and simulator +3. **Implement real-time read and display** (3 minutes): timed pressure reads and UI updates +4. **Draw real-time pressure trend chart** (3 minutes): dynamic line chart with Qt Charts +5. **Implement alarm and fault logs** (3 minutes): over-threshold alarm + SQLite logging +6. **Package and deploy** (optional): package app into standalone executable + +# Chapter 2: Prepare Environment and Simulated Lower Computer (2 Minutes) + +## 2.1 Install Qt 6.5 + +Qt provides a free open-source version, enough for this tutorial. + +1. 
Visit [Qt official site](https://www.qt.io/download-qt-installer) and download Qt Online Installer +2. Run installer, log in or register Qt account (free) +3. In component selection, check: + - **Qt 6.5.x** (or newer) + - **Qt Serial Bus** under **Additional Libraries** (Modbus support) + - **Qt Charts** under **Additional Libraries** (chart rendering) + - **Qt Creator** (IDE, usually selected by default) +4. Click install and wait + +> **Tip**: if Qt is already installed but missing Serial Bus or Charts, rerun Qt Maintenance Tool and add components. + + + +## 2.2 Install Modbus Slave: Your "Virtual Pump" + +Modbus Slave is a free Modbus slave simulator. It can simulate an industrial device (PLC/sensor) on your computer so your upper app has something to communicate with. + +1. Visit [modbustools.com](https://www.modbustools.com/modbus_slave.html) and download Modbus Slave +2. Install and open it +3. Configure connection: + - Menu **Connection -> Connect** + - Choose **Modbus TCP/IP** + - IP address: `127.0.0.1` (localhost) + - Port: `502` (default Modbus TCP port) + - Click **OK** to listen + +4. Set simulated data: + - You will see a register table, each row is a register address (`0`, `1`, `2`, ...) + - Double-click value at address **0**, change to **120** (means pressure 1.20 MPa, divided by 100 in app) + - Double-click value at address **1**, change to **350** (means temperature 35.0°C) + - Double-click value at address **2**, change to **1** (pump state: `1=running`, `0=stopped`) + +Now Modbus Slave is your "24/7 virtual pump." Keep the window open; it will continuously respond to read/write requests. + + + +> **Dynamic simulation tip**: Modbus Slave supports auto increment/random changes. Right-click register value and choose "Auto increment" or "Random" to simulate realistic sensor fluctuations. + +# Chapter 3: Create Qt Project and Connect Modbus (3 Minutes) + +## 3.1 Create New Qt Project + +Open Qt Creator and create a new project: + +1. 
Click **File -> New Project** +2. Choose **Application (Qt) -> Qt Widgets Application** +3. Project name: **PumpHMI** +4. Select installed Qt 6.5 kit +5. Finish creation + +Open `PumpHMI.pro` (or `CMakeLists.txt` if using CMake), and add key modules: + +```pro +QT += core gui widgets serialbus charts sql +``` + +| Module | Purpose | +|------|------| +| `serialbus` | Provides `QModbusTcpClient` for Modbus TCP communication | +| `charts` | Provides `QChart`, `QLineSeries` for real-time trend chart | +| `sql` | Provides `QSqlDatabase` for SQLite fault logs | + +If using CMake, equivalent config: + +```cmake +find_package(Qt6 REQUIRED COMPONENTS Widgets SerialBus Charts Sql) +target_link_libraries(PumpHMI PRIVATE + Qt6::Widgets Qt6::SerialBus Qt6::Charts Qt6::Sql) +``` + +## 3.2 Declare Core Members + +Ask AI to generate header file: + +```text +Please help me write mainwindow.h with core members for pump monitoring HMI: +1. QModbusTcpClient for Modbus TCP communication +2. QTimer for timed data reading +3. QChart + QLineSeries for real-time trend chart +4. QSqlDatabase for fault log storage +5. 
UI elements: pressure label, status indicator, start/stop button, log table +``` + +Core header: + +```cpp +// mainwindow.h +#ifndef MAINWINDOW_H +#define MAINWINDOW_H + +#include <QMainWindow> +#include <QModbusTcpClient> +#include <QModbusDataUnit> +#include <QTimer> +#include <QtCharts> +#include <QSqlDatabase> +#include <QLabel> +#include <QPushButton> +#include <QTableWidget> + +class MainWindow : public QMainWindow { + Q_OBJECT + +public: + explicit MainWindow(QWidget *parent = nullptr); + ~MainWindow(); + +private slots: + void connectModbus(); // connect lower computer + void readPressure(); // timed pressure read + void onReadReady(); // read callback + void triggerAlarm(float v); // trigger alarm + void togglePump(); // start/stop pump + +private: + // Modbus communication + QModbusTcpClient *m_modbusClient = nullptr; + QTimer *m_pollTimer = nullptr; + + // Real-time chart + QChart *m_chart = nullptr; + QLineSeries *m_series = nullptr; + QDateTimeAxis *m_axisX = nullptr; + QValueAxis *m_axisY = nullptr; + + // Database + QSqlDatabase m_db; + + // UI elements + QLabel *m_pressureLabel = nullptr; // pressure display + QLabel *m_statusLight = nullptr; // status indicator + QPushButton *m_pumpButton = nullptr; // start/stop button + QTableWidget *m_logTable = nullptr; // log table + + // Alarm threshold + float m_alarmThreshold = 1.50f; // alarm above 1.50 MPa + bool m_pumpRunning = false; + + void setupUI(); + void setupDatabase(); + void logAlarm(float pressure, const QString &message); +}; + +#endif // MAINWINDOW_H +``` + + + +## 3.3 Build Modbus TCP Connection + +Implement connection logic in `mainwindow.cpp`: + +```cpp +// mainwindow.cpp - connection section +void MainWindow::connectModbus() +{ + m_modbusClient = new QModbusTcpClient(this); + + // Connect to Modbus Slave simulator + m_modbusClient->setConnectionParameter( + QModbusDevice::NetworkPortParameter, 502); + m_modbusClient->setConnectionParameter( + QModbusDevice::NetworkAddressParameter, "127.0.0.1"); + m_modbusClient->setTimeout(1000); // 1s timeout + m_modbusClient->setNumberOfRetries(3); // retry 3 times + + if
(!m_modbusClient->connectDevice()) { + statusBar()->showMessage("Failed to connect lower computer!", 3000); + return; + } + + statusBar()->showMessage("Connected to lower computer (127.0.0.1:502)", 3000); + + // Start timer, read once per second + m_pollTimer = new QTimer(this); + connect(m_pollTimer, &QTimer::timeout, this, &MainWindow::readPressure); + m_pollTimer->start(1000); // 1000ms = 1s +} +``` + +**Code notes:** + +| Code | Meaning | +|------|------| +| `QModbusTcpClient` | Built-in Qt Modbus TCP client, communicates with lower computer | +| `NetworkPortParameter, 502` | Connect to port `502` (same as Modbus Slave config) | +| `NetworkAddressParameter, "127.0.0.1"` | Connect localhost (simulator runs locally) | +| `m_pollTimer->start(1000)` | Call `readPressure()` every second | + +## 3.4 Read Pressure Data + +```cpp +// mainwindow.cpp - reading section +void MainWindow::readPressure() +{ + if (!m_modbusClient || m_modbusClient->state() != QModbusDevice::ConnectedState) + return; + + // Build read request: start at address 0, read 3 holding registers + QModbusDataUnit readUnit( + QModbusDataUnit::HoldingRegisters, // register type + 0, // start address + 3 // quantity + ); + + // Send async read request + if (auto *reply = m_modbusClient->sendReadRequest(readUnit, 1)) { + if (!reply->isFinished()) { + connect(reply, &QModbusReply::finished, + this, &MainWindow::onReadReady); + } else { + delete reply; // broadcast request, delete directly + } + } +} + +void MainWindow::onReadReady() +{ + auto *reply = qobject_cast<QModbusReply *>(sender()); + if (!reply) return; + + if (reply->error() == QModbusDevice::NoError) { + const QModbusDataUnit unit = reply->result(); + + // Parse values (divide register value for real units) + float pressure = unit.value(0) / 100.0f; // addr 0: pressure (MPa) + float temperature = unit.value(1) / 10.0f; // addr 1: temperature (°C) + int pumpStatus = unit.value(2); // addr 2: pump state + + // Update UI + m_pressureLabel->setText( + QString("%1
MPa").arg(pressure, 0, 'f', 2)); + + // Check alarm + if (pressure > m_alarmThreshold) { + triggerAlarm(pressure); + } + + // Update trend chart (implemented next chapter) + // updateChart(pressure); + + } else { + statusBar()->showMessage( + QString("Read failed: %1").arg(reply->errorString()), 2000); + } + + reply->deleteLater(); +} +``` + +**Modbus reading flow:** + +```text +readPressure() triggered by timer + -> Build QModbusDataUnit ("read addresses 0-2") + -> sendReadRequest() async send (UI not blocked) + -> lower computer returns data + -> onReadReady() triggered + -> parse register values and update UI +``` + + + +# Chapter 4: Draw Real-time Pressure Trend (3 Minutes) + +## 4.1 Initialize Chart + +Qt Charts provides professional chart components. Ask AI to initialize in constructor: + +```text +Please help me initialize Qt Charts real-time line chart in MainWindow constructor: +1. Create QChart and QLineSeries +2. X axis uses QDateTimeAxis, showing latest 60 seconds +3. Y axis uses QValueAxis, range 0-3.0 MPa +4. Line color blue, width 2px +5. 
Place chart into QChartView and add to layout +``` + +Core code: + +```cpp +// mainwindow.cpp - chart initialization +void MainWindow::setupChart() +{ + m_series = new QLineSeries(); + m_series->setName("Pressure (MPa)"); + m_series->setPen(QPen(QColor("#2196F3"), 2)); + + m_chart = new QChart(); + m_chart->addSeries(m_series); + m_chart->setTitle("Real-time Pressure Trend"); + m_chart->setAnimationOptions(QChart::NoAnimation); // no animation for real-time data + + // X axis: time + m_axisX = new QDateTimeAxis(); + m_axisX->setFormat("HH:mm:ss"); + m_axisX->setTitleText("Time"); + m_chart->addAxis(m_axisX, Qt::AlignBottom); + m_series->attachAxis(m_axisX); + + // Y axis: pressure + m_axisY = new QValueAxis(); + m_axisY->setRange(0, 3.0); + m_axisY->setTitleText("Pressure (MPa)"); + m_axisY->setLabelFormat("%.1f"); + m_chart->addAxis(m_axisY, Qt::AlignLeft); + m_series->attachAxis(m_axisY); + + // Create chart view + QChartView *chartView = new QChartView(m_chart); + chartView->setRenderHint(QPainter::Antialiasing); + + // Add to layout (assuming existing centralLayout) + centralLayout->addWidget(chartView); +} +``` + +## 4.2 Update Chart in Real Time + +Whenever a new pressure value is read, append one point and keep only latest 60 seconds: + +```cpp +// mainwindow.cpp - chart updates +void MainWindow::updateChart(float pressure) +{ + QDateTime now = QDateTime::currentDateTime(); + + // Append new point + m_series->append(now.toMSecsSinceEpoch(), pressure); + + // Keep only latest 60s data + QDateTime cutoff = now.addSecs(-60); + while (m_series->count() > 0 && + m_series->at(0).x() < cutoff.toMSecsSinceEpoch()) { + m_series->remove(0); + } + + // Update X axis range: always show latest 60s + m_axisX->setRange(cutoff, now); +} +``` + +Then call it in `onReadReady()`: + +```cpp +// Add after pressure parsing in onReadReady(): +updateChart(pressure); +``` + +Now run the program. 
You will see a blue line updating in real time, one point per second, always showing the latest 60 seconds. If you modify register values in Modbus Slave manually, the line reflects changes immediately. + + + +> **Performance tip**: `QChart::NoAnimation` is important. Real-time data refreshes every second; animations can cause UI lag. This is a common industrial HMI practice. + +# Chapter 5: Alarm System and Fault Logs (3 Minutes) + +## 5.1 Over-threshold Alarm + +When pressure exceeds the threshold, we need three things: a red UI warning, a popup alert, and a log record. Note that the reset branch at the end of this function only takes effect if `triggerAlarm()` is also called for normal readings, whereas `onReadReady()` above calls it only when pressure is over the threshold. + +```cpp +// mainwindow.cpp - alarm logic +void MainWindow::triggerAlarm(float pressure) +{ + // Turn UI red + m_pressureLabel->setStyleSheet( + "color: white; background-color: #F44336;" + "font-size: 32px; padding: 10px; border-radius: 8px;"); + + // Status indicator red + m_statusLight->setStyleSheet( + "background-color: #F44336; border-radius: 12px;" + "min-width: 24px; min-height: 24px;"); + + // Popup alarm (only first time crossing threshold to avoid repeated popups) + static bool alarmActive = false; + if (!alarmActive) { + alarmActive = true; + QMessageBox::warning(this, "Pressure Alarm", + QString("Current pressure %1 MPa exceeds threshold %2 MPa!\nPlease check pump status immediately.") + .arg(pressure, 0, 'f', 2) + .arg(m_alarmThreshold, 0, 'f', 2)); + } + + // Record to DB + logAlarm(pressure, + QString("Pressure over threshold: %1 MPa > %2 MPa") + .arg(pressure, 0, 'f', 2) + .arg(m_alarmThreshold, 0, 'f', 2)); + + // Reset when pressure returns to normal + if (pressure <= m_alarmThreshold) { + alarmActive = false; + m_pressureLabel->setStyleSheet( + "color: #2196F3; font-size: 32px; padding: 10px;"); + m_statusLight->setStyleSheet( + "background-color: #4CAF50; border-radius: 12px;" + "min-width: 24px; min-height: 24px;"); + } +} +``` + + + +## 5.2 SQLite Fault Logs + +Industrial systems must log all alarm events for traceability.
We use SQLite: + +```cpp +// mainwindow.cpp - database initialization +void MainWindow::setupDatabase() +{ + m_db = QSqlDatabase::addDatabase("QSQLITE"); + m_db.setDatabaseName("pump_alarm_log.db"); + + if (!m_db.open()) { + qWarning() << "Cannot open database:" << m_db.lastError().text(); + return; + } + + // Create alarm table + QSqlQuery query; + query.exec( + "CREATE TABLE IF NOT EXISTS alarm_log (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " timestamp DATETIME DEFAULT CURRENT_TIMESTAMP," + " pressure REAL," + " message TEXT" + ")" + ); +} +``` + +## 5.3 Log and Display Records + +```cpp +// mainwindow.cpp - write logs +void MainWindow::logAlarm(float pressure, const QString &message) +{ + // Write to DB + QSqlQuery query; + query.prepare( + "INSERT INTO alarm_log (pressure, message) VALUES (?, ?)"); + query.addBindValue(pressure); + query.addBindValue(message); + query.exec(); + + // Update on-screen table + int row = m_logTable->rowCount(); + m_logTable->insertRow(row); + m_logTable->setItem(row, 0, + new QTableWidgetItem( + QDateTime::currentDateTime().toString("yyyy-MM-dd HH:mm:ss"))); + m_logTable->setItem(row, 1, + new QTableWidgetItem(QString::number(pressure, 'f', 2))); + m_logTable->setItem(row, 2, + new QTableWidgetItem(message)); + + // Auto-scroll to latest row + m_logTable->scrollToBottom(); +} +``` + +Log table has three columns: time, pressure value, and alarm message. Each alarm appends one row and is persisted to SQLite. + + + +## 5.4 Manually Start/Stop Pump + +Besides reading data, upper computer should control lower computer too. 
We do this by writing register values: + +```cpp +// mainwindow.cpp - pump control +void MainWindow::togglePump() +{ + if (!m_modbusClient || m_modbusClient->state() != QModbusDevice::ConnectedState) + return; + + m_pumpRunning = !m_pumpRunning; + + // Build write request: write 1 (start) or 0 (stop) to address 2 + QModbusDataUnit writeUnit( + QModbusDataUnit::HoldingRegisters, 2, 1); + writeUnit.setValue(0, m_pumpRunning ? 1 : 0); + + if (auto *reply = m_modbusClient->sendWriteRequest(writeUnit, 1)) { + connect(reply, &QModbusReply::finished, this, [this, reply]() { + if (reply->error() == QModbusDevice::NoError) { + m_pumpButton->setText(m_pumpRunning ? "Stop Pump" : "Start Pump"); + m_pumpButton->setStyleSheet(m_pumpRunning + ? "background-color: #F44336; color: white; padding: 12px;" + : "background-color: #4CAF50; color: white; padding: 12px;"); + statusBar()->showMessage( + m_pumpRunning ? "Pump started" : "Pump stopped", 2000); + } + reply->deleteLater(); + }); + } +} +``` + +In Modbus Slave, you will see address `2` switching between `0` and `1` as you click the button. This is the upper-computer "control" process. + + + +# Chapter 6: Packaging and Deployment (Optional) + +## 6.1 Package with windeployqt / macdeployqt + +Qt provides official deployment tools to collect required dynamic libraries automatically. + +**Windows:** + +```bash +# Build Release first, then run in build directory: +windeployqt PumpHMI.exe +``` + +`windeployqt` copies Qt DLLs, plugins, translation files, etc. next to the executable. That packaged folder can be sent directly. + +**macOS:** + +```bash +macdeployqt PumpHMI.app -dmg +``` + +This generates a `.dmg` installer image. + +## 6.2 Build Installer with Qt Installer Framework + +If you want a professional setup wizard ("Next -> Next -> Finish"), use Qt Installer Framework: + +```text +Please help me create an installer for PumpHMI with Qt Installer Framework: +1. Create installer directory structure (config, packages) +2. 
Configure config.xml (installer name, version, target directory) +3. Put windeployqt output files into packages/com.example.pumphmi/data/ +4. Run binarycreator to generate installer +``` + + + +# Chapter 7: Final Notes + +Congratulations! You have built an industrial-grade pump monitoring HMI system from scratch. Recap: + +1. Understood core concepts of upper computer, lower computer, and Modbus protocol +2. Simulated a "virtual pump" with Modbus Slave, with no real hardware +3. Built upper-lower communication using Qt `QModbusTcpClient` +4. Drew real-time rolling pressure trend chart with Qt Charts +5. Implemented over-threshold popup alarms and SQLite fault logs +6. Implemented remote start/stop pump control + +The whole process used no real industrial hardware, but the architecture and functions match real factory HMI systems. If you replace Modbus Slave with a real PLC, this app can be used in production scenarios directly. + +**Advanced directions:** + +* **Multi-device monitoring**: connect multiple lower computers and use tabs/split views for different device data +* **Historical playback**: read historical data from SQLite and replay trend charts with timeline controls +* **OPC UA protocol**: Modbus fits simpler scenarios; complex industrial systems often use OPC UA, also supported by Qt (Qt OPC UA module) +* **Web remote monitoring**: use Qt WebSocket to push real-time data to browser for mobile viewing +* **AI predictive maintenance**: feed historical pressure data to ML models to predict failures in advance + +***Use code to protect every device in industrial operations.*** + +# References + +* [Qt Serial Bus Docs](https://doc.qt.io/qt-6/qtserialbus-index.html) +* [Qt Modbus TCP Client Example](https://doc.qt.io/qt-6/qtserialbus-modbus-client-example.html) +* [Qt Charts Docs](https://doc.qt.io/qt-6/qtcharts-index.html) +* [Modbus Protocol Specs](https://modbus.org/specs.php) +* [Modbus Slave Simulator](https://www.modbustools.com/modbus_slave.html) +* 
[Qt Installer Framework Docs](https://doc.qt.io/qtinstallerframework/) diff --git a/docs/en/stage-3/cross-platform/3.3-wechat-miniprogram/index.md b/docs/en/stage-3/cross-platform/3.3-wechat-miniprogram/index.md new file mode 100644 index 0000000..719822b --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.3-wechat-miniprogram/index.md @@ -0,0 +1,470 @@ +# How to Build the Simplest WeChat Mini Program + +# 1. What WeChat Mini Programs and Mini Program Development Are + +In this tutorial, we will complete a full closed loop: from an idea in your mind to a real mini program that can be searched and opened by QR code inside WeChat. + +Before we start building, we need to establish two basic understandings. + +The first is **essence**: what exactly is a WeChat mini program? How is it different from a normal app or website? Why do so many products choose this format? Only when you understand the core logic can you judge whether your idea fits a mini program. + +The second is **path**: when you say "I want to build a mini program," what does the full path from zero to launch look like? What are the key nodes on that path - what to think about during ideation, how to set up environment, how AI-assisted development improves efficiency, what pitfalls appear in simulator debugging, and what test accounts vs formal release each solve. If you run through this process mentally first, you will not get lost during implementation. + +After these two questions are clear, we can formally enter development. Let us start with the first question: what exactly is a WeChat mini program? + +## 1.1 WeChat Mini Program + +A WeChat mini program can be seen as an app living inside WeChat. You do not need to search in an app store, download, or install. Users can search by name in WeChat, scan a QR code, or open a shared card and use it immediately. After use, they just close it. It does not permanently occupy phone home screen or storage. 
+ +For regular users, mini programs solve many "small tasks": checking delivery, ordering coffee, viewing orders, playing a quick game. Fast startup and unified entry inside WeChat are its biggest experience traits. + +For companies and developers, mini programs are a searchable and shareable "small app format." As long as you register on WeChat Official Platform, complete settings, and pass review, your mini program can open to all WeChat users. Compared with traditional apps, it is easier to get the first batch of users because people are already used to doing many tasks in WeChat. + +In this tutorial, we will not build a complex business system. We choose a classic example: Snake game. It is small and logically clear, yet includes the complete elements a mini program should have: multiple pages, simple interactions, state changes, score recording, etc. It is perfect as your first project. + +## 1.2 WeChat Mini Program Development + +After understanding "what mini programs are," the next question is: what does developing one actually involve? + +You need a clear goal (for example, a Snake game users can play anytime), design the interface users will see, define what should happen under different actions, and finally publish it. + +In traditional development, programmers usually lead all these steps and write a lot of code. In AI-assisted development, this can be split more clearly: you explain what you want, and AI helps with most implementation details. That means for beginners, the most important skill is no longer memorizing syntax, but clearly describing requirements and understanding AI output. + +## 1.3 Several Ways to Develop WeChat Mini Programs + +In real projects, people use different technical routes. To avoid overwhelming you with terms at the beginning, we will only do a rough classification so you understand the common paths. + +The first way is using official native capabilities directly. 
After creating a project in WeChat DevTools, you will see a fixed set of file types used to describe page structure, styles, and logic. This way stays close to official docs and gives strong control, but for first-time frontend learners, the learning curve is a bit steeper. + +The second way is using cross-platform frameworks, such as uni-app. You mainly write web-like code locally (for example `.vue` files), and the framework converts this code to formats WeChat mini programs can run. The advantage is unified structure. If you later publish to other platforms (such as H5 or App), the required changes are relatively small. + +Building on these two methods, this tutorial focuses on a third approach: a mini program SOP (standard operating procedure) built around AI-assisted tools. For example, open the whole project in Trae and tell built-in AI directly: "Please add a homepage with title and button in this file" or "Please create a game page that shows snake and score." AI will generate new code snippets or modify/refactor existing code based on current project context. + +These three ways are not mutually exclusive. You can absolutely build in a uni-app project while using Trae AI for most coding work. The key is not picking one method, but knowing where you are now and what tools are available. + +## 1.4 WeChat Mini Program Steps Covered in This Article (High-level Preview) + +This tutorial follows a rhythm from **environment to final product**. Around the Snake example and Trae vibecoding style, we split the process into a reusable route. In later chapters, you will go through these stages: + +1. Build conceptual foundation: understand what mini programs are, what common development methods exist, and who this Snake mini program is for and in what scenarios it is used. +2. Prepare environment: register mini program account, install HBuilderX, Trae, and WeChat DevTools, then create a basic project skeleton with HBuilderX that can run in WeChat DevTools and show the simplest page first. +3.
Enter formal development: open project in Trae, use vibecoding dialog with AI to generate homepage and game page layout step by step, and implement core gameplay such as snake movement, eating food, and game over. +4. After core features run, learn to use AI as a "debugging and refactoring partner": ask it to diagnose bugs, tidy structure when code gets messy, and gradually add details such as start/pause, high-score record, and UI polishing. +5. Enter publishing: build project into WeChat-recognizable version, preview and test on real devices in WeChat DevTools, launch first with test account and experience version for process validation, then complete filing and review before formal release so others can search and play your mini program. + +This section only draws the full map and does not expand commands or code details yet. For now, remember these 5 steps: **Understand -> Setup environment -> Vibecoding development -> Debug and polish -> Build and release**. Later chapters will zoom into each step, showing what to prepare, what to say to AI, and what results you should see on screen at each stage. + +# 2. Environment Preparation + +Before writing any line of code, let us prepare the environment first. +The goal of this part is to make sure you no longer get stuck on **where to download tools and why things cannot run**, so you can focus directly on AI dialog and requirement implementation. + +If you can open a browser, download files, and double-click installers, you can complete this section. + +## 2.1 Three Tools Used in This Tutorial + +For Snake mini program development, we use three tools together, each with different responsibilities: + +1. The first is Trae. Think of it as an AI-integrated code editor. It can open project files like a normal IDE and also let you chat with AI in natural language to generate, modify, and explain code. Most "build mini program with AI" operations in this tutorial happen in Trae. 
Download latest version from https://www.trae.cn . +2. The second is HBuilderX. It has strong support for Vue and uni-app, and offers ready-made mini program templates. We use it to "one-click generate" a base mini program project - this is laying the foundation before handing it to Trae + AI for further iteration. Download from https://www.dcloud.io/hbuilderx.html . +3. The third is WeChat DevTools. This official tool is used to develop and preview mini programs. It runs your project on desktop and supports real-device debugging on mobile. Download from https://developers.weixin.qq.com/miniprogram/dev/devtools/download.html . + +In short: HBuilderX creates base project quickly, Trae helps you code with AI, and WeChat DevTools shows the actual running mini program. + +## 2.2 Register WeChat Official Platform Account and Get AppID + +With tools ready, you still need a **mini program identity**, which is created on WeChat Official Platform. +If you have never registered a mini program before, follow this order: + +1. Enter https://mp.weixin.qq.com in your browser, open WeChat Official Platform, and login by scanning QR code with WeChat. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image1.png) + +2. Choose "Mini Program" on homepage and complete registration prompts, including email, phone number, and entity type (individual or enterprise). + ![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image2.png) +3. After successful registration, enter backend, find "Development Management" or "Development Settings," and you will see a unique ID named AppID. This is your mini program identity and will be used in project config later. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image3.png) + +It is recommended to save AppID where easy to find. In later sections, we will fill this value directly to map local project to your online mini program. 
+ +## 2.3 Install WeChat DevTools + +Next we need a place to actually run and preview mini programs. That is exactly what WeChat DevTools is for. + +1. Visit download page https://developers.weixin.qq.com/miniprogram/dev/devtools/download.html . + On this page you will see versions for different operating systems. Usually choose the stable version matching your system, such as Windows 64-bit or macOS. +2. After download, double-click installer and follow wizard step by step. If unsure, keep default options. +3. After installation, launch WeChat DevTools from desktop or start menu. On first launch, it shows a QR code and asks you to scan with WeChat. Scan and authorize to enter main interface. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image4.png)![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image5.png) + +Later, after project files are ready in Trae, we will import the built mini program into WeChat DevTools and view real running results here. + +## 2.4 Prepare Trae and HBuilderX + +Finally, install the two tools used for actual coding: Trae and HBuilderX. + +You can **install Trae first**. Visit https://www.trae.cn in browser and download the right version for your OS. Installation is like normal software: double-click installer and follow prompts. After install, you get an IDE that can open local folders, inspect code, and chat with AI. All later vibecoding steps happen here. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image6.png) + +**Then install HBuilderX**. Visit https://www.dcloud.io/hbuilderx.html and download your OS package. HBuilderX is lightweight and starts quickly. After install, you can briefly look at interface; no need deep feature study now. In later chapters, we use it to create a uni-app mini program template as project starting point. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image7.png) + +After finishing this section, your environment is complete: you have a mini program account + AppID, a runtime preview tool, and an AI coding IDE. Next we start from **creating the first project skeleton** and make these tools really run. + +## 2.5 Prepare Base Files + +1. Click "New Project". + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image8.png) + +2. Choose default template, set mini program name, select storage path, then click create in lower-right corner: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image9.png) + +3. Creation success screen appears: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image10.png) + +4. Then find this folder in file system, open it in Trae, and you will see foundation files are all ready: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image11.png) + +# 3. Mini Program Development + +In the first two parts, we already clarified "what mini programs are" and "how to set up tools and environment." From this section, we enter hands-on practice: not just concepts, but AI actually helping you build Snake mini program from zero. + +In this section, you will walk through a complete SOP for the development phase, roughly including: + +1. Open current project in Trae and give AI your first complete instruction so it designs and implements a runnable Snake version based on current skeleton. +2. Let Trae modify real project files directly, not only output "example code," and learn to use rollback to restore previous state when needed. +3. Return to HBuilderX and WeChat DevTools, run to mini program simulator, and play this version in simulator to switch from "code perspective" to "user perspective." +4. 
Based on play results, keep proposing modifications in natural language and let AI iterate controls from button-based to joystick-based, while experiencing a full loop of "find issue -> describe issue -> AI fixes -> verify again." + +You can choose to design every page and button before development. +But for complete beginners, interface and interaction design itself is also a new domain (later we will show AI-assisted design). So in this round we intentionally use another way: start first - let AI generate a runnable version, then refine gradually by viewing effects and chatting in natural language. + +## 3.1 Explain Requirements Clearly in One Shot: Give Trae the First "Master Prompt" + +After opening prepared mini program project in Trae, I did not rush to edit a specific line. Instead, I told built-in AI assistant: + +**I gave AI a command: based on current framework, build a Snake mini program. Please design this mini program and write me a prompt.** + +In other words, I did not ask it to "write one function step by step." I first threw out a complete goal, let AI help plan, and AI not only planned but also directly landed the first implementation. + +After receiving this instruction, Trae reads current project structure, determines where to add pages and where to add logic, and directly modifies project files/code. You do not need to hand-write code or manually create/modify folders. + +## 3.2 Let AI Modify Real Code Automatically, Not Manual Coding + +When you execute this instruction in Trae, AI enters a "project editing" flow. During this process, you can observe key points: + +1. It explains its thinking in chat area, for example which directories it will add pages to and how it will organize game logic. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image12.png)![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image13.png) + +2. 
It directly edits real project files, instead of only giving "sample code" for copy-paste. +3. After finishing, Trae outputs a short summary telling you what files were changed and what was done. + +If you are not satisfied with this round (or think something is wrong), no need to panic. Trae provides rollback in the top-left outside chat box. You can restore project state before this instruction with one click - like a safety undo key. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image14.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image15.png) + +## 3.3 View Effects in HBuilderX and WeChat DevTools + +After AI completes the first development round, code has been written into project, but you still have not seen real player-side effect. +Next we need to run it. + +Specific operation: go back to HBuilderX, find top menu "Run," select "Run to Mini Program Simulator" -> "WeChat DevTools." This triggers project build and opens result in WeChat DevTools. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image16.png) + +The output panel at bottom shows build process. If final state is "ready" with no errors, build is successful. Then switch to WeChat DevTools to check UI and features of this version. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image17.png) + +In most cases, HBuilderX auto-opens WeChat DevTools and you can directly see updated mini program. If not auto-opened, do this: + +1. Stop current run in HBuilderX first. +2. Launch WeChat DevTools manually and keep it open. +3. Back in HBuilderX, click "Run -> Run to Mini Program Simulator -> WeChat DevTools" again. 
+ +Then you can see the vibecoding mini program in WeChat DevTools: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image18.png) + +## 3.4 Use Natural Language to Repeatedly Adjust Until Satisfied + +In this practice, AI initially generated a button-controlled Snake: four direction buttons on screen, and snake changes direction when clicked. It is fully playable, but I personally prefer joystick control. For your adjustment requests (not only features, but also UI design and layout; once experienced, you can even ask AI to integrate external model APIs or databases), again: you only need to describe requirements in natural language. + +This is the core advantage of vibecoding: you do not have to dig into code for event binding or coordinate logic. You directly tell AI what you want. For example, in Trae chat you can write: + +Replace buttons with joystick control. When user releases joystick, snake should keep moving in current direction until next joystick action. + +As long as requirement is clear, AI will automatically locate target files and modify control styles, interaction bindings, and direction handling logic. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image19.png) + +After modification, return to WeChat DevTools to check. +If changes are not visible immediately, click "Run" in DevTools or refresh preview window to apply latest build. If still not updated, stop run in HBuilderX and run to simulator again, then you can see updated mini program: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image20.png) + +## 3.5 What If Problems Appear: Keep Communicating in Natural Language + +AI-generated versions are not always perfect at first. You may encounter: + +- runtime errors and app fails to open; +- features mostly correct, but details differ from your expectation; +- UI usable but still not visually pleasing or convenient enough. 
+ +At these moments, no need to blindly edit code yourself. Describe problems directly to Trae AI assistant in natural language, for example: + +"Joystick control works now, but snake sometimes suddenly stops. Please check current implementation." +Or: "Game is playable now, but interface feels crowded. I want more vertical spacing on mobile. Please adjust layout." + +AI will use current project context + your description, then provide and apply code changes directly. If result becomes worse or direction is wrong, you can still rollback to previous stable version and try another wording. + +Through several such rounds, you can polish from "rough first version" to a joystick-based Snake closer to your preference. +For example, I gave a style reference image and asked AI to adjust UI style accordingly: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image21.png) + +## 3.6 Final Result and Section Summary + +After repeated rounds of **natural language description -> AI modification -> preview in WeChat DevTools -> continue micro-adjustment**, I finally got this result: + +- complete game page; +- snake moves smoothly and eats food; +- joystick control supported; +- runs correctly in mini program simulator. + +Final product examples: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image22.png)![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image23.png)![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image24.png) + +In this section, you have seen a complete closed loop: + +1. In Trae, one clear instruction let AI build first Snake mini program version; +2. With HBuilderX + WeChat DevTools, validate real effect from user perspective; +3. Keep proposing modifications in natural language, let AI handle feature and UI optimization; +4. When issues appear, use rollback + rerun to keep process safe. 
+ +Next, you can use same rhythm for your own ideas: not limited to Snake, but also utility mini programs, event pages, or real business prototypes. Your main task is to think clearly and describe clearly. Let AI and tools handle the rest. + +# 4. Mini Program Release + +In the previous three chapters, we completed the full flow from **environment setup** -> **AI-assisted development** -> **running Snake in local simulator**. + +From this chapter, the key question becomes: **how to really publish this work to WeChat, so it is not just a toy, but a usable mini program?** + +To reduce difficulty, we first take the **shortest closed loop**: publish only as a **test/experience version** for yourself and a few teammates. After function and experience are stable, then proceed to formal public release. + +This chapter first covers 4.1 to complete the shortest path for **experience-version launch**. Formal release for all users is explained in 4.2. + +## 4.1 Shortest SOP - Launch as Experience Version + +Goal of this subsection is only one thing: let you open your Snake mini program in WeChat as an **experience version**. + +The whole flow is four tasks: + +1. Find and confirm your AppID in WeChat Official Platform. +2. Configure this AppID in your project. +3. Upload current version in WeChat DevTools. +4. Return to Official Platform and set this uploaded version as "Experience Version." + +Let us go in this order. + +### 4.1.1 Confirm AppID in WeChat Official Platform + +First step: confirm your mini program AppID in WeChat Official Platform. + +You already did this once in **Section 2.2 (Environment Preparation)**. Here we use it for real. + +1. Visit `https://mp.weixin.qq.com` and log into your mini program backend. +2. Find "Development Management" in left menu, then enter "Development Settings." +3. At top, find "Developer ID" area. There is a line "AppID (Mini Program ID)" - this is your unique ID. + +This ID must exactly match project config. 
Otherwise WeChat sees it as another app identity and preview/publish will fail. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image25.png) + +### 4.1.2 Fill AppID in Project + +Second step: write this AppID into project configuration so local build maps to your official mini program account. + +If your project uses uni-app template, do this: + +1. Open HBuilderX and load Snake project. +2. Find `manifest.json` in file tree and open it. +3. Scroll to "WeChat Mini Program Configuration," and you will see an input such as "WeChat Mini Program AppID." +4. Paste AppID copied from Official Platform exactly, then save file. + ![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image26.png) + +Now your local project has claimed this mini program identity. Next, when you upload from WeChat DevTools, it will be recorded under this AppID. + +### 4.1.3 Upload a Version in WeChat DevTools + +We have already run project into WeChat DevTools to preview simulator. + +Now we do: "package current code as a version and upload to server." + +Steps: + +1. In top-right toolbar of WeChat DevTools, click "Upload." +2. In popup, fill two key fields: + 1. Version number: for example `1.0.0` (digits and dots only). + 2. Project note: short description, such as "Completed core gameplay." +3. Confirm and click "Upload." Output panel shows build process. If all steps turn green and upload completes, this version is successfully submitted to WeChat server. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image27.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image28.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image29.png)![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image30.png) + +### 4.1.4 Set Uploaded Version as Experience Version in Backend + +Upload only sends code to WeChat side. 
You still need to tell system "this is an experience version." + +Final step: go back to Official Platform backend and complete loop. + +1. Open `https://mp.weixin.qq.com` and enter mini program backend. +2. In left menu, find "Management" -> "Version Management." +3. In "Development Version" section, you should see the uploaded version: version `1.0.0`, your note, and just-uploaded timestamp. +4. On the right side of this row, use dropdown/action button to choose "Set as Experience Version," confirm action. Before this step, ensure your main category is configured on homepage/category settings. + + ![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image31.png) + + ![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image32.png) + +After completion, this version becomes your mini program "Experience Version." You can generate experience QR code in backend, or add yourself/team as experience members, then scan in WeChat for real-device testing. + +At this point, we have finished the shortest loop from local project to test launch. + +You do not need to open to all WeChat users immediately. Within a small, safe audience, you can first run the real mini program in the real WeChat environment. That is enough for feature testing, feedback collection, and iteration. + +## 4.2 Formal Launch of Mini Program + +After experience version runs well, you can already play this Snake mini program in your own WeChat. +Next step is moving from limited experience users to a fully public WeChat mini program. + +Break this into steps: complete basic info, choose category, finish filing, then submit review. Follow this order: + +### 4.2.1 Enter Mini Program Release Flow + +First go back to WeChat Official Platform backend and log in. +In left navigation find entries related to "Version Management / Release" (UI may vary slightly over time). You will find "Mini Program Release Flow." + +After entering, top area shows a progress bar. Below it lists steps such as: + +1. 
Mini Program Information +2. Mini Program Category +3. Operation Information / Filing +4. WeChat Verification (depending on entity type) + +At beginning progress is 0%. As each step is completed, system updates automatically. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image33.png) + +### 4.2.2 Fill Basic Mini Program Information + +First step is completing your mini program "business card," which is what users first see in WeChat. + +On "Mini Program Information" page, you usually need to fill/confirm: + +1. Mini program name + This appears in search results and app header. It has length limits and naming rules. Choose a name that describes function and is easy to remember. +2. Description / intro + Use one or two sentences to explain what this mini program does, for example: "A Snake game developed with AI-assisted coding, suitable for quick casual play." + Keep description consistent with real functionality and avoid exaggerated marketing text. +3. Icon and screenshots + 1. Icon usually requires square image with PNG/JPG support and size/pixel limits (check page rules). Use simple, high-contrast icon. + 2. Upload several screenshots such as homepage, game page, settings page. They help users understand content. +4. Other required fields + Such as tags and service region, fill according to prompts. + Only one principle: all information must match real functionality of your Snake mini program. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image34.png) + +After all fields are done, click Save or Next. First step in release flow is complete. + +### 4.2.3 Select Mini Program Service Category + +After basic information, wizard guides you to "Mini Program Category." +Category is your app's classification in WeChat, affects review route and later display/operation. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image35.png) + +On this page you will see "Add Category." 
Click it and choose proper category in system category tree, for example: + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image36.png) + +1. Choose "Education" as top-level category; +2. Then choose more specific subcategory such as "Education Tools / Teaching Assistant." In this example, education tools are selected as learning aid for vibecoding. + +In your own project, simply choose the closest category by real use case. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image37.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image38.png) + +After confirming category, click Save. If page shows "category created successfully" and displays your new item, this step is complete. + +### 4.2.4 Complete Filing Information + +Next, release flow asks for "Operation Information / Filing." This verifies responsible entity behind mini program. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image39.png) + +Under individual entity example, flow usually includes: + +1. Select filing type + Choose among types such as "Individual" or "Enterprise," consistent with your registration entity. +2. Fill entity information + Include name, ID type, ID number, etc. This must match registration information, otherwise review may reject. +3. Upload supporting documents + Usually requires ID photos or other proof files, with specific format/size/clarity requirements shown on page. Prepare and upload clear files. + ![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image40.png) + +After submission, system enters "under review" and shows a message like "Information submitted, please wait." This may take some time. You can check progress anytime in backend. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image41.png) + +### 4.2.5 Submit for Review and Wait for Formal Release + +When "Mini Program Information," "Category," and "Operation Information/Filing" are all completed, do final action: submit for review. + +1. Return to release-flow overview page and confirm all items show completed, with progress close to 100%. +2. Click "Submit for Review" (or similar button) to submit current development version to WeChat review team. +3. In "Version Management," this version status becomes "Under Review." After approval it becomes "Published" or available for "Go Live." + +If filing review fails, developers may receive a call specifying failed parts. + +For filing, you may receive verification code and verification link from Ministry of Industry and Information Technology. Open link and fill code + personal info (verification valid for 1 day). If filing passes, you receive email and SMS notice with filing number. +WeChat verification: individual usually pays 30 CNY, enterprise around 300 CNY. Fee is non-refundable regardless of approval result. You may receive verification notice and confirmation call. + +When submitting review, upload operation video/screens and fill required info. Then click "Submit Release" for formal launch. + +![](../../../../zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/images/image42.png) + +# 5. Summary + +At this point, you have completed a full **0-to-1** mini program development loop: from understanding mini programs, to installing Trae, HBuilderX, and WeChat DevTools; from giving AI your idea and letting it "move bricks" in code, to playing first Snake version in simulator; then packaging as experience version, finishing filing/review, and making it truly usable in WeChat - you have personally run through the full chain once. + +More importantly, you did not achieve this by memorizing syntax. 
You achieved it by clearly expressing requirements + communicating effectively with AI. You have already experienced this: **one natural-language instruction can let AI satisfy your development needs very effectively**. This capability is not limited to Snake. It can transfer to any mini program you want to build later - tools, event pages, educational apps, or real work projects. + +If we summarize into a **general SOP**, it is only five steps: +**Clarify one small requirement -> build project skeleton with HBuilderX -> use vibecoding + AI to create first version -> repeatedly play-test and improve in WeChat DevTools -> upload, file, review, and launch.** +Each time you repeat these five steps, you gain another real mini program that can be opened and shared, and another layer of confidence that "I can use AI to turn ideas into products." + +Next, you can keep polishing this Snake app, or close it and start a blank project from your own idea. No matter what you build, remember one thing: you are no longer just someone who "wants to build something." You are already a vibecoding developer who has run the full workflow. The rest is repetition until this capability becomes habit. + +# References: + +- https://zhuanlan.zhihu.com/p/1889401120939567074 +- https://blog.csdn.net/2401_87407347/article/details/155193007 diff --git a/docs/en/stage-3/cross-platform/3.4-wechat-miniprogram-backend/index.md b/docs/en/stage-3/cross-platform/3.4-wechat-miniprogram-backend/index.md new file mode 100644 index 0000000..b0c8c18 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.4-wechat-miniprogram-backend/index.md @@ -0,0 +1,3 @@ +# Cross-Platform Development - How to Build a WeChat Mini Program (with Backend) + +> This chapter is currently being written. Stay tuned... 
diff --git a/docs/en/stage-3/cross-platform/3.5-android-app/index.md b/docs/en/stage-3/cross-platform/3.5-android-app/index.md new file mode 100644 index 0000000..2283b9a --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.5-android-app/index.md @@ -0,0 +1,679 @@ +# How to Build a Simple Android App - Native Compose Development + +# 1 What Android Apps and Android Development Are + +In this tutorial, we will complete a full closed loop: **from an idea in your mind to a real app that can be successfully installed and run on an Android phone.** + +For this tutorial, you should at least have: + +- A computer with decent performance (Windows or Mac) +- An Android phone (optional; if you do not have one, we will use an emulator) +- Android Studio installed (for building) +- Trae installed and registered (for AI coding) + +## 1.1 Definition of Android App + +An Android App is a native application that runs on the Android operating system. Unlike mini programs, it does not depend on a host like WeChat. It runs directly at the system level. It has its own home-screen icon, launches quickly, feels smooth, and can deeply access system-level features such as Bluetooth, sensors, and background services. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image1.png) + +## 1.2 Android App Development + +Android development refers to the whole process of building such applications. In the Vibe Coding development mode used in this tutorial, with **AI-assisted programming**, the developer's role shifts from "code writer" to "product architect": + +1. **You (architect / PM)**: responsible for business logic design, prompt writing, and final acceptance of the result. +2. **Trae (AI engineer)**: responsible for executing instructions, converting natural language into standard Kotlin code and Jetpack Compose layouts, and handling syntax errors and logic details. +3. 
**Android Studio (build factory)**: responsible for providing the compile environment, packaging code into a runnable app, and offering emulator previews. + +## 1.3 Common Ways to Build Android Apps + +In real development, there is more than one way to build Android apps. We will not go deep here, but only provide an overall understanding. + +**The first way: Native Development** +This is Google's official and recommended route. You directly use **Kotlin** and **Jetpack Compose** to develop. Its advantage is the best performance and full access to phone hardware. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image2.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image3.png) + +**The second way: Cross-Platform Development** +For example Flutter or React Native. The core idea is "write one codebase and generate both Android and iOS apps." + +**The third way: Hybrid Development** +In essence, this is wrapping a webpage inside an app shell. This is fast to develop, but the experience and smoothness are usually not as good as a native app, and it is difficult to build a polished, immersive small tool this way. + +**This tutorial's choice: native development (Kotlin + Compose)** combined with AI tools for coding. +The reason is simple: native Jetpack Compose code has a very clear structure and is highly suitable for AI to understand and generate. We do not need to handwrite code from scratch. Instead, we guide Trae with natural language to generate high-quality native code. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image4.png) + +## 1.4 Android App Development Steps Covered in This Tutorial + +To keep the learning process interesting, this tutorial revolves around a relaxing but technically representative case - **Electronic Wooden Fish**. We combine Trae's Vibe Coding mode with a route you can reuse repeatedly: + +1. 
**Build understanding and environment**: understand what Android apps are, install Android Studio and Trae, and configure China-friendly mirrors so the toolchain works smoothly. +2. **Build the project skeleton**: create a blank Android project that can successfully run in the emulator. +3. **AI iterative development**: open the project in Trae, then through conversation with AI, gradually implement the wooden fish image, tap animation, sound effects, floating text, and more. +4. **Real-device debugging and polishing**: move beyond the emulator, install the app on your actual phone, experience real vibration feedback, and let AI help investigate bugs. +5. **Packaging and publishing**: generate a formal APK and understand how to share or release it. + +This section only draws the big picture and does not expand all commands yet. For now, just remember the main line: **environment setup -> skeleton building -> AI description and generation -> real-device polishing -> packaging and delivery**. In the next chapters, we will take you through each step. + +# 2 Development Environment Setup + +## 2.1 Tools Used in This Tutorial + +During the whole development process, we use three tools together, playing the roles of "design," "construction," and "acceptance." + +- **Trae**: this is your **AI coding partner**. In Vibe Coding mode, we no longer need to type code line by line. Instead, we mainly tell AI in natural language what we want, and it handles code generation and modification. +- **Android Studio**: this is Google's official **app build factory**. Although it has many buttons, in this tutorial we mainly use it to create the project skeleton and compile Trae-generated code into something installable on a phone. +- **An Android device**: this acts as the **test terminal** for viewing the result. You can connect it to your computer for real-device debugging and feel real vibration feedback. 
If you do not have one, Android Studio's built-in **Emulator** can simulate a virtual phone perfectly, which is enough for early development. + +## 2.2 Download Trae + +Trae is our main battlefield for **Vibe Coding**. You can think of it simply as an **"AI-powered code editor."** + +Visit the official website [https://www.trae.cn](https://www.trae.cn), download the version matching your computer system (Windows or Mac), and install it just like ordinary software by double-clicking the installer and following the prompts. Once this tool is ready, in later practice we will stop staring at boring code windows and instead open the project here and tell AI what to build using natural language. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image5.png) + +## 2.3 Download Android Studio + +We need Android Studio to provide the Android SDK and emulator required for running the app. Visit the official download page [https://developer.android.com/studio?hl=zh-cn](https://developer.android.com/studio?hl=zh-cn) and download the package for your operating system (this tutorial is based on **2025.2.3**). After downloading, install it like normal software, keeping the default options throughout. + +**Special reminder for beginners:** + +Although modern versions of Android Studio have greatly simplified configuration, it still depends on the **JDK (Java Development Kit)** under the hood. If this is your first time doing development, or if you encounter errors related to environment variables or SDK configuration during installation, do not panic. 
You can refer to this detailed setup guide: [Android Studio2024版本安装环境SDK、Gradle配置](https://blog.csdn.net/keiraee/article/details/142321644?ops_request_misc=elastic_search_misc&request_id=a2b858d1f665095c53afa9114ad8864d&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~top_positive~default-2-142321644-null-null.142^v102^pc_search_result_base4&utm_term=android%20studio%E5%AE%89%E8%A3%85%E5%8F%8A%E9%85%8D%E7%BD%AE&spm=1018.2226.3001.4187) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image6.png) + +## 2.4 Create a New Project + +Open Android Studio and click **New Project** on the welcome screen. + +**Step 1: Choose a template** + +In the template list, select **Empty Activity** (notice the Jetpack Compose icon on it). + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image7.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image8.png) + +**Step 2: Fill in project configuration** + +Then you will see a configuration form. Fill it roughly as follows and keep the rest at default: + +| **Field** | **Recommended Value** | **Explanation** | +| ----------------- | -------------------------------------------------- | ---------------------------------------- | +| **Name** | My Application 1 | App name shown on the phone home screen | +| **Package name** | com.example.myapplication1 | Unique app identifier | +| **Save location** | Custom path (for example `E:\AndroidProjects\Myapplication1`) | Project storage location; not recommended to place on C drive | +| **Minimum SDK** | API 30 | Covers over 90% of active devices while balancing compatibility and features | +| **Language** | Kotlin (recommended) | Kotlin is Google's officially recommended language, cleaner and safer | + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image9.png) + +**Step 3: Wait for project build** + +Click **Finish**. 
Android Studio will automatically download dependencies and build the project (you will see a progress bar in the bottom-right corner). + +- _Note: the first project creation may take several minutes. Wait patiently until the bottom progress finishes and the project file tree is fully loaded on the left._ + +## 2.5 Dependency Configuration: Gradle Download and Gradle Repository Mirrors + +> This is one of the few steps in the Vibe Coding workflow where **manual operation** is recommended. Although AI can also help modify config, environment configuration touches low-level files, so manual changes are more reliable. + +Why do we need to modify the configuration? + +By default, Android Studio connects to overseas servers, so downloading build tools and dependencies may take an hour or even fail. After switching to domestic mirrors, it often finishes within a few minutes. **This is a one-time task that pays off forever.** + +1. **Preparation** + +If the bottom-right status bar of Android Studio is currently showing a progress bar like `Gradle Building...`, pause the ongoing dependency download first to avoid file conflicts. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image10.png) + +2. **Speed up Gradle download** + +In the project file tree on the left, expand `gradle` -> `wrapper`, then open `gradle-wrapper.properties`. Change the download source to Tencent's mirror: + +```text +distributionUrl=https\://mirrors.cloud.tencent.com/gradle/gradle-8.7-bin.zip +``` + +Be careful: you only need to replace `services.gradle.org/distributions` with `mirrors.cloud.tencent.com/gradle`. Do not change anything else. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image11.png) + +3. 
**Speed up dependency repository download** + +Then, open `settings.gradle.kts` in the project root, and replace the content inside the `repositories` block with the following: + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image12.png) + +Replace the highlighted section with this code (latest source list as of 2025-02-21): + +```kotlin + // Aliyun mirrors (covering Maven Central, Google, JCenter, etc.) + maven { setUrl("https://maven.aliyun.com/repository/public/") } + maven { setUrl("https://maven.aliyun.com/repository/google/") } + maven { setUrl("https://maven.aliyun.com/repository/jcenter/") } + maven { setUrl("https://maven.aliyun.com/repository/gradle-plugin/") } + // Huawei Cloud mirror + maven { setUrl("https://repo.huaweicloud.com/repository/maven/") } + // Tencent Cloud mirror + maven { setUrl("https://mirrors.cloud.tencent.com/nexus/repository/maven-public/") } + // NetEase mirror + maven { setUrl("https://mirrors.163.com/maven/repository/maven-public/") } +``` + +It should then look like the screenshot below: + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image13.png) + +4. **Save and apply changes** + +At this point, save the file and click `Try Again` in the top-right corner. Android Studio will re-run the download. Wait a few minutes. When the console shows `BUILD SUCCESSFUL`, it means the environment setup is fully complete and we are ready to start coding. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image14.png) + +## 2.6 Understand the Project Structure + +After project creation succeeds, the **Project** panel will appear on the left. 
Switch to the **Android** view (default), and you will see key directories like this: + +```text +app/ +├── manifests/ +│ └── AndroidManifest.xml <- app "ID card", declares app name and entry Activity (MainActivity) +│ +├── java/ +│ └── com.example.myapplication1/ +│ ├── MainActivity.kt <- app entry, builds UI with Jetpack Compose +│ │ +│ └── ui/ <- controls the overall UI style (colors, fonts) +├── res/ +│ ├── drawable/ <- image resources (for example ic_launcher.png) +│ ├── mipmap/ <- app icon +│ ├── values/ <- text, color, theme styles +│ │ ├── colors.xml +│ │ ├── strings.xml +│ │ └── themes.xml +│ └── xml/ <- system-related config files (not UI) +└── build.gradle (Module: app) <- app build config (usually untouched at beginner stage) +``` + +As beginners, we usually only need to focus on three files: + +- `MainActivity.kt`: controls behavior and decides "what appears on the screen" +- `AndroidManifest.xml`: registers components and decides "where the app starts" +- `Theme.kt`: defines the visual appearance + +# 3 Android App Development + +In the first two chapters, we already understood what Android apps are and sharpened the two key tools: Trae and Android Studio. From this section on, we leave paper discussion and enter real practice. We will adopt Vibe Coding mode to build a very popular stress-relief app from scratch - **Electronic Wooden Fish**. It fits the "Vibe" theme well (simple and relaxing), while also covering three core parts of Android development: **UI interaction (tapping), data storage (merit count), and multimedia (sound effects)**. + +Now, follow along and send the first instruction to AI. + +## 3.1 The First "Master Prompt": From Zero to One + +In Vibe Coding mode, we do not need to first create layout files and then write logic code as in traditional development. What we need to do is **describe the requirements clearly in one shot and let AI generate the first runnable prototype**. 
+ +Open the project directory we just created in Trae, and in the chat panel on the right, enter the following Prompt: + +```text +You are a senior Android development expert. Please rewrite the current MainActivity.kt and turn it into an "Electronic Wooden Fish" app. Requirements: +1. The screen background is black. +2. Display a wooden fish graphic in the center of the screen, moderate in size, in white. +3. Show a line of white text above it: "Merit: 0". +4. When the wooden fish in the center is tapped, the number increases by 1 and a simple scale animation effect appears (simulating the feeling of knocking). +5. Use Jetpack Compose. +``` + +After sending it, Trae will begin analyzing your project structure. A few seconds later, it will directly generate the full code for `MainActivity.kt`. + +1. From its response, we can see its reasoning logic and interaction logic +2. We can directly see which parts of the code were rewritten +3. If we are not satisfied with the result, we can roll back to the previous version + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image15.png) + +## 3.2 Run and Preview (Emulator Debugging) + +At this point, AI has completed the first round of development. But remember, what we see in Trae is only code "blueprints," not a real interactive app. Trae itself cannot directly run Android apps, so we need to rely on the **Virtual Device emulator** provided by Android Studio. It is like turning your computer screen into a virtual Android phone, allowing us to install the code immediately and view the real result. + +Next, let us configure this "virtual phone." + +**Step 1: Create the emulator** + +Back in Android Studio, find and click **Device Manager** in the right toolbar. If you cannot find it, open it from `View -> Tool Windows -> Device Manager`. + +In the panel, click **Add a new device**, then choose **Create Virtual Device** to enter the device selection window. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image16.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image17.png) + +In the hardware selection window, choose **Phone** and then **Smart Phone** (medium screen size), or any other device profile you prefer such as Pixel, then click **Next**. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image18.png) + +**Step 2: Configure the system image** + +In the **System Image** dialog, select **API 36.1**. If it has not been downloaded yet, click **Download** first, then select it after download is complete, and click **Finish**. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image19.png) + +**Step 3: Start the emulator** + +After successful creation, your new phone will appear in the device manager list. Click the **triangle play button** on the right. After a short wait, a phone-shaped window will pop up - this is your Android emulator. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image20.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image21.png) + +**Step 4: Run the app** + +Now comes the magic moment. Make sure the emulator has started and is showing the desktop, then click the prominent **green Run triangle** in the top toolbar of Android Studio (or use shortcut `Shift + F10`). Android Studio will automatically compile the code written by Trae, package it as an app, and install it into the emulator. + +Within seconds, you should see the emulator screen light up, showing a white wooden fish graphic in the center with the text "Merit: 0" above it. Try tapping it and see whether the number increases and the animation works. This is your first Android app. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image22.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image23.png) + +## 3.3 Optimization Iteration (Add Assets and Sound) + +At this stage, our app already has a basic shape: tapping increases the number. But it is still just a "mute" white geometric shape, lacking fun. Next, we will make the Electronic Wooden Fish much more immersive by adding a real image and knock sound effect. + +**This is exactly the most attractive part of Vibe Coding mode.** In traditional development, adding sound effects and more complex animations is often a beginner's nightmare. You need to manage `MediaPlayer` resource loading and releasing (otherwise memory leaks may happen), and also calculate animation curves. In Vibe Coding mode, you do not need to care about these low-level details at all. You only need to tell AI like a director: "change the prop and add a sound effect when tapped," and the implementation appears immediately. + +**Step 1: Prepare assets** +You need one wooden fish image (`png`) and one knock sound effect (`mp3`). + +- **Image asset**: copy the prepared `white_muyu.png` into `app/src/main/res/drawable` +- **Audio asset**: in Android Studio, right-click the `res` folder in the left project panel, choose `New -> Android Resource Directory`, select **raw** as the resource type, click OK, then copy `voice.mp3` into the new `res/raw` folder. _(Note: if you plan commercial release, make sure you have legal rights to all assets.)_ + +Here are the image and sound assets I found for you. If it is inconvenient for you to search for your own, you can directly use them. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image24.png) + +Knock sound download link: https://www.aigei.com/s?q=%E6%9C%A8%E9%B1%BC&type=sound +Choose the first 1-second sound effect. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image25.png) + +**Step 2: Send the iteration instruction** + +After the assets are ready, go back to Trae. Trae will modify the code again and handle the audio-loading and animation logic for you. You only need to tell it which assets to use. Enter this Prompt: + +```text +I have added the assets. The image path is res/drawable/white_muyu.png and the sound effect path is res/raw/voice.mp3. Please update the code: +1. Replace the wooden fish icon in the center with my image. +2. Play the knocking sound every time the wooden fish is tapped. +3. When tapped, show a temporary "+1" text above the wooden fish, then let it float upward and disappear (like floating score text in games). +``` + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image26.png) + +**Step 3: Verify the result** + +After Trae finishes modifying the code, return to Android Studio and click the green Run button again (Re-run) to restart the emulator. At this point, your app will feel transformed. Try tapping continuously - you should hear a crisp "tok tok" sound and see the floating "Merit +1" text jumping out. This completes the key transition from "demo" to "product." + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image27.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image28.png) + +## 3.4 What If Bugs Appear? (Debugging Loop with AI) + +AI-generated code is not guaranteed to be perfect on the first try, just like top engineers also cannot promise bug-free code in one shot. But in Vibe Coding mode, bugs are no longer a wall blocking you; they become stepping stones in your collaboration with AI. + +**Case 1: the app crashes** + +Suppose the app crashes immediately after clicking Run, or tapping the wooden fish does not play sound. 
Traditionally, you would need to search for the error code, browse dozens of technical forums, and read lots of difficult English posts. In Vibe Coding mode, you only need to do one thing - **be a courier**. + +**Steps:** + +1. **Open the log**: find the **Logcat** panel at the bottom of Android Studio (the small cat icon). +2. **Locate the error**: you will see scrolling logs, and the **red lines** are usually the key errors. +3. **Copy and paste**: select the red English error text, copy it, and paste it into Trae: "I got this error while running. Please help me fix it." +4. AI may immediately tell you something like: "This happened because vibration permission was not declared in `AndroidManifest.xml`," and then give you the fixed code. You just click Apply and move on. + +**Case 2: the app runs, but the experience feels bad** + +Sometimes the app does not crash, but still feels unsatisfying. For example, when tapping the wooden fish very quickly, you may notice that new "+1" animations do not show up until the previous "+1" fully disappears. That makes the feedback feel laggy and not satisfying. You do not need to study multi-threading or animation queues yourself. You only need to clearly describe that discomfort to AI. + +Send this "advanced instruction" to Trae: + +```text +Please modify the current animation logic to solve the "fast tapping does not trigger" problem. +Current issue: it seems there is only one animation state, so I have to wait until the previous "+1" completely disappears before another click responds. +Requirements: +1. Replace the single animation state with a mutableStateListOf-based list. +2. Every time the wooden fish is tapped, add a new "+1" instance immediately to the list (with its own ID and initial position), regardless of whether the previous animation has finished. +3. In the UI, iterate through this list so each "+1" runs its own upward-floating + fade-out animation independently. +4. 
After a "+1" animation finishes, automatically remove it from the list to prevent memory leaks. +Please directly provide the updated MainActivity.kt code. +``` + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image29.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image30.png) + +## 3.5 Final Result Showcase + +In the previous steps, we already completed an Electronic Wooden Fish that can be seen and heard. To make it closer to a publishable app, we will use one final iteration to add **touch feedback** and **customization**. We will implement two core features: first, **vibration feedback**, so every tap gets a physical response from the phone motor and greatly improves immersion; second, **custom text**, allowing users to modify the text on screen, for example changing "Merit +1" to "Salary +1" or "Trouble -1". + +Send the following carefully designed Prompt to Trae. It will handle the dialog logic, state switching, and hardware interaction in one pass: + +```text +Role: You are an Android Jetpack Compose expert. +Task: Please add "custom text" and "vibration feedback" to the existing Electronic Wooden Fish app. +Requirements: +1. Haptic Feedback +Whenever the user taps the wooden fish, in addition to sound and animation, call the phone's haptic feedback (using LocalHapticFeedback.current) to give a light tactile response. +2. Custom Text Feature (UI and interaction) +Entry: Add a small edit icon next to the top text such as "Merit +1" (you can use Icons.Default.Edit). +Dialog logic: When the icon is tapped, show a dialog (Dialog/AlertDialog). 
+ Dialog title: "Modify Content" + Input: Allow the user to enter the text they want to accumulate (default is "Merit") + Value choice: Below the input, provide two options (for example RadioButton or toggle) so the user can choose "+1" or "-1" + Save button: After clicking save, close the dialog and apply the new settings to the home screen + Data refresh: If the user updates the content, reset the top counter to 0 and start counting from zero again +3. Effect update +After saving, both the top counter text and the floating animation text shown when tapping the wooden fish should change to the user's custom format. + The floating text size should not exceed the size of the top counter text + Example: if the user enters "Salary" and chooses "+1", the top counter logic becomes +1 and the floating text becomes "Salary+1" + If the user enters "Trouble" and chooses "-1", the top counter logic becomes -1 and the floating text becomes "Trouble-1" +4. Technical requirements: +Make sure the new state (text and number) correctly affects the animation. +Please directly provide the full updated MainActivity.kt while keeping the previous sound and animation logic unchanged. +``` + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image31.png) + +# 4 Real-device Debugging and Polishing + +The emulator is convenient, but it cannot simulate real phone vibration or fully reflect real touch latency. To get the most accurate "feel," we need to install the app on a real Android phone. Below are two connection methods you can choose from: + +1. **Wireless debugging (Wi-Fi)**: no data cable required, convenient for daily checking. But your computer and phone must be on the **same Wi-Fi network**. +2. **USB wired debugging**: more stable and less likely to disconnect, suitable when the network is poor or initial installation fails. + +## 4.1 Wireless Debugging + +This is the most convenient method on Android 11 and above. + +**Step 1: Prepare the phone** + +1. 
Make sure the phone and computer are on the **same Wi-Fi**. +2. Open **Developer options** and enable **Wireless debugging**. +3. Tap **Wireless debugging** to enter details, then choose **Pair device with QR code**. Your phone will open a scanner view. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image32.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image33.png) + +**Step 2: Pair on the computer** + +1. Back in Android Studio, click the device selector in the top toolbar. +2. Choose **Pair Devices Using Wi-Fi** from the dropdown. +3. A QR code will pop up on screen. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image34.png) + +**Step 3: Scan to connect** + +1. Use your phone to scan the QR code on your computer screen. +2. Both the phone and computer should show "pairing successful." +3. At this point, Android Studio's top device bar will automatically display your phone model (for example `Google Pixel 8`). + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image35.png) + +4. Run the app by clicking ▶️ Run + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image36.png) + +## 4.2 USB Wired Debugging + +If wireless connection is unstable, or your network is complicated, plugging in with a cable is always the most reliable solution. Although it is less convenient, it gives the fastest transfer speed and almost never disconnects. + +### 4.2.1 Prepare USB Driver in Android Studio (Windows only) + +Mac users can skip this step, because macOS usually recognizes the phone directly. Windows users need to make sure the computer can recognize the Android phone, which usually means installing Google's USB driver: + +1. In Android Studio, click `Tools -> SDK Manager` (or find it under `Settings -> Languages & Frameworks -> Android SDK`) +2. Switch to the **SDK Tools** tab +3. 
Check **Google USB Driver** and click **Apply** to download and install it + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image37.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image38.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image39.png) + +### 4.2.2 Download the Same SDK Version as Your Real Device + +**Step 1: Check the phone's Android version** + +Using an OPPO phone as an example: open Settings -> About phone -> check Android version (in the example it is Android 12). + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image40.png) + +**Step 2: Download that Android platform version in Android Studio** + +1. In Android Studio, click `Tools -> SDK Manager` +2. Stay in the default **SDK Platforms** tab +3. Select Android 12.0 and click Apply to download + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image41.png) + +### 4.2.3 Enable Developer Mode on the Phone + +Open your phone settings, go into developer options, and turn on **USB debugging**. If Developer options is not visible, you usually need to unlock it first by tapping the build number (under About phone) seven times; the exact menu path varies by manufacturer. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image42.png) + +### 4.2.4 Authorize USB Debugging on the Phone + +Connect the phone to the computer with a USB data cable, then pick up your phone. It should show an important security dialog: "Allow USB debugging?" Make sure to check **Always allow** and then tap **Allow** or **OK**. This is the key authorization that gives the computer control for debugging. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image43.png) + +### 4.2.5 Run the App on the Phone + +1. In Android Studio's top device selector, you should now see your phone model (for example `OPPO-PDKM00`) +2. Click ▶️ Run. If your phone shows the "Allow USB debugging?" dialog again, check "Always allow" and confirm +3. The app will automatically install and launch + +Now try tapping the wooden fish on your phone and feel the real vibration motor response. 
This is the full Vibe Coding experience. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image44.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image45.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image46.png) + +# 5 Package the App as APK + +The code is done, and the real-device test also works. Now we need to "take the app out" of Android Studio and turn it into a file you can send to friends for installation. This process is called **packaging**. In Android development, packaging has two completely different modes, and we choose based on the usage scenario. + +## 5.1 Package the Debug Version (for Quick Sharing) + +If you only want to share the app with friends for a quick try, or send it to test phones for verification, the **Debug version** is the fastest option. It is like a "draft" - fully functional, but signed only with an auto-generated debug key rather than your own release key, so it cannot be submitted to app stores. + +**The steps are very simple:** in the top menu of Android Studio, find `Build`, hover over `Generate App Bundles or APKs`, and click `Generate APKs` from the submenu. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image47.png) + +Wait a few seconds (the exact time depends on project size). In the bottom-right console area of Android Studio, a prompt will appear. Click the blue `locate` link and the output folder will open automatically. The file named `app-debug.apk` is the package we want. + +You can directly send it through WeChat or QQ to any Android phone, and the recipient can install and use it. Note that a debug build is meant for testing only; it is not a release version. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image48.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image49.png) + +## 5.2 Package the Release Version + +If you want to publish the app to an app store (such as Google Play or Huawei AppGallery), or avoid the "unsafe app" warning during installation, then you must package a **Release version**. This version requires a unique **digital signature**, which is like an anti-counterfeit seal proving that you developed this app and that it has not been tampered with. + +> Core purpose of signing +> +> - Determine the publisher's identity: because an app with the same package name can replace an installed program, signing prevents that from being abused +> - Ensure app integrity: the signing process covers every file in the package, ensuring they are not replaced afterward + +Android app signing is like attaching a seal. After the seal is attached, the app and the developer are locked together: the app is yours, and you are responsible for it. Others cannot impersonate you, and you cannot impersonate others. + +**Step 1: Start the signing wizard** + +In the top menu, select `Build`, then click `Generate Signed Bundle / APK`. In the popup window, you will face two choices: + +- Android App Bundle (`.aab`): required by Google Play, smaller in size, but cannot be directly installed on a phone +- APK: standard install package, can be installed directly +_For demonstration, we choose APK first and click Next._ + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image50.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image51.png) + +**Step 2: Create a digital key (KeyStore)** + +This is where beginners get stuck most often. Because this is your first release packaging, you need to create a new **keystore**. Click **Create new** below `Key store path`. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image52.png) + +In the popup, fill in the required information, similar to registering an account. We strongly recommend that the keystore password and key alias password be **the same**, and that you **write them down carefully**. If you lose this password, your app can never be updated again in the future. + +After finishing, click OK. You will return to the previous screen, and the key information you just filled in will already be populated automatically. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image53.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image54.png) + +**Step 3: Generate the formal package** + +Click Next, choose **release** under Build Variants, and finally click **Create**. + +After a short wait, Android Studio will again show a "Generate Signed APK" success prompt in the bottom-right corner. Click **locate**, and this time you will see the digitally signed formal package in the folder (usually named `app-release.apk`). This file is the final product you deliver as a developer. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image55.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image56.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image57.png) + +# 6 Official Release to App Stores / Markets + +When your app development is finished and the Release package is ready, the next step is to publish it so more people can download and use it. Right now, the main distribution channels are divided into two categories: **domestic Android app stores** and **overseas app stores (Google Play)**. + +## 6.1 Publish to Domestic Markets + +The Android ecosystem in mainland China is special. There is no single official store (because Google Play is not directly accessible). 
Instead, the market is split between **phone-maker app stores** and **third-party platforms**. The major **manufacturer stores** include Huawei, Xiaomi, OPPO, vivo, Meizu, Samsung, etc. Since they are preinstalled on devices, they have the largest traffic. The main **third-party platforms** include Tencent MyApp and 360 Mobile Assistant. + +### 6.1.1 The Core Difficulty: The "Roadblock" for Individual Developers + +Before registering an account, there is one very important thing you must know: **domestic app markets are very strict with individual developers**. + +At present, almost all major domestic app stores (Huawei, Xiaomi, OV, MyApp, etc.) **require** a *Software Copyright Registration Certificate* for submission. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image58.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image59.png) + +- **What is it?** It is a legal document proving that the app belongs to you. +- **Cost to obtain it**: you need to apply through the copyright bureau. Doing it yourself usually takes 2-3 months; using an agency for faster processing may cost from several hundred to over a thousand RMB. +- **Current reality**: without this certificate, your app will very likely fail review, or you may not even be able to create the app entry. In addition, categories such as news, finance, and healthcare may also require ICP filing or other qualifications. + +So if your app is just a personal practice project or small tool, and you do not want to spend time and money applying for this certificate, I suggest jumping directly to Section 6.2 and considering Google Play instead, or simply sharing the APK file with friends directly. + +### 6.1.2 Register a Developer Account + +If you have already prepared the required qualifications, or have decided to publish in domestic markets, the first step is account registration. 
The process is similar across major platforms, usually requiring ID verification for individuals or business license verification for companies. + +Below are the developer platform URLs for major app markets: + +Tencent Open Platform: https://open.tencent.com/ + +360 Open Platform: http://dev.360.cn + +Baidu Developer Platform: http://app.baidu.com + +Xiaomi Open Platform: https://dev.mi.com + +Huawei Developer Alliance: http://developer.huawei.com/consumer/cn + +Alibaba Developer Platform: http://open.uc.cn +Alibaba distribution integrates Wandoujia, Ali Jiuyou, PP Assistant, UC App Store, Shenma Search, and YunOS App Store. You only need to register one Alibaba developer account. + +Samsung Developer Platform: http://support-cn.samsung.com/App/DeveloperChina/Home/Index + +OPPO Developer Alliance: http://open.oppomobile.com + +vivo Developer Alliance: https://dev.vivo.com.cn + +Lenovo Open Platform: http://open.lenovo.com + +Meizu Developer Alliance: http://open.flyme.cn + +Gionee Developer Alliance: https://open.appgionee.com + +**Using Tencent MyApp as an example:** visit the Tencent Open Platform and click register. It is recommended to log in directly with a QQ account. Note that once a QQ account is bound, it is difficult to unbind, so it is better to use a dedicated work QQ account. Follow the prompts, choose "Individual Developer" or "Enterprise Developer," upload your ID photos, and complete face verification. After passing verification, click **Create App** to start. + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image60.png)![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image61.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image62.png) + +### 6.1.3 Submission Flow and Required Materials + +After account review is approved, you can create the app and submit it for review. You need to prepare the following "four-piece set": + +1. 
**Installation package**: the **Release APK** packaged in Chapter 5 +2. **Text information**: +   - **App name**: must not contain sensitive words +   - **One-line intro**: within 20 Chinese characters, simple and direct (for example: "A relaxing electronic wooden fish app") +   - **Detailed description**: 200+ Chinese characters introducing the app's functions and usage scenarios +3. **Visual materials**: +   - **App icon**: high-definition PNG, usually 512x512 +   - **App screenshots**: prepare 4-5 clear screenshots of the app in use, preferably covering the main pages, usually in consistent size such as 1080x1920 +4. **Qualification document**: upload a scanned copy of your Software Copyright Registration Certificate + +**Submission and review:** after filling in all information and uploading the APK, click **Submit for Review**. The review cycle is usually 1-3 business days. During that period, pay attention to email or SMS. Reviewers may reject the submission because screenshots are unclear, descriptions are not standardized, or required qualifications are missing. In that case, you revise according to the feedback and resubmit. + +## 6.2 Publish to Overseas Market (Google Play) + +If you do not want to deal with the complexity of software copyright certificates and filings in domestic app stores, or if your target audience is global, Google Play is the best choice for individual developers. + +### 6.2.1 Preparation + +- **Google account**: a normal Gmail account is enough +- **$25 registration fee**: this is a **one-time lifetime fee**, and requires a credit card that supports USD payments (Visa / Mastercard) +- **Reliable network access**: you need to be able to access Google Play Console smoothly +- **Formal installation package**: note that Google Play requires the **.aab** (Android App Bundle) format, not APK. In Android Studio, choose Android App Bundle during packaging. The steps are almost identical to packaging APK. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image63.png) + +### 6.2.2 Google Play Console Release Process (Overview) + +Because Google Play registration and payment still have some entry barriers (such as the need for an overseas credit card), this tutorial does not currently provide step-by-step screenshots. But here is the common four-step process: + +**Step 1: Create an app and enter the console** + +Click `Create app`, fill in the app name (`Electronic Wooden Fish`), choose English as the language, choose App and Free as the app type, then check the agreement. After that, you will have access to the backend. + +**Step 2: Decorate the store page** + +This is the user's first impression. You need to upload the prepared app **icon** (512x512) and a **feature graphic** (1024x500). As for the English description, you can simply ask Trae: **"Please help me write an English description for publishing Electronic Wooden Fish on Google Play, in a light and relaxing tone."** AI usually writes it more naturally than a direct translation. + +**Step 3: Privacy and content rating** + +- Privacy policy: search for "App Privacy Policy Generator" and generate a free link to paste in +- Content rating: fill out a simple questionnaire (for example, whether there is violence or gambling). Electronic Wooden Fish usually gets a general 3+ rating. + +**Step 4: Upload and publish** + +Under the `Production` menu, click `Create new release`, upload your `.aab` file, save, and submit for review. Google Play review is usually fast (1-3 days). Once approved, your app can be downloaded worldwide. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image64.png) + +_If you have already completed developer account registration, this video tutorial can guide you through the rest of the process:_ [Android应用上传GooglePlay谷歌市场全流程教程](https://www.bilibili.com/video/BV16REQzGEnk/?share_source=weixin&vd_source=b42f227a4f2d413fbde18499d83227cf) + +# 7 Final Notes + +That brings us to the end of the tutorial. Looking at the Electronic Wooden Fish you personally created on your phone, I wonder how you feel now. + +As someone trained in software engineering, I actually feel quite emotional in today's fast-developing AI era. In the past, we worked through thick programming books, learned complex syntax, struggled with environment setup, and spent half of our day fighting red error messages. But times have changed, and now we are increasingly learning how to direct AI. + +Through this Vibe Coding practice, you have already experienced the full Android app development process. The technical barrier is indeed getting lower. We no longer need to grind through dry code all the time, and can spend more energy on deciding **what to build**. But no matter how strong the tools are, they are still just tools. Do not let this app gather dust on your phone. Keep tinkering with it, break it and fix it again. Only when you start having your own ideas and bringing them to life do you truly cross the threshold. + +If this tutorial helped you feel that "building an app is not actually that hard," then I am honored to have helped bring one more new-generation builder into the development world. + +I am really looking forward to your next creation. Keep going! 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.5-android-app/images/image65.png) + +**_Hope you have fun in the world of Android development!_** + +# References + +CSDN: [(2024.03.04)如何打包Android Studio项目?](https://blog.csdn.net/GenuineMonster/article/details/136443130?ops_request_misc=&request_id=&biz_id=102&utm_term=android%20studio%20%E6%89%93%E5%8C%85%20APK%20%E5%B9%B6%E5%88%86%E4%BA%AB&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduweb~default-1-136443130.142^v102^pc_search_result_base4&spm=1018.2226.3001.4187) + +CSDN: [Android Studio安装及配置](https://blog.csdn.net/Changersh/article/details/149838228?ops_request_misc=&request_id=&biz_id=102&utm_term=android%20studio%E5%AE%89%E8%A3%85%E5%8F%8A%E9%85%8D%E7%BD%AE&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduweb~default-0-149838228.142^v102^pc_search_result_base4&spm=1018.2226.3001.4187) diff --git a/docs/en/stage-3/cross-platform/3.6-ios-app/index.md b/docs/en/stage-3/cross-platform/3.6-ios-app/index.md new file mode 100644 index 0000000..c6ad589 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.6-ios-app/index.md @@ -0,0 +1,734 @@ +# How to Build an iOS App - Native SwiftUI Development + +## Chapter 1: What an iOS App and iOS App Development Are + +In this tutorial, we will complete a full closed loop: **from an idea in your mind to a real iOS app that can be successfully installed and run on an iPhone.** + +For this tutorial, you should at least have: + +1. A Mac running a relatively recent macOS +2. An iPhone running a relatively recent iOS version, with developer mode enabled +3. Xcode successfully installed +4. Trae installed and opened +5. A usable Apple ID + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image1.png) + +### 1.1 iOS App + +An iOS App is a native application running on the iPhone operating system. It launches quickly, feels smooth, and can deeply use system features such as notifications, camera, and local storage. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image2.png) + +### 1.2 iOS App Development + +At its core, building an iOS App only involves a few things: + +1. Clarify the problem your app is solving +2. Design the interface users can see and operate +3. Define how the app behaves under different actions +4. Build the app correctly and install it on an iPhone + +### 1.3 Common Ways to Build iOS Apps + +In real development, there is more than one way to build an iOS App. We will not go deep here, but only provide an overall understanding. + +The first way is Apple's official native approach: create a project in Xcode and use Swift and SwiftUI to build the interface and logic. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image3.png) + +The second way is to use cross-platform frameworks, such as React Native and Flutter, and adapt one codebase to multiple platforms. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image4.png) + +Based on the approaches above, this tutorial chooses: **native SwiftUI development as the foundation, with AI tools doing the majority of the coding work**. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image5.png) + +### 1.4 iOS App Development Steps Covered in This Tutorial (High-Level Preview) + +The sample app used in this tutorial is **FridgeChef**. + +The user enters the ingredients currently available in the fridge, and the app uses a real AI API to generate a feasible recipe, then saves the result locally for later review. This example fully covers the core parts of a real iOS application, including UI input and display, network requests, data parsing, local storage, and final installation and running on a real device. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image6.png) + +- The overall idea from prototype to native app + +In implementation, this tutorial adopts a staged approach. 
We will first use AI to quickly generate an interface prototype with HTML and CSS, confirm the layout structure and information hierarchy in the browser, and then migrate it into SwiftUI. + +- Overall development flow preview + +Overall, the following chapters will go through these stages in order: + +1. Build basic understanding + Understand the shape of an iOS app, common development methods, and what problem this sample app solves. +2. Complete environment setup + Prepare a Mac and an iPhone, update the systems, install Xcode and Trae, and create a basic iOS project that can run successfully in the simulator. +3. Enter formal development + Open the project in Trae and gradually generate the UI and basic interaction through conversation with AI, turning the app from an empty shell into something usable. +4. Debug and organize + When compilation errors appear or behavior does not match expectations, let AI help troubleshoot; when the structure becomes messy, use AI to refactor and simplify it. +5. Run on a real device + Configure signing, install the app on a real iPhone, and complete one full verification from code to hardware. + +## Chapter 2: Development Environment Preparation + +### 2.1 Required Devices and Systems + +In this practice, two pieces of hardware are irreplaceable: a Mac and an iPhone. +At the same time, both devices should be running **a relatively recent official system version**. + +#### 2.1.1 Mac + +iOS apps can only be developed and compiled on macOS. This is a hard requirement of Apple's platform. + +To ensure Xcode can be installed and used normally, it is recommended that you update macOS to a relatively recent official version first. You can check and update from **System Settings -> General -> Software Update**. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image7.png) + +#### 2.1.2 Real iPhone Device + +In addition to the Mac, this tutorial also requires a real iPhone for verifying whether the app can be installed and launched correctly. + +To keep the debugging process smooth, the iPhone should also run a relatively recent iOS version. You can check and update from **Settings -> General -> Software Update**. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image8.png) + +Later in development, this iPhone will be connected to the Mac by cable for real-device debugging. + +#### 2.1.3 Enable Developer Mode on iPhone + +To install and run debug apps from Xcode on a real device, you need to enable developer mode on the iPhone. + +Steps: + +1. Open **Settings** +2. Enter **Privacy & Security** +3. Scroll to the bottom and find **Developer Mode** +4. Turn it on, then restart the device as prompted +5. After restart, unlock the device and confirm enabling developer mode + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image9.png) + +If your iPhone has never been connected to Xcode or other development tools before, you may find that **Developer Mode** does not appear under **Privacy & Security**. This is not a system issue - it simply means developer mode has not yet been triggered. + +In that case, you can make it appear by following these steps: + +1. Open **Settings -> Privacy & Security -> Analytics & Improvements** +2. Turn on **Share With App Developers** +3. Go back one level, enter **Privacy & Security** again, and scroll to the bottom +4. You should now see **Developer Mode**, then enable it and restart the device + +After completing the above steps, developer mode only needs to be enabled once. Future real-device debugging with Xcode will not require repeating this configuration. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image10.png) + +### 2.2 Required Software + +After devices and systems are ready, you still need to install the software used for development. This tutorial only uses two categories of tools: the official iOS development tool and the AI-assisted development tool. + +#### 2.2.1 Xcode + +Xcode is Apple's official development tool for iOS. In this tutorial, it is mainly used to create iOS projects, compile Swift / SwiftUI code, and run the app on the simulator or a real device. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image11.png) + +Xcode can be found and installed directly from the App Store. After installation, when you open it for the first time, you will see the welcome screen. Later project creation starts from there. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image12.png) + +#### 2.2.2 Trae + +Trae is the main environment where development work is performed in this tutorial. You will place the whole iOS project into Trae and collaborate with AI through dialog to complete development. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image13.png) + +### 2.3 Apple ID and Development Debugging Notes + +On the iOS platform, in order for an app to be installed on a real device, it must go through developer signing. This tutorial does not require you to pay for Apple Developer Program membership. A personal Apple ID is enough. + +### 2.4 Checklist Before Moving On + +Before entering the next chapter, you can compare your current state with the checklist below. + +You should now already have: + +1. A Mac running a relatively recent macOS +2. An iPhone running a relatively recent iOS version with developer mode enabled +3. Xcode successfully installed +4. Trae installed and opened +5. A usable Apple ID + +If all of these are ready, you can continue and create your first iOS app. 
+ +## Chapter 3: Create the First iOS Project + +### 3.1 Use Xcode to Create a New Project + +Open Xcode. On the welcome screen, choose to create a new project. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image14.png) + +Click **Create new project** to enter the project template selection screen. + +### 3.2 Choose App Template and Tech Stack + +On the template selection screen, use the following configuration: + +1. Platform: iOS +2. Application type: App + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image15.png) + +Click **Next** to enter the project information configuration screen. + +### 3.3 Configure Project Information + +On the project information screen, just fill in the basic settings: + +1. Product Name: app name (for example `FridgeChef`) +2. Team: choose your personal Apple ID +3. Organization Identifier: reverse-domain format (for example `com.example`) +4. Bundle Identifier: generated automatically, keep default +5. Testing System: Swift Testing with XCTest UI Tests +6. Storage: choose Core Data (for later saving recipe history) +7. Leave the other options at default + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image16.png) + +Click **Next** and choose the project storage location. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image17.png) + +### 3.4 Recognize the Project Structure After Creation + +After the project is created, Xcode will automatically open the workspace. At this point, you do not need to understand every file. You only need to recognize a few key parts. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image18.png) + +In the default project, you will see: + +- A folder named after the project +- A Swift file ending with `App` (the application entry) +- A `ContentView.swift` file (the default page) + +This is already the smallest runnable iOS App. 
+ +### 3.5 Run the First iOS App + +Before changing any code, run the original project directly. + +In the top toolbar of Xcode, keep the default iPhone simulator selected, then click the **Run** button on the top left. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image19.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image20.png) + +If everything is normal, the simulator will show a blank app that can start successfully. The first compilation may take a relatively long time. In later chapters, we reduce waiting time by using HTML prototypes first. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image21.png) + +To stop the app, click **Stop** next to the Run button. + +### 3.6 What You Have Actually Achieved at This Stage + +Even though the interface is still simple, you have already completed several key confirmations: + +1. The project can compile successfully +2. The simulator can run the app correctly +3. The development process has already been proven to work end-to-end + +This means that future problems will mainly focus on **the code and logic themselves**, rather than environment issues. + +### 3.7 Hand the Project Over to Trae + +Starting from the next section, the main development work will gradually move into Trae. + +What you need to do is simple: **open the iOS project folder you just created in Trae.** + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image22.png) + +## Chapter 4: AI-Assisted Development Practice - Build FridgeChef from Scratch + +This chapter is the core part of the entire tutorial. + +This tutorial does not use the traditional route of "write SwiftUI first, repeatedly compile, and keep tweaking previews." 
Instead, we use a more efficient flow: +**first use HTML to quickly validate the interface structure, then migrate the confirmed result into SwiftUI, and finally gradually complete business logic, local data, and interaction details.** + +### 4.1 Stage One: Requirement Clarification + +Before writing code, the first step is not building pages - it is clarifying what we are building. **Let AI first act like a product manager and organize the requirements into a structured specification document.** + +In Trae's chat window, enter the following instruction. Trae will generate a `REQUIREMENTS.md` file in the project root, describing the functionality and structure of the whole app. + +📋 **Prompt to copy:** + +```text +We are now going to develop an iOS App called "FridgeChef". + +1. Core concept +This is an AI assistant that solves the problem of "I don't know what to cook with the leftover ingredients in my fridge." +Users input the ingredients they currently have, and the app calls a large model to generate a practical recipe. + +2. Core functions +- Home page: + Show a prominent "Start Cooking" entry, and below it display historical recipe records in card or list form. +- Input page: + Users input ingredients, supporting text input or simple quick tags. +- Result page: + Display the AI-generated recipe, including dish name, ingredient list, and cooking steps. + +3. Technical requirements +- Use SwiftUI +- Save data locally (Core Data) +- Support basic page navigation and state updates + +Please help me organize this into a clear, structured REQUIREMENTS.md document from the perspective of a product manager, and save it in the project root. +``` + +After generation, quickly read through the document and confirm whether the function points match your expectations. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image23.png) + +### 4.2 Stage Two: Visual Prototype + +Let AI quickly draw a high-fidelity interface prototype using **HTML + CSS**, so we can confirm the overall layout and style first. Continue by entering this in Trae: + +📋 **Prompt to copy:** + +```text +The requirements are confirmed. +Please use HTML + Tailwind CSS to generate a high-fidelity interface prototype for me. + +Design style: Neo-Pop +Colors: +- Background: light cream #FFFDF5 +- Accent colors: acid green #CCFF00, hot pink + +Visual characteristics: +- 3px thick black borders +- Hard shadow without blur (offset 4px) +- Large rounded cards, overall sticker / comic feeling + +Layout requirements: +- Home page should use a Bento Grid-like layout +- Include two screens: home page and input page + +Please generate a single-file index.html and simulate an iPhone screen ratio around the content. +``` + +After generation, find `index.html` in the file list and open it directly in a browser. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image24.png) + +At this stage, the point is not whether every detail is perfect. The point is whether **the page structure is reasonable, the main elements are complete, and the overall direction is correct.** + +### 4.3 Stage Three: Native Recreation + +Once the HTML prototype is finalized, **translate the confirmed interface into SwiftUI.** + +Steps: + +1. Upload the `index.html` file (or a browser screenshot) into Trae +2. Tell AI to generate SwiftUI code based on it + +📋 **Prompt to copy:** + +```text +[index.html uploaded] + +Please read the layout and style of this HTML file. + +Task: recreate this interface in the current project using SwiftUI. + +Requirements: +1. Encapsulate a NeoPopStyle modifier including background color, thick border, and hard shadow +2. Create HomeView.swift for the home layout +3. Create InputView.swift for the input page +4. 
Use Mock Data for now, and make sure it can display correctly in Xcode Preview and simulator +``` + +After it finishes, open Xcode and run the simulator. You will see an iOS app that already has a complete visual structure. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image25.png) + +### 4.4 Stage Four: Connect the AI API + +Once the interface is done, the app is still only a display layer. Next we need to connect real AI capability. In this tutorial we use the large-model service provided by **SiliconFlow**: +[https://cloud.siliconflow.cn](https://cloud.siliconflow.cn/) + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image26.png) + +SiliconFlow provides an API compatible with the OpenAI API specification, so it is very convenient to call from an iOS project using standard network requests. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image27.png) + +Before starting, you need to register an account on the site and create an API Key. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image28.png) + +This Key will be used for later model calls. + +📋 **Prompt to copy:** + +```text +Now we need to connect AI capability. + +Please create APIService.swift. + +Configuration: +- Base URL: https://api.siliconflow.cn/v1 +- Model: Qwen/Qwen2.5-7B-Instruct +- API Key: define it as a variable for now, I will fill it later + +Functions: +- Write a generateRecipe(ingredients: [String]) method +- The System Prompt must strictly require the model to return pure JSON only +- JSON fields should include: dishName, ingredients, steps + +Also define a RecipeModel struct for parsing the returned data. +``` + +After the code is generated, fill in your own Key inside `APIService.swift`. + +### 4.5 Stage Five: Core Data Local Storage + +To let the app remember the recipes it has generated, we need to bring in local data storage. This stage is divided into two steps. 
+ +**Step 1: manually configure Core Data in Xcode** + +1. Open `FridgeChef.xcdatamodeld` +2. Create a new Entity named `RecipeEntity` + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image29.png) + +3. Add the following attributes: + 1. `id`: **UUID** + 2. `name`: **String** + 3. `cookTime`: **String** + 4. `difficulty`: **String** + 5. `desc`: **String** + 6. `timestamp`: **Date** + 7. `colorIndex`: **Integer 16** + + ![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image30.png) + +**Step 2: let AI write the logic code** + +📋 **Prompt to copy:** + +```text +I have finished configuring the Core Data Entity. + +Entity: RecipeEntity +Attributes: id, name, difficulty, timestamp, colorIndex, cookTime, desc + +Please complete the following tasks: +1. Save data into Core Data after recipe generation succeeds +2. Use FetchRequest on the home page to read historical records and display them in reverse chronological order +3. When the database is empty, show a friendly empty-state message +``` + +### 4.6 Stage Six: Generate an App Icon + +The final step is to prepare a proper icon for the app. Here we use **Lovart** to generate the icon asset: [https://www.lovart.ai/zh](https://www.lovart.ai/zh) + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image31.png)![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image32.png) + +📋 **Prompt to copy into Lovart:** + +```text +Subject: A cute anthropomorphic fridge character with a happy face +Style: Minimalistic App Icon, Neo-pop style, thick black outlines, vector art +Colors: Acid green (#CCFF00) and deep blue +Background: Solid cream color +Negative Prompt: Text, realistic details, 3D render, complex background +``` + +After generation, crop the image to 1024x1024 and drag it into `Assets.xcassets` -> `AppIcon` in Xcode. 
+ +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image33.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image34.png) + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image35.png) + +Run the app again, and you will now see a complete, recognizable, real iOS application. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image36.png) + +### 4.7 Stage Seven: Advanced Experience Upgrade + +Once the functionality is stable, if you want to further improve the visual style, you only need to describe the effect you want to AI, let it generate a new design proposal, and then migrate the confirmed result into SwiftUI. + +📋 Reference Prompt: + +```text +The app's functionality is already complete, but I want to try a more visually impactful UI style. +Please first generate a new design draft in HTML + Tailwind CSS for me, with the file name design_v2.html. + +Design style: Neo-Pop (dopamine style) +Color requirements: +Use Deep Royal Blue as the full-screen background +Use Acid Green (#CCFF00) as the accent color + +Visual feel: +All cards should use a 3px thick black border +Use a hard shadow without transparency blur, shifted down-right + +Layout requirements: +Keep the home page structure unchanged +Use pill-shaped buttons and input boxes + +Please generate the full code so I can preview it in a browser. +``` + +After it is generated, open this HTML file in a browser. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image37.png) + +Once the HTML version is finalized, you can begin modifying the iOS project. + +📋 Reference Prompt: + +```text +[design_v2.html uploaded] +Please analyze the visual style of this HTML and migrate it into the current iOS project. 
+ +Task requirements: +Create a new NeoPopStyle.swift file +Encapsulate a neoPopBlue() style modifier + +The modifier needs to include: +- rounded corners +- thick black border +- opaque hard shadow + +Refactor HomeView: +- change the background to Deep Royal Blue +- use Acid Green for the primary button +- use white background for historical record cards +- make sure text remains clear and readable on the dark background + +Please provide the full modified code. +``` + +Click Run in Xcode again. If everything works, you should see: + +- the functionality is exactly the same as before +- the visual style has changed significantly +- the overall app quality feels noticeably upgraded + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image38.png) + +## Chapter 5: Running, Debugging, and Error Handling + +In the previous chapter, you completed the core functionality and successfully ran the app in the simulator. +But for an iOS app, true completion is not just "compiles successfully" - it is **stable operation, and knowing how to handle problems when they appear**. + +### 5.1 Run the App in Xcode + +First, make sure the project can run correctly in Xcode. + +In the top-left of Xcode, select the run device and keep the default iPhone simulator. Click the **Run** button to compile and run. If everything is normal, the app will launch in the simulator and display the interface built in Chapter 4. + +### 5.2 Run the App on a Real Device + +Connect your iPhone to the Mac using a cable. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image39.png) + +When connecting for the first time, the phone will show **Trust This Computer?** Tap trust and enter the unlock passcode. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image40.png) + +In Xcode's device list, select your iPhone, then click **Run** again. 
+ +At this point, you should be able to see the **FridgeChef** icon on your phone's home screen, and open and use it normally. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image41.png) + +This step marks the completion of one full iOS development closed loop. + +### 5.3 Where iOS Development Errors Usually Come From + +In real development, **encountering errors is normal**, not an exception. + +Common issues usually come from these categories: + +1. **Compilation errors** + Swift syntax, type mismatches, missing parameters, etc. Xcode will directly highlight them in red. +2. **Runtime errors** + The app compiles, but crashes during execution - for example, array out of bounds or force-unwrapping a nil value. +3. **Permission or configuration errors** + Network requests blocked by the system, missing Info.plist configuration, signing issues, etc. +4. **Logic errors** + The app does not crash, but the behavior is wrong - for example, buttons not responding or data not refreshing. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image42.png) + +When any error appears, you only need to **copy the full error message exactly as it is into Trae's chat box.** With awareness of the project context, Trae can help you do the debugging. + +### 5.4 Common Real-device Debugging Errors and Solutions + +Errors during real-device debugging are very common. These problems are usually not caused by code itself, but by device trust, security rules, or signing configuration. If the app cannot run on your iPhone smoothly, you can check this section first. + +#### 1. 
Signing and registration problems + +**Common symptoms:** + +- Xcode shows red errors like + `"Communication with Apple failed"` + or + `"No profiles for 'com.xxx.xxx' were found"` +- Or it says + `"Your team has no devices which are compatible"` + +**Cause:** + +- The Bundle Identifier is not unique or valid +- The current iPhone has not yet been registered under your Apple ID for development + +**Solution:** + +1. **Modify the Bundle Identifier** + In Xcode project settings, change the Bundle Identifier to something more unique, such as: + `com.yourname.FridgeChef` +2. **Let Xcode auto-register the device** + In the error prompt, click `Try Again` or `Register Device`, and let Xcode complete the device registration and certificate configuration automatically. + +#### 2. Device pairing and connection problems + +**Common symptoms:** + +- Xcode shows + `"Device is not available because pairing is in progress"` +- Or it says + `"Device Locked"` +- Or you already tapped Trust, but Xcode still remains stuck + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image43.png) + +**Cause:** + +- The iPhone is still locked +- The pairing process has not fully completed +- Xcode has not refreshed the connection state + +**Solution:** + +1. Unlock the phone + Make sure the iPhone is unlocked and stays on the home screen. +2. Finish the trust process + When the phone pops up **Trust This Computer?**, tap **Trust** and **enter the lock-screen passcode**. +3. Refresh the connection state + If it is still stuck, unplug the cable, wait 2-3 seconds, and reconnect. If necessary, restart Xcode and try again. + +#### 3. The app installs but cannot open + +**Common symptom:** + +- The app icon already appears on the iPhone home screen +- The system shows + **Untrusted Developer** + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image44.png) + +**Cause:** + +This is an iOS security mechanism. 
Debug apps installed with a personal Apple ID require manual trust authorization. + +**Solution:** + +1. Open **Settings** +2. Enter **General** +3. Tap **VPN & Device Management** +4. Under **Developer App**, find your Apple ID +5. Tap **Trust**, then confirm again + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image45.png) + +After that, return to the home screen and tap the app again. It should now run normally. + +## Chapter 6: If You Want to Publish the App to the App Store + +In this tutorial, what we mainly completed is the full closed loop for a **personal development and debugging version of an app**: from creating the project, implementing functions, and debugging, all the way to successfully installing and using it on a real device. + +If you want to go further and formally publish the app to the **Apple App Store** so that all users can download and use it, then you need to enter a more formal release process. Since that process involves a paid developer account, review rules, and compliance requirements, and is not the main practical focus of this tutorial, the following content is only provided as an **overall reference and roadmap**. + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image46.png) + +> The following content references Apple's official review requirements and public experience discussions (including original Zhihu sharing). Links are listed below. If any link becomes unavailable, you can search by title or keyword to find the original source. + +### 6.1 Apple Developer Program + +To publish an app to the App Store, you must join Apple's paid developer program: + +- **Apple Developer Program** (USD $99 per year) +- Official site: [https://developer.apple.com/](https://developer.apple.com/) + +After joining, you can use **App Store Connect** to create the app entry, manage versions, and publish formally. 
+ +### 6.2 App Store Connect: Create the App Entry + +In App Store Connect, you need to create a complete app record, including but not limited to: + +1. App name and Bundle ID +2. Description, keywords, and privacy policy link +3. App icon, screenshots, and preview materials +4. Pricing and distribution region settings + +All this information must be completed before submission can proceed. + +### 6.3 Build and Submit for Review + +After the metadata is ready, you need to: + +1. Use the paid developer account in Xcode to sign a Release build +2. Build and upload the formal version +3. Submit it for review in App Store Connect + +After submission, the app enters Apple's review queue. The review time is typically 1-3 days, depending on the case. + +### 6.4 Review Rules and Common Reasons for Rejection + +Apple mainly reviews apps from the following aspects: + +- functionality and stability +- privacy and data compliance +- consistency between metadata and actual functionality +- whether there is infringement or misleading behavior + +If the app does not meet the requirements, the submission will be rejected and Apple will provide a specific reason. The developer then needs to modify the app and resubmit. + +### 6.5 What to Do After Rejection + +If the app is rejected, you can: + +- modify the code or description according to the feedback +- resubmit the version +- communicate with the review team through App Store Connect + +This is a very common part of the publishing process and does not mean the project has failed. 
+ +### Reference sources + +The following content references Apple's official documentation and public experience sharing: + +- App Store Review Guidelines (Apple official) + [https://developer.apple.com/app-store/review/guidelines/](https://developer.apple.com/app-store/review/guidelines/) +- Official guide to submitting for review + [https://developer.apple.com/cn/help/app-store-connect/manage-submissions-to-app-review/submit-for-review](https://developer.apple.com/cn/help/app-store-connect/manage-submissions-to-app-review/submit-for-review) +- Full illustrated guide to iOS App Store publishing and review pitfalls (Zhihu) + [https://zhuanlan.zhihu.com/p/146128612](https://zhuanlan.zhihu.com/p/146128612) + +## Chapter 7: Summary + +![](../../../../zh-cn/stage-3/cross-platform/3.6-ios-app/images/image47.png) + +Congrats! At this point, you have personally walked through the complete iOS app development process from 0 to 1. From setting up the environment and running the project to gradually implementing the interface, functionality, data storage, and real-device testing, you have completed every key stage. More importantly, you did not get here by memorizing Swift syntax - you handed most of the implementation to AI. No matter what your background is, every attempt like this makes you more fluent, and you will realize that iOS development is not as difficult as it once seemed. Even if you could not write a single line of code before, you can still build your own app. + +Looking back, the whole process is not actually that complicated: decide what you want to build, use HTML to test the interface quickly, convert it into SwiftUI, connect the API and local data, and then run through debugging once. Based on this, in the future you can also casually build a personal alarm clock, a minimal todo list, or even a chatbot that speaks in the tone of your favorite celebrity. 
+ +This is exactly the most important thing that this tutorial - and easy-vibe - wants to teach you. I am looking forward to the newest creations from all of you future vibe coding masters, and to the day I get dazzled by your work. diff --git a/docs/en/stage-3/cross-platform/3.8-pwa-local-app/index.md b/docs/en/stage-3/cross-platform/3.8-pwa-local-app/index.md new file mode 100644 index 0000000..8df6b43 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.8-pwa-local-app/index.md @@ -0,0 +1,681 @@ +# How to Build a Local PWA App: Turn a Website into a "Real App" + +# 1 What PWA and PWA Development Are + +In this tutorial, we will complete a full closed loop: **from an ordinary web project to a "real app" that can be installed on a desktop and a phone home screen and still works when offline.** You will personally turn a React app into a PWA, deploy it online, and install it on your phone for testing. + +What we are going to build is a **Tomato Farm** app - a PWA that perfectly combines the Pomodoro technique with a farming game. You earn points through 25 minutes of focused work, then use those points to buy seeds and plant crops. As your level increases, you unlock more farmland and better seeds. Most importantly, it keeps working even without internet, and all data is stored locally. + +For this tutorial, you should at least have: + +- A computer (Windows or Mac) +- A Node.js environment (version 18.0 or above) +- Your AI coding assistant (Cursor / Trae / Claude Code, etc.) +- A phone (for testing mobile installation) + +## 1.1 Definition of PWA + +**PWA (Progressive Web App)** is a special kind of website. Through **Service Worker** technology, it gains the ability to "cache and take over itself." + +### Why ordinary websites cannot work offline, but PWAs can + +An ordinary website needs to download HTML, CSS, and JS files from the server every time it opens, so if the network is down, it simply cannot load. 
A PWA, on the other hand, uses a **Service Worker** (a JS script running in the browser background) to cache these files locally on the first visit. After that, even if the network is disconnected, the Service Worker can read files directly from local cache and display the page normally. + +**A simple analogy**: an ordinary website is like borrowing a book from a library every time (you must have internet), while a PWA is like buying the book and putting it on your own bookshelf (after the first download, you can still read it offline). + +### PWA vs Ordinary Website vs Native App + +| Feature | Ordinary Website | PWA | Native App | +|------|---------|-----|---------| +| **Installation** | Not needed | Optional (add to home screen) | Must download from app store | +| **Offline use** | ❌ No | ✅ Yes (after caching) | ✅ Yes | +| **Update method** | Auto refresh | Auto / background update | Manual user update | +| **Size** | None | A few hundred KB to a few MB | Tens of MB or more | +| **Development cost** | Low | Low (one codebase) | High (separate iOS / Android) | + +**One-sentence summary**: a PWA is "a webpage that can store its own files" - it has the lightness of a website (no installation required, auto-updating) and the experience of a native app (offline support, installable to desktop/home screen). + + + +## 1.2 Why Choose PWA? 
+ +In the Vibe Coding era, PWA is one of the most cost-effective "cross-platform solutions": + +| Comparison Dimension | Native App | PWA | +|---------|---------|-----| +| Development cost | Must develop iOS / Android / desktop separately | One codebase for all platforms | +| Installation | Must go to app store | Install directly in browser, instant | +| Update method | Users must update manually | Auto updates, invisible to user | +| Package size | Often tens of MB | Usually only a few hundred KB | +| Offline support | Built in naturally | Supported through Service Worker | +| Best scenarios | Deep hardware access needed (AR / Bluetooth, etc.) | Content display, tools, lightweight apps | + +**One-sentence summary**: if your app does not need deep hardware access (such as camera-based AR or Bluetooth), a PWA is almost always the easiest choice. + +## 1.3 Tutorial Roadmap + +To make the learning process less boring, this tutorial revolves around a fun and practical case - **Tomato Farm**. It is a Pomodoro farming game that combines focused work with gamified rewards. Together with the Vibe Coding mode of AI coding assistants, we will break the process from zero to phone installation into a reusable route: + +1. **Build understanding and environment**: understand what PWA is, install Node.js and an AI coding assistant, and make sure the toolchain is smooth. +2. **Build the project skeleton**: create a React + TypeScript project that can run locally. +3. **AI iterative development**: through conversation with AI, build Pomodoro countdown, farming system, level system, SVG crop rendering, and more. +4. **PWA configuration and offline testing**: add Service Worker and Manifest, then verify offline support. +5. **Deployment and phone installation**: deploy to Vercel to get an HTTPS URL, then install and use it on a phone. + +This section only gives the big picture, without expanding the exact commands. 
For now, just remember the main line: **Environment setup -> Skeleton building -> AI description and generation -> PWA configuration -> Deployment delivery**. In the next chapters, we will walk through each step with you. + +# 2 Development Environment Setup + +## 2.1 Tools Used in This Tutorial + +During the whole development process we use three tools together, and they take the roles of "design," "construction," and "acceptance." + +- **AI coding assistant (Cursor / Trae / Claude Code)**: this is your **AI coding partner**. In Vibe Coding mode, we no longer need to write code line by line. Instead, we mainly tell AI in natural language what functionality we want, and it handles code generation and modification. +- **Node.js + Vite**: these are the **project build factory**. Node.js provides the JavaScript runtime, and Vite is a next-generation frontend build tool with extremely fast speed, especially suitable for building PWAs. +- **A phone**: this acts as the **test device** to verify the running result. You can directly access the deployed PWA in the browser on your phone and test the real installation and offline functionality. + +## 2.2 Install Node.js + +Node.js is the basic environment for PWA development. Visit the official website [https://nodejs.org](https://nodejs.org) and download the **LTS (Long Term Support)** version (this tutorial is based on Node.js 18.x or above). + +After download, install it like ordinary software by double-clicking the installer and keeping default options. + +After installation, open the terminal (CMD / PowerShell on Windows, Terminal on Mac) and run: + +```bash +node --version +npm --version +``` + +If you see version outputs such as `v18.17.0` and `9.6.7`, it means installation is successful. + + + +## 2.3 Install the AI Coding Assistant + +The AI coding assistant is the main battlefield of **Vibe Coding**. 
You can simply understand it as an **"editor with a super AI built in."** + +**Recommended choices:** + +- **Trae**: visit [https://www.trae.cn](https://www.trae.cn) and download the matching version for your OS +- **Cursor**: visit [https://cursor.sh](https://cursor.sh) and install it +- **Claude Code**: if you are already using Claude, you can use Claude Code directly + +The installation process is very simple, just like installing normal software. After preparing this tool, in later practice we no longer need to stare at boring code windows. Instead, we will open the project here and use natural language in the chat box to ask AI to write code and fix bugs. + + + +## 2.4 Create a New Project + +Open your AI coding assistant and enter the following Prompt in the chat box: + +```text +Please help me create a React project named tomato-farm-pwa for building a Tomato Farm app. +It needs to support TypeScript, and also include PWA functionality (the kind that can be installed to a phone home screen). 
+``` + +AI will automatically perform the following steps: + +**Step 1: Create the project** + +```bash +npm create vite@latest tomato-farm-pwa -- --template react-ts +``` + +**Step 2: Enter the project and install dependencies** + +```bash +cd tomato-farm-pwa +npm install +``` + +**Step 3: Install the PWA plugin** + +```bash +npm install vite-plugin-pwa -D +``` + +After AI finishes, your project structure will roughly look like this: + +```text +tomato-farm-pwa/ +├── public/ # Static assets (icons, SVG materials go here) +├── src/ +│ ├── App.tsx # Main component +│ ├── main.tsx # Entry file +│ └── App.css # Styles +├── index.html # HTML entry +├── vite.config.ts # Vite config (PWA config goes here) +├── package.json +└── tsconfig.json +``` + +## 2.5 Understand the Project Structure + +After the project is created, we need to understand the role of several key files: + +| File / Directory | Purpose | +|----------|---------| +| `src/App.tsx` | Main application component, where the core page logic is written | +| `src/main.tsx` | Application entry file, responsible for mounting the React app | +| `vite.config.ts` | Vite configuration file, where the core PWA config is written | +| `public/` | Static asset directory, where PWA icons and SVG materials go | +| `index.html` | HTML entry file, usually does not need modification | + +As beginners, we mainly need to care about three parts: + +- `App.tsx`: controls program behavior and decides "what appears on screen" +- `vite.config.ts`: configures PWA behavior and decides "how the app is installed and cached" +- `public/`: stores the app icons and assets + +## 2.6 Prepare App Icons + +PWA needs icons before it can be installed. At minimum, we need two PNG images in **192x192** and **512x512** sizes. + +You can ask AI to generate them: + +```text +Please help me generate two app icons with sizes 192x192 and 512x512. +Use a green gradient background and draw a red tomato in the middle. Save them in the public folder. 
+``` + +Or you can also create your own icons with any design tool (Figma, Canva) and put them into the `public/` directory. + + + +## 2.7 Configure `vite-plugin-pwa` + +This is the most critical step. Open `vite.config.ts` and ask AI to configure the PWA plugin: + +```text +Please help me change vite.config.ts into a PWA configuration so the webpage can be installed to a phone home screen: +- The app name is "Tomato Farm", with a green theme +- Use icon-192.png and icon-512.png from the public directory as icons +- Enable automatic updates +- Cache all js, css, html, and image files so the app can work offline +``` + +AI will generate a configuration similar to this: + +```typescript +import { defineConfig } from 'vite' +import react from '@vitejs/plugin-react' +import { VitePWA } from 'vite-plugin-pwa' + +export default defineConfig({ + plugins: [ + react(), + VitePWA({ + registerType: 'autoUpdate', + manifest: { + name: 'Tomato Farm', + short_name: 'Tomato Farm', + description: 'Focus, plant, and grow', + theme_color: '#4CAF50', + background_color: '#ffffff', + display: 'standalone', + icons: [ + { + src: '/icon-192.png', + sizes: '192x192', + type: 'image/png' + }, + { + src: '/icon-512.png', + sizes: '512x512', + type: 'image/png' + } + ] + }, + workbox: { + globPatterns: ['**/*.{js,css,html,ico,png,svg}'] + } + }) + ] +}) +``` + +**Key configuration explanation:** + +* `registerType: 'autoUpdate'`: when you publish a new version, the app will update automatically the next time users open it, without manual operation. +* `display: 'standalone'`: after installation, it runs in its own window, without browser address bar, and feels like a native app. +* `workbox.globPatterns`: tells the Service Worker which file types should be cached and still accessible offline. + + + +# 3 Build the Tomato Farm PWA + +In the previous two chapters, we already understood what a PWA is and completed the environment setup. 
From this section onward, we stop talking only in theory and move into hands-on practice. We will use Vibe Coding mode to build a fun and practical app from scratch - **Tomato Farm**. It perfectly combines the Pomodoro technique with gamified incentives and covers the core elements of PWA development: **UI interaction (Pomodoro timer), data storage (points and crops), and offline capability (Service Worker caching).** + +Now, let us send the first instruction to AI. + +## 3.1 The First "Master Prompt": From Zero to One + +In Vibe Coding mode, we do not need to follow the traditional approach of first creating layout files and then writing logic code. What we need to do is **describe the requirements clearly in one shot and let AI generate the first runnable version**. + +Open the project directory we just created in your AI coding assistant, and enter the following Prompt: + +```text +Please help me write the main page for the Tomato Farm app, with the following functions: + +**Pomodoro Timer** +- A 25-minute countdown timer with start, pause, and reset +- Show remaining time and a progress bar +- Give the user 10 points after completing one focus session + +**Farming System** +- 3 plots of farmland, but initially only the first one is available; the later ones are unlocked after leveling up +- A shop to buy seeds: carrot costs 5 points, tomato 10 points, corn 15 points +- After buying seeds and planting them, crops slowly grow, and when mature they can be harvested for points + +**Level System** +- Level by total points: 0-100 points = Beginner Farmer, 100-300 = Skilled Farmer, above 300 = Farm Master +- Unlock new land and better seeds after leveling up + +**UI Design** +- Top shows level, points, and upgrade progress bar +- Middle shows the Pomodoro countdown +- Below is the farmland grid +- Bottom has the shop button +- Use a green theme and make it look fresh and cute +- Must adapt to phone screens + +**Data Saving** +- All data (points, level, farmland state) 
must be saved, and refreshing the page should not lose it +``` + +After sending it, you will see AI start reasoning and analyzing your project structure. A few seconds later, it will directly generate the complete code for `App.tsx`. + +1. From its response, we can see its reasoning logic and interaction logic +2. We can directly see which code it changed +3. If we are not satisfied, we can roll back to the previous version + + + +## 3.2 Run and Preview (Local Development Server) + +Now AI has completed the first round of development, but remember: what we see in the coding assistant is still just code "blueprints," not a truly interactive app. We need to start a local development server so we can actually run the code and view the real effect. + +Run this in the terminal of your AI coding assistant: + +```bash +npm run dev +``` + +After a few seconds, the terminal will show output like this: + +```text + VITE v5.0.0 ready in 300 ms + + -> Local: http://localhost:5173/ + -> Network: use --host to expose + -> press h + enter to show help +``` + +Open `http://localhost:5173/` in your browser, and you should see: + +- level, points, and a progress bar at the top +- a Pomodoro countdown in the middle +- farmland area below +- a shop button at the bottom + +Try clicking the "Start Focus" button and see if the countdown works properly. Click on a farmland tile and see if you can buy seeds and plant them. This is the first version of your PWA app. + + + +## 3.3 Optimization Iteration (Add SVG Crops and Animation) + +At this point, our app already has a basic shape: Pomodoro timer, farming system, and leveling system. But it may still look rough, with crops perhaps shown only as text or simple blocks. Next, we will add beautiful SVG crops and growth animation to make the Tomato Farm come alive. 
+ +**This is exactly where Vibe Coding becomes so attractive.** In traditional development, drawing SVG graphics and building complex growth animations can be a nightmare for beginners. You not only need to handle SVG path drawing, but also calculate animation curves. In Vibe Coding mode, you do not need to worry about those low-level details. You just tell AI like a director: "Give the crops nicer SVG graphics and make them grow with animation," and the complex code appears almost instantly. + +**Step 1: Prepare SVG crop assets** + +You can ask AI to draw SVG directly in code, or prepare SVG files and put them under `public/`. In this tutorial, we recommend letting AI generate SVG code directly because it is more flexible. + +**Step 2: Send an iteration instruction** + +Return to the AI coding assistant and enter the following Prompt: + +```text +Please make the crops look better and add growth animation: + +**Crop graphics** +- Carrot: orange body with green leaves +- Tomato: red round shape with little green leaves +- Corn: yellow corn cob with green outer leaves +Just use simple shapes + +**Growth animation** +- When first planted, it starts as a small sprout and gradually grows to maturity +- Show 3 stages + +**Harvest effect** +- When clicking a mature crop, play a simple harvest animation +- Show how many points were gained + +**Overall polish** +- Farmland tiles should have borders and background color +- Crops should appear centered in the tile +- Overall style should feel a little cuter +``` + +AI will modify the code again and handle the SVG rendering and animation logic. After it finishes, refresh the browser, and you should see better crop graphics and smooth growth animations. + + + +## 3.4 Add Sound Effects and Notifications (Optional) + +If you want Tomato Farm to feel more immersive, you can also add sound effects and notifications. 
This also only needs a simple Prompt: + +```text +Please add sound effects and notifications to Tomato Farm: + +**Sound effects** +- Play a "ding" when focus starts +- Play a victory sound when focus is completed +- Also add matching sound effects for planting and harvesting + +**Notifications** +- Show "Congratulations, you finished a focus session!" after a focus cycle ends +- Show "Congratulations, you leveled up to XX!" when leveling up +- Show "You unlocked a new farmland plot!" when new land is unlocked + +You can implement this with simple audio files or the Web Audio API +``` + +AI will help you add sound effects and notifications, making the Tomato Farm more lively and enjoyable. + + + +# 4 Experience the PWA Locally + +## 4.1 Build and Preview + +The PWA Service Worker only takes effect in production builds (it will not register in development mode). So we need to build first, then preview: + +```text +Please help me run these commands: +1. npm run build (build production version) +2. npm run preview (start local preview server) +``` + +After build, Vite will generate all files in the `dist/` directory, including the auto-generated `sw.js` (Service Worker) and `manifest.webmanifest`. + +Once the preview server starts, open the address shown in the terminal (usually `http://localhost:4173`). + +## 4.2 Install the PWA on Desktop + +After opening the preview URL, you will notice an **install icon** appears on the right side of the browser address bar (usually a small download arrow or "+" sign). + +**Chrome / Edge installation steps:** + +1. Click the install icon on the right side of the address bar +2. Click **Install** in the popup dialog +3. The PWA will open in a standalone window, and a shortcut will be created on your desktop / Start Menu / Dock + +The installed PWA looks just like a native desktop app - no address bar, no tabs, with its own window and icon. Now you can open Tomato Farm anytime and begin your focus-and-farming journey. 
+ + + +**macOS Safari installation steps:** + +1. Open the PWA URL in Safari +2. Click **File -> Add to Dock** from the menu bar +3. The PWA icon will appear in the Dock + +## 4.3 Test Offline Capability + +This is the coolest part of PWA. Let us verify whether offline mode really works: + +1. Make sure the PWA has been opened in the browser at least once (so the Service Worker can cache resources) +2. **Disconnect the network** (turn off Wi-Fi or unplug the cable) +3. Refresh the page - you will find that **Tomato Farm still loads normally!** +4. Start a Pomodoro session - after it finishes you gain points, buy seeds, plant crops - and all the data is still saved normally in `localStorage` + +You can also open Chrome DevTools (F12) -> Application -> Service Workers to inspect Service Worker status and cached resource lists. + + + +## 4.4 Data Persistence and Sync Options + +Now your Tomato Farm can already run offline, and all data is saved in the browser's `localStorage`. But there is one key problem: **if the user switches devices or clears browser data, all farm data will be lost**. For serious production apps, we need to think about data persistence and cross-device synchronization. + +### 4.4.1 Limitations of Local Storage + +The `localStorage` we are currently using has several obvious limitations: + +| Limitation | Description | +|--------|------| +| **Device-bound** | Data is only stored in the current browser on the current device; switching devices means losing it | +| **Limited capacity** | Usually only 5-10MB of storage space | +| **Easy to lose** | Clearing browser data or uninstalling the PWA causes data loss | +| **Cannot sync** | Progress on phone cannot sync to desktop | + +If your Tomato Farm is just a personal tool, this may not be a problem. But if you want users to invest long term and accumulate data, a more reliable solution is needed. 
+ +### 4.4.2 Option 1: Cloud Sync (Recommended) + +The most reliable solution is synchronizing data to a cloud database. For PWAs, **Supabase** is an excellent choice - it provides a PostgreSQL database, real-time subscriptions, and authentication, and also offers a free tier. + +**Implementation idea:** + +1. **User login**: use email or social login to establish user identity +2. **Automatic data sync**: every operation automatically saves to the cloud +3. **Offline-first**: the app still works when offline, then syncs automatically when the network returns +4. **Cross-device sync**: progress on phone is available immediately on desktop + +**Prompt example:** + +```text +Please help me migrate Tomato Farm data storage from localStorage to Supabase cloud sync: + +**Functional requirements** +- Add user login (email + password or Google login) +- Save user data (points, level, farmland state) to Supabase database +- Still work offline, and automatically sync when the network recovers +- Support multi-device sync, so crops planted on the phone can also be seen on desktop + +**Tech stack** +- Use @supabase/supabase-js client +- Implement optimistic updates (update UI first, then sync to cloud) +- Add a simple sync status indicator +``` + +**Pros:** + +- Data will not be lost; users only need to log in again when switching devices +- Free tier is enough for personal projects +- Supports real-time subscriptions, giving good multi-device sync experience + +**Cons:** + +- Requires user registration/login, adding usage friction +- Needs network connection to perform syncing + +### 4.4.3 Option 2: Export / Import Backup + +If you do not want to add a backend service, a simpler compromise is **manual backup and restore**. + +**Implementation idea:** + +1. **Export**: package farm data as a JSON file and let users download it +2. **Import**: users can select a previously exported JSON file to restore data +3. 
**Automatic reminder**: remind users to back up periodically + +**Prompt example:** + +```text +Please add data backup functionality to Tomato Farm: + +**Export** +- Add an "Export Data" button on the settings page +- Package all data in localStorage into a JSON file +- Automatically download it to the user's device + +**Import** +- Add an "Import Data" button that accepts a JSON file +- Validate file format before restoring +- Show a warning before import because it overwrites current data + +**Automatic reminders** +- If the user has not backed up for over 7 days, show a friendly reminder +``` + +**Pros:** + +- Simple to implement, no backend service required +- Users fully control their own data +- Can transfer across devices by sharing the exported file + +**Cons:** + +- Requires manual operation, so the experience is not smooth +- If the user forgets to back up, data can still be lost + +### 4.4.4 Option 3: Browser Extension Sync (For Chrome Users) + +If your PWA mainly targets Chrome users, you can consider **Chrome Storage Sync API**. This is a cross-device synced storage service provided by Chrome, where data automatically syncs with the user's Google account. + +**Note:** this requires packaging the PWA as a Chrome extension as well, which is more suitable for developers with technical experience. + +### 4.4.5 Recommended Choice Strategy + +| Scenario | Recommended Solution | +|------|----------| +| Personal lightweight tool | `localStorage` only is enough | +| Want to avoid data loss, but do not want too much complexity | Export / import backup | +| Official product with better user experience | Supabase cloud sync | +| Mainly for Chrome users | Chrome Storage Sync | + +**For an app like Tomato Farm, my suggestion is:** + +1. **MVP stage**: start with `localStorage` to verify the product idea quickly +2. **Iteration stage**: add export / import backup so users have a data safety net +3. 
**Mature stage**: integrate Supabase to achieve real cloud synchronization + +Remember: **progressive enhancement** is the core philosophy of PWA. First make the app run, then gradually add more advanced capabilities. + + + +# 5 Deploy Online + +PWA must run under HTTPS in order to work correctly. The good news is that mainstream deployment platforms now provide free HTTPS automatically. We will use **Vercel** as an example (you could also use Netlify or GitHub Pages). + +## 5.1 Deploy to Vercel + +**Step 1: Install the deployment tool** + +```text +Please help me install Vercel's deployment tool +``` + +**Step 2: Deploy the project** + +```text +Please help me deploy this project to Vercel. The project name is tomato-farm-pwa +``` + +AI will handle the deployment steps automatically. You only need to: +- choose your account +- confirm creating a new project +- keep the other options at default + +After waiting a few dozen seconds, Vercel will automatically build and deploy your project. When done, you will get an HTTPS URL like `https://tomato-farm-pwa.vercel.app`. + + + +**Step 3: Verify the PWA** + +Open the deployed URL in your browser, and you should see: + +1. an install icon appear on the right side of the address bar +2. in DevTools -> Application -> Manifest, your configured app info such as the name "Tomato Farm" +3. in the Service Workers tab, the Service Worker shown as activated + +## 5.2 Deploy with GitHub Pages (Alternative) + +If you prefer GitHub Pages, you need additional path configuration: + +```text +Please help me modify the config so the project can be deployed to GitHub Pages. +My repository name is tomato-farm-pwa, so please adjust the path configuration accordingly. +``` + +Then push the build output to the `gh-pages` branch of your GitHub repository. + +# 6 Install the PWA on a Phone + +This is the most exciting part - turning your Tomato Farm webpage into an "app" on your phone. + +## 6.1 Install on Android + +1. 
Open your deployed Tomato Farm PWA URL in the **Chrome browser** on your phone +2. Chrome may automatically show an **"Add to Home screen"** prompt banner - just click it +3. If it does not show automatically, tap the **three-dot menu** in the top-right corner -> **Install app** or **Add to Home screen** +4. Confirm installation, and a Tomato Farm app icon will appear on your phone's home screen + +Open it and you will notice it runs in full-screen mode, without the browser address bar or navigation buttons, looking almost exactly like a native app. Now you can start focusing and farming anytime. + + + +## 6.2 Install on iPhone + +On iOS, PWA can only be installed through the **Safari** browser (other browsers do not support installation): + +1. Open your deployed Tomato Farm PWA URL in **Safari** +2. Tap the **Share** button at the bottom (square with an upward arrow) +3. In the menu, choose **Add to Home Screen** +4. Give the app a name and tap **Add** + +Starting from iOS 26, all websites added to the home screen will open in standalone app mode by default, which is a major improvement. + + + +> **Known limitations on iOS:** +> * Push notifications require iOS 16.4 or above, and the PWA must already be added to the home screen +> * Background Sync is not supported +> * Storage space is more limited than on Android + +## 6.3 Audit Your PWA with Lighthouse + +Google provides a tool called **Lighthouse**, which can score your PWA. Open Chrome DevTools (F12) -> Lighthouse -> check "Progressive Web App" -> click "Analyze page load." + +A qualified Tomato Farm PWA should get a full score in the PWA category. If not, Lighthouse will tell you the exact reasons and suggest fixes. + + + +# 7 Final Notes + +Congratulations! You have successfully built a Pomodoro farming PWA that can be installed on both desktop and mobile. Let us review what we did: + +1. Created a Tomato Farm web app with Vite + React +2. Added Service Worker and Manifest via `vite-plugin-pwa` +3. 
Deployed it to Vercel to get an HTTPS URL +4. Successfully installed it on both desktop and mobile, and tested offline capability + +Now your Tomato Farm PWA can already achieve: +* **Focus farming**: help users stay focused through the Pomodoro mechanism +* **Gamified rewards**: use planting, leveling, and unlocking to motivate repeated use +* **Offline usability**: even with no network, users can still focus, plant, and manage their farm +* **Cross-platform installation**: develop once and install on multiple kinds of devices + +The charm of PWA is its "progressiveness" - you do not need to make it perfect at the very beginning. First make the website installable and available offline, then gradually add advanced capabilities such as push notifications and background sync. + +**Advanced directions:** + +* **Push notifications**: use Push API + Notification API to remind users when a Pomodoro finishes, or when crops are ready to harvest +* **Background sync**: use Background Sync API to sync farm data to the cloud after the network returns +* **Smarter caching strategies**: use different Workbox strategies such as CacheFirst, NetworkFirst, and StaleWhileRevalidate for different kinds of assets +* **Publish to app stores**: use [PWA Builder](https://www.pwabuilder.com/) to package the Tomato Farm PWA into an Android APK or Microsoft Store app +* **Social features**: add a friend system so users can visit each other's farms and exchange crops + +***One codebase, all platforms - this is the power of PWA. 
Focus, plant, and grow!*** + +# References + +* [Vite PWA Official Docs](https://vite-pwa-org.netlify.app/guide/) +* [Google PWA Development Guide](https://web.dev/progressive-web-apps/) +* [MDN Web App Manifest Docs](https://developer.mozilla.org/en-US/docs/Web/Manifest) +* [Workbox Caching Strategies Overview](https://developer.chrome.com/docs/workbox/caching-strategies-overview/) +* [PWA Builder - Publish PWA to App Stores](https://www.pwabuilder.com/) diff --git a/docs/en/stage-3/cross-platform/3.9-browser-ai-extension/index.md b/docs/en/stage-3/cross-platform/3.9-browser-ai-extension/index.md new file mode 100644 index 0000000..0a1b627 --- /dev/null +++ b/docs/en/stage-3/cross-platform/3.9-browser-ai-extension/index.md @@ -0,0 +1,503 @@ +# How to Build a Browser AI Assistant Extension: Summarize Any Webpage in One Click + +# Chapter 1: What Browser Extensions and Chrome Extension Development Are + +In this tutorial, we will complete a full closed loop: build an AI-driven Chrome browser extension from scratch. It can read the content of any webpage you are browsing, then use AI to generate a one-click summary. You will personally complete the extension development, debugging, and learn how to publish it to the Chrome Web Store. + +For this tutorial, you should at least have: + +- Chrome browser (version 138+ recommended if you want to use built-in AI) +- A code editor (VS Code / Cursor / Trae) +- (Optional) An OpenAI or Claude API Key + +## 1.1 What Is a Browser Extension? + +You have definitely used browser extensions before: ad blockers, translation tools, password managers... They are like "extra gear" for your browser, giving you superpowers while browsing the web. + +Imagine this: you open a 5,000-word technical blog post, click the extension button once, and a few seconds later a concise Chinese summary appears in the side panel. That is exactly what we are going to build. 
+ +![placeholder: A preview image showing a long article webpage on the left and an AI-generated summary displayed in the Chrome side panel on the right](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image1.png) + + + +## 1.2 The Basic Architecture of a Chrome Extension + +Chrome extensions (based on Manifest V3) consist of several core parts, each with its own role: + +* **Manifest file (`manifest.json`)**: the extension's "ID card," declaring its name, permissions, entry files, and more. +* **Service Worker (background script)**: the extension's "brain," handling events and calling APIs in the background. It does not run continuously, but starts when needed. +* **Content Script**: the extension's "eyes," injected into webpages and able to read DOM content. +* **Side Panel**: the extension's "face," showing UI on the right side of the browser where users see AI summary results. +* **Options Page**: lets users configure API Key and related settings. + +Their workflow looks like this: + +```text +User clicks the extension icon + -> Side panel opens + -> User clicks the "Summarize" button + -> Side panel notifies the Service Worker + -> Service Worker asks Content Script to read page text + -> Content Script returns page content + -> Service Worker sends content to AI API + -> AI returns the summary + -> Service Worker sends the summary back to the side panel for display +``` + +![placeholder: An architecture flowchart showing how Content Script, Service Worker, and Side Panel pass messages to each other](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image2.png) + + +## 1.3 Two AI Options: Cloud API vs Built-in Browser AI + +Our extension has two ways to access AI capability: + +**Option A: Call cloud AI APIs (OpenAI / Claude)** + +* Pros: powerful model capability, supports all devices +* Cons: needs an API Key, requires internet, has usage cost +* Best for: high-quality summaries and handling more complex 
content + +**Option B: Use Chrome built-in AI (Summarizer API)** + +Starting from Chrome 138, Google built AI capability based on Gemini Nano directly into the browser. One of them is the **Summarizer API** - it runs entirely locally, requires no API Key, no internet, and is completely free. + +* Pros: free, privacy-friendly, no API Key needed +* Cons: requires Chrome 138+, better hardware (4GB+ VRAM or 16GB+ RAM), model capability is weaker than cloud AI +* Best for: users who care about privacy, do not want to pay, and have sufficient hardware + +**This tutorial will implement both options**, and you can choose based on your own situation. + +## 1.4 Tutorial Roadmap + +We will build a Chrome extension called **"AI Page Summarizer"** from scratch, following these steps: + +1. **Build the extension skeleton**: create a Manifest V3 project structure and load it into Chrome +2. **Implement the core feature**: Content Script reads the page + Service Worker calls AI API + side panel shows results +3. **Integrate Chrome built-in AI**: use Summarizer API to provide free local summarization +4. **Testing and debugging**: learn Chrome extension debugging techniques +5. **Publish to Chrome Web Store**: package and submit for review + +# Chapter 2: Build the Extension Skeleton + +## 2.1 Create the Project Structure + +Open your AI coding assistant (Cursor / Trae / Claude Code), create an empty folder named `ai-page-summarizer`, then enter the following in the chat box: + +```text +Please help me create a Chrome browser extension project using Manifest V3. +The project name is ai-page-summarizer, and its function is to summarize webpage content with AI. 
+Please create the following file structure: + +ai-page-summarizer/ +├── manifest.json # MV3 manifest file +├── background.js # Service Worker background script +├── content.js # Content script (reads webpage text) +├── sidepanel.html # Side panel HTML +├── sidepanel.js # Side panel logic +├── sidepanel.css # Side panel styling +├── options.html # Settings page +├── options.js # Settings page logic +└── icons/ # Icons folder + +Requirements for manifest.json: +1. manifest_version: 3 +2. Permissions: storage, activeTab, scripting, sidePanel +3. Use service_worker: "background.js" for background +4. Configure side_panel with default path sidepanel.html +5. Configure default icon and title for action +``` + +AI will generate the full project skeleton for you. Let us look at what each file does. + +## 2.2 `manifest.json`: The Extension's "ID Card" + +This is the most important file in a Chrome extension. It tells the browser what the extension is, what permissions it needs, and which components it contains: + +```json +{ + "manifest_version": 3, + "name": "AI Page Summarizer", + "version": "1.0", + "description": "Use AI to summarize any webpage in one click", + "permissions": ["storage", "activeTab", "scripting", "sidePanel"], + "background": { + "service_worker": "background.js" + }, + "action": { + "default_title": "AI Page Summarizer", + "default_icon": { + "16": "icons/icon-16.png", + "48": "icons/icon-48.png", + "128": "icons/icon-128.png" + } + }, + "side_panel": { + "default_path": "sidepanel.html" + }, + "options_page": "options.html", + "icons": { + "16": "icons/icon-16.png", + "48": "icons/icon-48.png", + "128": "icons/icon-128.png" + } +} +``` + +**Permission explanation:** + +* `storage`: lets the extension store data such as the user's API Key +* `activeTab`: lets the extension access the current tab the user is viewing (only after user interaction, so it is very safe) +* `scripting`: lets the extension inject scripts into pages to read content +* 
`sidePanel`: lets the extension use Chrome side panel API + +![placeholder: Screenshot of manifest.json in the editor](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image2b.png) + + +## 2.3 Prepare Icons + +Chrome extensions need icons in three sizes: 16x16, 48x48, and 128x128. You can ask AI to generate them: + +```text +Please help me generate three simple Chrome extension icons (16x16, 48x48, 128x128), +with a rounded rectangle, gradient purple background, and a white AI lightning symbol in the center. +Save them in the icons/ directory as icon-16.png, icon-48.png, and icon-128.png. +``` + +## 2.4 Load the Extension into Chrome + +Before writing code, let us first load this "empty shell" extension into Chrome, so every later change can be previewed immediately: + +1. Open Chrome and enter `chrome://extensions/` in the address bar +2. Turn on **Developer mode** in the top-right corner +3. Click **Load unpacked** +4. Select your `ai-page-summarizer` folder + +You will see the extension appear in the list, and its icon will show up in the Chrome toolbar. + +![placeholder: Screenshot of Chrome extensions page showing how to enable developer mode and load an extension](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image3.png) + + + +> **Tip**: after every code change, go back to `chrome://extensions/` and click the **refresh button (🔄)** on the extension card to update it. + +# Chapter 3: Implement the Core Feature - Read Page + AI Summary + +## 3.1 Content Script: Read Page Text + +Content Script is a script injected into the webpage. It can directly access the page DOM. We use it to extract page text. + +Ask AI to write `content.js`: + +```text +Please help me write content.js with the following functions: +1. Listen for messages from Service Worker +2. When receiving a "getPageContent" message, extract the current page text content +3. 
Extraction logic: get document.body.innerText, and also get the page title and URL +4. Return the extracted content via sendResponse +``` + +AI will generate code like this: + +```javascript +// content.js +chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { + if (request.action === 'getPageContent') { + const content = document.body.innerText || document.body.textContent + sendResponse({ + content: content.trim(), + title: document.title, + url: window.location.href + }) + } + return true // Keep the message channel open +}) +``` + +## 3.2 Service Worker: Call AI API + +Service Worker is the extension's "brain." It coordinates communication among components and calls external AI APIs. + +Ask AI to write `background.js`: + +```text +Please help me write background.js with the following functions: +1. When the user clicks the extension icon, open the side panel +2. Listen for "summarize" messages from the side panel +3. After receiving the message, send "getPageContent" to the content script in the current tab to get page content +4. After receiving the page content, read the user's configured API Key and model selection from chrome.storage.local +5. Call the corresponding AI API according to the configuration (support OpenAI and Claude) +6. Send the AI summary back to the side panel + +For OpenAI, call https://api.openai.com/v1/chat/completions and use model gpt-4o-mini +For Claude, call https://api.anthropic.com/v1/messages and use model claude-sonnet-4-20250514 +System prompt: Please summarize the following webpage content in Chinese, extract the key points, and keep it within 300 Chinese characters. 
+``` + +Core code looks like this: + +```javascript +// background.js + +// Open the side panel when the user clicks the icon +chrome.sidePanel.setPanelBehavior({ openPanelOnActionClick: true }) + +// Listen for messages from the side panel +chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { + if (request.action === 'summarize') { + handleSummarize(request.tabId).then(sendResponse) + return true // Async response + } +}) + +async function handleSummarize(tabId) { + // 1. Get page content + const response = await chrome.tabs.sendMessage(tabId, { + action: 'getPageContent' + }) + + // 2. Read user settings + const { apiKey, provider } = await chrome.storage.local.get([ + 'apiKey', 'provider' + ]) + + if (!apiKey) { + return { error: 'Please configure your API Key in the settings page first' } + } + + // 3. Call AI API + const summary = provider === 'claude' + ? await callClaude(response.content, apiKey) + : await callOpenAI(response.content, apiKey) + + return { summary, title: response.title } +} +``` + +![](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image4.png) + + +## 3.3 Side Panel UI: Show Summary Result + +The side panel is the main interaction UI for users. 
Ask AI to write the HTML, CSS, and JS for the side panel: + +```text +Please help me write these three files for the side panel: + +sidepanel.html: +- Show the plugin name "AI Page Summarizer" at the top +- A blue "Summarize Current Page" button +- A loading animation area (hidden by default) +- A result display area showing the page title and AI summary +- A "Copy Summary" button at the bottom + +sidepanel.css: +- Clean modern design, similar to Notion typography +- Width adapts to the side panel +- Buttons have hover effects +- Loading animation implemented with CSS + +sidepanel.js: +- When clicking the "Summarize" button, get the current tab ID +- Send a summarize message to background.js +- Show loading animation +- Hide loading and display summary after receiving result +- Use navigator.clipboard.writeText in the "Copy" button to copy text +``` + +![placeholder: Screenshot of side panel UI showing three states: summary button, loading state, and summary result](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image5.png) + + + +## 3.4 Settings Page: Configure API Key + +Users need a place to enter their own API Key. Ask AI to write the settings page: + +```text +Please help me write options.html and options.js: +- A dropdown to choose AI provider (OpenAI / Claude) +- A password input for API Key (type="password") +- A "Save" button +- Save config with chrome.storage.local.set +- Read saved config from storage and fill the form on page load +- Show "Settings saved" after saving +``` + +> **Security reminder**: the API Key is stored in `chrome.storage.local` and only kept on the local device. But if you want to publish this extension to the Chrome Web Store for others to use, a safer approach is to build a backend proxy server so the API Key is not exposed directly on the client side. 
+![placeholder: Screenshot of settings page showing provider selection and API Key input p1](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image6-1.png) +![placeholder: Screenshot of settings page showing provider selection and API Key input p2](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image6-2.png) +![placeholder: Screenshot of settings page showing provider selection and API Key input p3](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image6-3.png) + + +# Chapter 4: Use Chrome Built-in AI (No API Key Needed) + +Starting from Chrome 138, Google built AI capability based on **Gemini Nano** directly into the browser. The one best suited for our case is the **Summarizer API** - it runs entirely locally, needs no API Key, needs no internet, and is free. + +## 4.1 Check Browser Support + +Built-in AI has hardware requirements: + +* Desktop Chrome 138+ (Windows 10+, macOS 13+, Linux, ChromeOS) +* 22 GB available storage space (for model download) +* 4GB+ GPU VRAM, or 16GB+ system RAM with 4+ CPU cores + +Enter `chrome://flags` in Chrome address bar, search for the flag related to Summarization, and ensure it is **Enabled**. +* In Chrome 131-137, this switch is called Summarization API. +* In Chrome 138-144, it was renamed to Summarization API for Gemini Nano. +* In Chrome 145+, Summarization API for Gemini Nano was removed, and its summarization function was integrated into Prompt API for Gemini Nano. + +![placeholder: Screenshot of chrome://flags showing the Summarization API switch](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image7.png) + + +## 4.2 Use Summarizer API + +Ask AI to add built-in AI support in `background.js`: + +```text +Please help me add Chrome built-in Summarizer API support in background.js: +1. Add a summarizeWithBuiltinAI function +2. First check whether Summarizer.availability() returns 'available' +3. 
If available, create a summarizer instance, configure type as 'key-points', format as 'markdown', and length as 'medium' +4. Call summarizer.summarize() to summarize +5. In handleSummarize, add a branch for provider === 'builtin' +``` + +Core code: + +```javascript +async function summarizeWithBuiltinAI(text) { + // Check availability + const availability = await Summarizer.availability() + if (availability !== 'available') { + throw new Error('Chrome built-in AI is not available. Please check browser version and hardware requirements.') + } + + // Create summarizer + const summarizer = await Summarizer.create({ + type: 'key-points', + format: 'markdown', + length: 'medium' + }) + + // Run summary + const summary = await summarizer.summarize(text, { + context: 'This is a webpage article' + }) + + return summary +} +``` + +## 4.3 Update the Settings Page + +Add a **"Chrome Built-in AI (Free, No API Key Needed)"** option to the provider dropdown in `options.html`. When users choose it, hide the API Key input because it is no longer needed. + +```text +Please help me modify options.html and options.js: +1. Add an option "Chrome built-in AI (free, no API Key needed)" to the provider dropdown, with value "builtin" +2. Hide the API Key input when builtin is selected +3. Show the API Key input when OpenAI or Claude is selected +``` + +![placeholder: Screenshot of updated settings page showing three AI provider options, with API Key input hidden when Chrome built-in AI is selected](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image8.png) + + +# Chapter 5: Testing and Debugging + +## 5.1 Local Testing Workflow + +Debugging Chrome extensions is a bit different from debugging normal webpages: + +**Debug Service Worker:** +1. Open `chrome://extensions/` +2. Find your extension and click the **Service Worker** link +3. A dedicated DevTools window opens where you can see `console.log` output and network requests + +**Debug Side Panel:** +1. 
Open the side panel +2. Right-click inside the side panel content +3. Choose **Inspect** +4. This opens DevTools for the side panel + +**Debug Content Script:** +1. Open DevTools with F12 on any webpage +2. In the Console panel, click the execution context dropdown in the top-left +3. Select your extension name +4. Then you can see `console` output from the Content Script + +![placeholder: Screenshot of Chrome DevTools showing how to choose different execution contexts to debug different extension components](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image9.png) + + +## 5.2 Common Troubleshooting + +| Problem | Possible Cause | Solution | +|------|---------|---------| +| Clicking the icon does nothing | Service Worker error | Check the Service Worker DevTools Console | +| Cannot get page content | Content Script not injected | Refresh the page and try again, check `matches` config in manifest | +| API call fails | API Key is wrong or expired | Re-enter the API Key in the settings page | +| Side panel is blank | `sidepanel.html` path is wrong | Check `side_panel.default_path` in manifest | + + +# Chapter 6: Publish to Chrome Web Store (Optional) + +If you want to share the extension with others, you can publish it to the Chrome Web Store. + +## 6.1 Prepare for Publishing + +1. **Register a developer account**: visit [Chrome Web Store Developer Dashboard](https://chrome.google.com/webstore/devconsole) and pay the one-time $5 registration fee +2. **Enable 2-Step Verification**: your Google account must enable 2-Step Verification before publishing +3. **Prepare assets**: + * Extension icon: 128x128 PNG + * At least one screenshot: 1280x800 recommended + * Detailed functional description + * Privacy policy explanation (if your extension processes user data) + +## 6.2 Package and Upload + +1. Compress the extension folder as a `.zip` file (not `.crx`) +2. Click **New Item** in Developer Dashboard +3. Upload the `.zip` file +4. 
Fill in store information (name, description, screenshots, category, etc.) +5. Fill in privacy practices (declare what user data your extension collects) +6. Click **Submit for Review** + +Google will review submitted extensions, which usually takes several business days. The fewer permissions you request and the clearer your description is, the faster the review usually goes. + +![placeholder: Screenshot of Chrome Web Store Developer Dashboard showing extension upload and metadata form](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image10.png) +![placeholder: Screenshot of Chrome Web Store Developer Dashboard showing extension upload and metadata form p2](../../../../zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/images/image10-1.png) + + + +# Chapter 7: Final Notes + +Congratulations! You have built an AI-driven browser extension from scratch. Let us review what we did: + +1. Understood the Manifest V3 architecture of Chrome extensions +2. Used Content Script to read webpage content +3. Used Service Worker to call AI APIs and generate summaries +4. Used Side Panel to display the summary result +5. Also learned how to use Chrome built-in AI without any API Key + +Browser extension development is a very interesting field - it lets you "enhance" any webpage on the internet. Besides summarizing pages, you can build many more things with a similar architecture: + +**Advanced directions:** + +* **Translation assistant**: translate foreign webpages into Chinese in one click +* **Reading annotations**: highlight and annotate pages, then save to the cloud +* **Price tracking**: monitor price changes on e-commerce pages and notify users +* **Code explainer**: select code on GitHub and let AI explain it automatically + +The arrival of Chrome built-in AI lowers the barrier even further - you do not even need an API Key to build AI-powered extensions. 
As browser AI capabilities continue to grow, the room for imagination in this field will only expand. + +***Go give your browser some superpowers!*** + +# References + +* [Chrome Extension Official Docs - Manifest V3](https://developer.chrome.com/docs/extensions/develop/) +* [Publish Chrome Extension to Chrome Web Store](https://developer.chrome.com/docs/webstore/publish?hl=zh-cn) +* [Chrome Side Panel API](https://developer.chrome.com/docs/extensions/reference/api/sidePanel) +* [Chrome Built-in AI - Summarizer API](https://developer.chrome.com/docs/ai/summarizer-api) +* [Chrome Built-in AI - Prompt API](https://developer.chrome.com/docs/ai/prompt-api) +* [OpenAI API Docs](https://platform.openai.com/docs/api-reference) +* [Anthropic Claude API Docs](https://docs.anthropic.com/en/docs/) +* [Chrome Web Store Developer Dashboard](https://chrome.google.com/webstore/devconsole) diff --git a/docs/en/stage-3/index.md b/docs/en/stage-3/index.md index c831588..1642fc8 100644 --- a/docs/en/stage-3/index.md +++ b/docs/en/stage-3/index.md @@ -1,93 +1,153 @@ # Advanced Development -Welcome to the **Advanced Development** stage! Here, you will build complex cross-platform applications, master WeChat Mini Program development, and challenge yourself with more advanced AI-native application development. +Welcome to the **Advanced Development** stage! Here, you will build complex cross-platform applications, master WeChat Mini Program development in practice, and explore deeper AI-native application development. 
## What You Will Learn ### Core Skills -Deeply master the MCP protocol and Claude Code advanced techniques to improve development efficiency: +Master the MCP protocol and advanced Claude Code techniques in depth to improve development efficiency: + + + + + + - ### Cross-Platform Development -Build WeChat Mini Programs, Android, and iOS applications to achieve cross-platform coverage: +Build WeChat Mini Programs, Android and iOS applications, and achieve cross-platform coverage: + + + + + + + + - ### Personal Brand -Build your own personal website and tech blog to establish personal influence: +Build your own personal website and technical blog to establish personal influence: + +### AI Capabilities Appendix -### Advanced AI Capabilities +Explore advanced AI technologies such as RAG and LangGraph to build complex AI application workflows: -Explore advanced AI technologies like RAG and LangGraph to build complex AI application workflows: - ## Who Is This For - Advanced developers with full-stack development experience who want to challenge more complex applications - Engineers who want to master cross-platform development technologies - Explorers who want to deeply understand AI-native application development -- Tech bloggers who want to build their personal technical brand +- Technical bloggers who want to establish their personal technical brand ## Prerequisites -- Complete the "Full-Stack Development" stage, or have full-stack development experience -- Familiar with frontend frameworks (such as React/Vue) and backend development +- Complete the "Junior-to-Intermediate Development" stage, or have full-stack development experience +- Be familiar with frontend frameworks (such as React/Vue) and backend development - Understand basic AI concepts and API usage -Ready to challenge advanced development? Click the left navigation to begin learning! +Ready to challenge advanced development? Click the left navigation to start learning! 
diff --git a/docs/en/stage-3/personal-brand/3.7-personal-website-blog/index.md b/docs/en/stage-3/personal-brand/3.7-personal-website-blog/index.md new file mode 100644 index 0000000..293092d --- /dev/null +++ b/docs/en/stage-3/personal-brand/3.7-personal-website-blog/index.md @@ -0,0 +1,985 @@ +# How to Build Your Own Personal Website and Academic Blog - Static Deployment with GitHub Pages + +# 1. What Is a Personal Website and Academic Blog? + +In this tutorial, we will run through a complete closed loop: **from finding an existing website template, to modifying it into a personal homepage for Elon Musk, and finally publishing it online for free**. + +For this tutorial, you should at least have: + +* **A computer** (Windows or Mac) +* **Your GitHub account** (used to store website code and provide free hosting) +* **Trae installed** (your AI coding partner) +* **A Git environment** +* **A Ruby environment** + +## 1.1 What is an academic personal homepage? + +An **academic personal homepage** is your own private territory on the internet. + +Unlike WeChat Moments, Zhihu, or LinkedIn, it does not depend on any platform's recommendation algorithm, and it will not disappear if a platform shuts down. It is a long-term, stable **personal showcase space** that can be indexed by Google and Google Scholar. It usually contains your bio, publications, projects, and technical blog. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image1.png) + +## 1.2 Why build your own website? + +In the Vibe Coding development model, we no longer need to work through thick HTML/CSS books like people did ten years ago. With AI, the role of building a website shifts from "struggling coder" to "website editor-in-chief": + +1. **You (Editor / PM)**: decide the site's tone and content. For example: "Put Musk's Mars colonization PPT here," or "Change this button to Tesla red." +2. **Trae (AI Engineer)**: handles the hard implementation work. 
It turns your natural-language instructions into code, including layout, color schemes, and mobile adaptation. +3. **GitHub Pages (Showroom)**: provides a free server and domain so people around the world can see your work. + +**Why is it worth having for academics or technical people?** + +* **Externally (building influence)**: it is an **"evergreen business card."** When applying for PhD programs, jobs, or collaborations, a tidy personal homepage is often much more persuasive than a PDF resume. +* **Internally (knowledge accumulation)**: it is your **"second brain."** You can use it to record course notes, technical thinking, and build your own knowledge system. +* **For the future (being discoverable)**: search engines like structured content. With a homepage, when people search your name, **the content you define** can appear first, instead of unrelated people with the same name. + +## 1.3 Four typical ways to build a personal website + +In practice, there are countless ways to build a website. Here we introduce only the four most mainstream ones: + +**Method 1: hand-writing from scratch with HTML / CSS / JS** +This is the traditional computer science route. You write the code character by character. The advantage is extreme flexibility. The disadvantage is a very high barrier to entry, and it is easy to get stuck while tweaking CSS. It is not ideal for those of us who want to focus on content. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image2.png) + +**Method 2: visual site builders such as Wix / WordPress** +This is like building with blocks. The advantage is easy drag-and-drop editing. The disadvantage is that it often requires payment, tends to generate bloated code, lacks an academic-geek feel, and is difficult to customize deeply. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image3.png) + +**Method 3: GitHub-based templates (Static Site Generators)** +This is the **most recommended** mainstream route in academic and geek communities. We directly fork a mature template written by others, such as one based on Jekyll or Hugo, and then only modify the configuration files and content. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image4.png) + +**Method 4: Vibe Coding (AI visual generation flow)** +With AI agents that have strong multimodal visual understanding, you only need to see a website style you like online, take a screenshot, and tell the AI: "Write me a webpage based on this style." The AI can then analyze the visual elements and generate the underlying code for you. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image5.png) + +**The choice in this tutorial: GitHub Pages + academic template + AI modifications.** +The reason is simple: + +* **Zero cost**: no need to buy a server, no need to buy a domain. +* **High quality**: templates are often designed by top developers, with minimal style, professional structure, and fast loading. +* **Easy to maintain**: you mainly write Markdown, similar to writing in Feishu Docs or Notion, and AI helps generate the webpage. + +## 1.4 The full roadmap of this tutorial + +To make the configuration process more intuitive and less boring, we will use a fun case: **building an academic homepage for Musk**. + +Although Elon Musk is not a university professor, he has published many public "technical white papers," such as *Hyperloop Alpha*, and also has many famous projects, such as SpaceX and Tesla. We will use those materials as test data and, together with Trae's Vibe Coding workflow, walk through a reusable site-building route: + +1. **Find the skeleton**: locate a high-quality website template on GitHub and fork it into your own repository. +2. 
**Prepare the environment**: pull the code locally and configure Trae so the AI can read your project. +3. **Iterate with AI**: replace the template's placeholder person with Elon Musk, upload his resume, change the "publication list" into a "technical white paper showcase," and even ask AI to recolor the site to "Mars red." +4. **Deploy online**: push the modified code back to GitHub and instantly get an accessible website URL. + +This section is only responsible for drawing the big picture. For now, just remember the main line: +**Fork template -> AI renovation -> push online** +In the following sections, we will walk through every step together. + +# 2. Environment Preparation + +## 2.1 Tools used in this tutorial + +The whole build process uses five tools or resources, each playing a role such as designer, contractor, landowner, or logistics system. + +* **A computer**: Windows or Mac is fine. Unlike Android development, which often has high memory requirements, web development is very lightweight and runs smoothly on an ordinary office laptop. +* **Trae**: this is your **AI coding partner** and core productivity tool. In Vibe Coding mode, you do not need to master HTML or CSS syntax. You mainly tell AI in natural language, such as "Change the navigation bar to black" or "Put Musk's photo here," and let it write and modify the code for you. +* **A GitHub account**: this is your **free server and code vault**. We need it to store all website files. Most importantly, we will use **GitHub Pages** to turn the code into a globally accessible URL for free, eliminating the need to buy a server or domain. +* **Git environment**: this is the backstage **courier**. Although we write code locally in Trae, Git is what pushes the code from your computer to GitHub. You do not need to master Git commands, and Trae can help invoke them, but Git must be installed first. +* **Ruby environment**: this is the local **web page workshop**. 
Because the academic template in this tutorial uses Jekyll, which runs on Ruby, we need Ruby locally so we can preview the website on our own computer before pushing it online. + +## 2.2 Download Trae + +**Trae** is our main battlefield for Vibe Coding. You can think of it as a **code editor with a super AI built in**. Unlike traditional cold editors, it is like an experienced programmer sitting next to you, always ready to help. + +* **Download address**: visit the official site [https://www.trae.cn](https://www.trae.cn) and download the version for your operating system, Windows or Mac. +* **Installation**: installation is very simple, just like installing WeChat or QQ. Double-click the installer package and click "Next" until it finishes. + +After preparing this tool, in the following practical steps we will not need to stare at boring code panes. We will directly open the project here and use the chat panel on the right to tell the AI in natural language, in Chinese if you like, to help us write code, fix bugs, and even refactor whole pages. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image6.png) + +## 2.3 Download Git + +**What is Git?** +If Trae is the AI engineer responsible for writing code in Vibe Coding, then **Git is the courier responsible for transporting code**. You need it to package the code written on your computer and safely push it to GitHub, your cloud repository. Without it, your site runs only on your own machine and no one else can see it. + +In the past, you had to go to the official site, download the installer, and configure environment variables manually. That was annoying. Now, we can simply let Trae help detect and install it. + +**Step 1: Check whether Git is already installed** + +Open Trae and type the following instruction in the chat panel at the lower right: + +```markdown +Please help me check whether Git is already installed on this computer. 
Please run the `git --version` command in the terminal. +``` + +* **Case A (already installed)**: if you see something like `git version 2.xx.x`, congratulations. You can skip the installation step directly. +* **Case B (not installed)**: if you see "command not found" or a group of red error messages, continue below. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image7.png) + +**Step 2: AI-assisted installation** + +Do not close Trae. Continue typing in the chat panel: + +**Instruction (Windows users):** + +```markdown +I have not installed Git. Please write the command that uses the `winget` command-line tool to install Git automatically, and tell me how to run it in the terminal. +``` + +**Instruction (Mac users):** + +```markdown +I have not installed Git. Please tell me how to quickly install Git through terminal commands, for example using `git` or `brew`. +``` + +Trae will give you a command, often something like `winget install --id Git.Git`. + +You only need to click the **Run in Terminal** button in the code block or copy it into the terminal at the bottom and press Enter. It will automatically download and install Git for you. + +If you still feel the AI-assisted process is not perfect enough, you can refer to this tutorial for manual download and installation: +[Git download and installation tutorial](https://blog.csdn.net/weixin_41293671/article/details/144255269?ops_request_misc=elastic_search_misc&request_id=63236900b52320a7beb177787ba97f07&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~baidu_landing_v2~default-5-144255269-null-null.142^v102^pc_search_result_base4&utm_term=git%E4%B8%8B%E8%BD%BD%E5%AE%89%E8%A3%85&spm=1018.2226.3001.4187) + +## 2.4 Install the Ruby environment + +Before we officially start writing code, we still need one last piece of the puzzle. The academic homepage template used in this tutorial is built with Jekyll, which itself is based on the Ruby programming language. 
+ +To preview and debug the "renovation effect" on your own computer before pushing the code to GitHub for the world to see, we must install a Ruby environment on the computer. Think of this as hiring an interpreter on your computer who understands Ruby. Do not worry, you do not need to learn how to write Ruby. You only need to install it, and Trae can handle the rest. + +### 2.4.1 Windows installation + +**Step 1: Download the installer using a domestic mirror** + +For Windows users, the official site at https://rubyinstaller.org/downloads/ provides one-click installers, but because of network differences, it helps to know a trick. The official recommendation for beginners is usually **`Ruby+Devkit 3.X.X (x64)`**, because it includes the required toolchain. + +**Beginner reminder**: in practice, downloading directly from the official site may be slow or fail. We strongly recommend using the domestic mirror at [RubyInstaller for Windows - China mirror](https://rubyinstaller.cn/), which is usually much faster. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image8.png) + +**Step 2: Run the installation** + +Double-click the downloaded installer. In the setup wizard, make sure to check **"Add Ruby executables to your PATH."** This is the most important step. Otherwise the computer will not be able to "find" the interpreter you just installed. + +After checking it, keep clicking **Next** to complete the installation. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image9.png) + +**Step 3: Configure the development toolkit** + +When the installation progress finishes, a black command-line window will open automatically. Do not panic. Type the number `3` where the cursor is blinking, which means installing the MSYS2 base environment and the MINGW toolchain, then press Enter. Wait until the commands finish running and the window closes automatically. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image10.png) + +**Step 4: Verify the result** + +Now it is time to ask AI to check your homework. Open Trae and type the following natural-language instruction in the right-side chat: + +```markdown +Please help me check whether the Ruby environment has been installed correctly on this computer. Please run the `ruby -v` command in the terminal at the bottom and tell me the result. +``` + +If Trae replies with something like `ruby 3.x.x`, then your Windows Ruby environment is fully set up. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image11.png) + +### 2.4.2 Mac installation + +Configuring a Mac environment feels more "geeky" because it usually requires terminal commands. But in Vibe Coding mode, we do not even need to open the terminal manually. We can just let Trae act as our personal IT operator. + +**Step 1: Give the one-shot environment setup instruction** + +Open Trae and paste the following natural-language instruction into the chat on the right. We will ask it to handle checking Homebrew, installing it if missing, then installing Ruby: + +```markdown +I am using a Mac computer and need to configure a Ruby development environment. Please help me complete the following steps: +1. Check whether Homebrew is already installed. If not, please run Homebrew's official installation script in the terminal. +2. After confirming Homebrew is ready, run `brew install ruby` in the terminal. +3. When everything is done, run `ruby -v` to confirm the installation succeeded. +Please guide me step by step, and when necessary provide terminal commands that I can click and run directly. +``` + +After receiving the instruction, Trae will start working and show code blocks with run buttons in the chat panel. + +**Important note for beginners** + +When installing Homebrew, the terminal often prompts something like `Password:` and asks for your Mac login password. 
+ +**Note:** when you type a password in the Mac terminal, the screen will not show any characters or stars. This is normal. Just type your password blindly and press Enter. + +**Step 2: Verify the result** + +After installation, go back to Trae and type: + +```markdown +I just installed Ruby on this Mac through `brew`. Please help me run the `ruby -v` command in the terminal and check whether the installation and environment variables are correct. +``` + +When you see something like `ruby 3.x.x` in the terminal, the local webpage workshop is ready and your Mac is prepared for Vibe Coding. + +## 2.5 Register a GitHub account + +**What is GitHub?** +If Git is the courier, then **GitHub is the cloud warehouse and showroom**. It not only hosts your code for free, but more importantly, with **GitHub Pages** it can turn your code into a globally accessible website URL. It is also the world's largest code hosting platform, and having a GitHub account is a kind of passport into the technical world. + +**Registration steps:** + +1. **Visit the official site**: open [https://github.com/](https://github.com/). +2. **Click Sign up**: click **"Sign up"** in the upper right corner. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image12.png) + +3. **Fill in your information** +4. **Email**: enter a real email address. +5. **Password**: choose a strong password. +6. **Username (important!)**: **choose carefully**. Your homepage URL will later become **`https://your-username.github.io`**. It is best to use your English name, pinyin, a familiar ID, or a simple combination of letters and numbers. Do **not** choose something like `a1b2c3d4`, otherwise your website link will be hard to remember. +7. **Verification and activation**: complete the human verification, often rotating images or choosing spiral galaxies, then check your email for the verification code. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image13.png) + +Once registration is complete, you have a plot of your own on the internet. In the next section, we will begin building on that plot. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image14.png) + +# 3. From Template to Your First Accessible Page + +Everything is ready. In the first two chapters, we prepared the tools. In this chapter, we will officially claim land on the internet. The task in this chapter is simple: +**Do not worry about decoration or content yet. First build the site's skeleton and get a live access link.** + +We will directly fork a mature academic template and use GitHub Pages automation to get it running within twenty minutes. When finished, you will have a globally accessible link. + +## 3.1 Get a website template + +In Vibe Coding mode, we do not need to write HTML from scratch. GitHub has thousands of excellent open-source templates. We only need to "borrow" one and change the name to our own. + +**Step 1: Find a template** + +Here we have selected a classic template with a clear structure and strong suitability for academic display: +https://github.com/luost26/academic-homepage?tab=readme-ov-file +This template is based on the Jekyll framework. + +Of course, you can also search **`academic-homepage`** on GitHub and pick another style you like, but to follow this tutorial, it is recommended to use the template above first. + +We also prepared several additional template recommendations for you: + +* Minimal Light personal homepage theme: https://github.com/yaoyao-liu/minimal-light? 
+* Minimal Mistakes: [https://github.com/mmistakes/minimal-mistakes](https://github.com/mmistakes/minimal-mistakes) +* Pixyll: https://github.com/johno/pixyll +* Hydejack: https://github.com/hydecorp/hydejack +* Forty Jekyll Theme: https://github.com/andrewbanchich/forty-jekyll-theme +* Leonids: https://github.com/renyuanz/leonids +* YAT: https://github.com/jeffreytse/jekyll-theme-yat + +**Step 2: Fork the project** + +Visit the target repository homepage and click the **Fork** button in the upper right corner. A confirmation box will pop up. Click **Create Fork** directly. + +* Explanation: this step is equivalent to copying someone else's code repository with a full set of keys into your own GitHub account. Now, you own your copy of the site. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image15.png) + +**Step 3: Rename the repository, the most important step** + +Change the repository name to: +`your-username.github.io` + +**Important note for beginners**: +This is a hard rule of GitHub Pages. +For example, if your GitHub username is `musk-fan`, then the repository name **must** be `musk-fan.github.io`. +Only this way will GitHub automatically assign you a free domain. If the name is wrong, the webpage will not open later. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image16.png) + +## 3.2 Get the GitHub project URL + +After renaming, we need the repository pickup slip. + +1. Return to the repository homepage, under the **Code** tab. +2. Click the green **Code** button. +3. Make sure the **HTTPS** tab is selected. +4. Click the copy button and copy the URL ending in `.git`, for example `https://github.com/musk-fan/musk-fan.github.io.git`. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image17.png) + +## 3.3 Pull the project locally + +In the past, programmers had to type complex Git commands in a black terminal to download code. In the Vibe Coding era, we have Trae. We only need to tell AI, "I want this, help me pull it down." + +**Step 1: Preparation** + +Create a new folder on your computer, for example `MyWebsite`, then right-click and choose **Open with Trae**, or open Trae first and choose **Open Folder**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image18.png) + +**Step 2: Give the clone command** + +After Trae opens, bring up the AI chat panel on the right and enter the following natural-language instruction: + +```text +Please help me clone the remote GitHub repository into the current folder. +Repository address: paste the URL you just copied, for example https://github.com/musk-fan/musk-fan.github.io.git +Execution requirement: please run the `git clone` command directly in the terminal. +``` + +**Step 3: Confirm the download** + +Trae will automatically invoke the terminal at the bottom and execute the command. Wait a few seconds. When you see files such as `_config.yml` and `index.html` appear in the file tree on the left, the project has been successfully moved to your computer. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image19.png) + +## 3.4 Preview the webpage locally + +The code is on your machine and the Ruby environment is ready. Before we modify the site, we must first inspect it locally on our own computer. This is like renovating a house: you first arrange everything in the showroom, confirm it looks right, and only then open it publicly. + +Thanks to the Ruby environment installed in **Section 2.4**, this is now very simple. + +**Step 1: Install dependencies** + +A Jekyll site depends on many Gems to run. This is like buying all the furniture from a shopping list. 
**However**, because of network conditions, direct downloads can stall. We will ask Trae to **switch to a domestic mirror** and install dependencies there. + +In Trae's chat box, enter: + +```markdown +I need to install the Jekyll dependencies. Considering the network environment, please first change the `source` in the Gemfile to the domestic mirror `https://gems.ruby-china.com/`. After that, please run the `bundle install` command in the terminal to install all dependencies. +``` + +**Step 2: Start the local service** + +Now we will start a **local server** to simulate the website running. Continue and tell Trae: + +```markdown +The dependencies have finished installing. Please help me start the Jekyll local preview service in the terminal. Please run the `bundle exec jekyll serve` command. +``` + +After the terminal runs for a few seconds, you will see something similar to: +`Server address: http://127.0.0.1:4000/academic-homepage/` + +1. **Open the browser**: click that link, or type it directly into your browser: + `http://127.0.0.1:4000/academic-homepage/` +2. **See the magic**: now your site is already running in the browser. Although it still shows the original template author's name, it is already running locally on your computer. + +From this point on, whenever you change content and press `Ctrl+S`, then refresh the browser, **the webpage content will change with it**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image20.png) + +Once local preview works, we can enter the next chapter and start turning the website into something shaped like Elon Musk. + +# 4. AI-Assisted Content Modification + +To help everyone quickly experience the full process, we will not use our own personal information, to avoid privacy anxiety. Instead, we will use **Elon Musk as an example** and build an academic homepage for him. This lets us drop the boring pressure of writing a personal resume and focus on the fun of Vibe Coding for websites. 
It also lets us see how cool it is to place the "technical white papers" of a Silicon Valley iron man, such as *Hyperloop Alpha*, on an academic-style website. + +We will go through the complete loop from **getting the template** to **publishing the site**, and build a world-class personal showcase space by hand. + +Follow my pace and send the first instruction to AI. + +## 4.1 Unified global constraints + +This is the **global setup prompt**. You only need to send it once. +Its purpose is to set rules for the AI, to prevent it from improvising and breaking the site structure. Copy it directly into Trae: + +```text +You are now the maintainer of a “GitHub Pages + Jekyll academic homepage template” site. +The current repository is a Jekyll-powered academic homepage (including `_config.yml`, `_data`, `_layouts`, etc.). +Your modifications must follow these principles: +1. Each step should only solve the current stage goal. Do not do later-stage content in advance. +2. Do not modify the site structure, do not introduce new plugins, and do not change the theme style. +3. All content must be renderable by Jekyll without errors. +4. All identity information must follow an “academic-style simulation” tone and must not use first-person voice. +5. Do not invent obviously fake IEEE / Nature papers. +6. If information is uncertain, use “publicly well-known facts” or “reasonable academic simulation labeling.” +``` + +## 4.2 Build Musk's homepage, the content part + +### 4.2.1 First global instruction: replace the identity + +The first thing we need to solve is "Who am I?" The template is filled with the original author's information, and we need to replace it with AI in one go. + +**Step 1: Prepare the assets** + +Put the image assets I provide to you, `University_of_Pennsylvania.jpg` and `Queen_University.jpg`, into the corresponding project folder, usually `/assets/images/badges/`. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image21.png) +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image22.png) + +**Step 2: Send the instruction** + +In Trae's right-side chat box, enter the following prompt. Note that we do not need to find and edit lines manually. We just tell AI what we want: + +```text +1. Goal: replace the “person identity” of the current academic homepage with Elon Musk. Only modify the basic profile information. +2. Specific requirements: +1. Name: Elon Musk +2. Professional identity: + Technology Entrepreneur + Engineer + Founder & CEO of SpaceX + CEO of Tesla, Inc. +3. Education: + Queen’s University (Physics and Economics, not completed) (image path: /assets/images/badges/Queen_University.jpg) + University of Pennsylvania (B.S. in Physics, B.A. in Economics) (image path: /assets/images/badges/University_of_Pennsylvania.jpg) +4. Research Interests (can be simulated as): + Space Systems Engineering + Sustainable Energy Systems + Artificial Intelligence & Robotics + Large-scale Technological Innovation +5. Honors & Recognition: + Time Person of the Year (2021) + Fellow of the Royal Society (FRS) + Listed in Forbes Billionaires (multiple years) +6. Constraints: + Do not add papers / publications + Do not invent IEEE, Nature, or Science papers + Use academic-style wording and avoid commercial promotional tone + Keep the original field structure unchanged and only replace the content +``` + +At this point, you can see that Trae has completed all our modification requirements. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image23.png) + +**Step 3: Refresh the local browser** + +Refresh the local browser now, and you should see everything replaced correctly. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image24.png) + +### 4.2.2 Iterative improvement: add "papers" and projects + +Because Elon Musk is not a traditional university professor, he rarely publishes papers in *Nature* or *Science*. But as a "chief engineer," he has released many highly technical **white papers** and **master plans**. + +Within the context of an academic homepage, we can redefine the meaning of "Publications" as **"Technical White Papers & Visionary Plans."** This is not awkward at all. In fact, it fits his builder identity very well. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image25.png) + +**Step 1: Prepare the assets** + +Download the cover images I provide, namely `Hyperloop_Alpha_sketch.jpg`, `SpaceX_Starship.jpg`, and `Neuralink_sewing_machine_robot.jpg`, place them under `/assets/images/covers/`, and remove the example images originally in that folder. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image26.png) +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image27.png) +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image28.png) + +**Step 2: Send the instruction** + +Send the following prompt to Trae and let it help us rebuild the data structure: + +```text +1. Role setting: you are a static site development expert who is proficient in Jekyll and Liquid syntax. +2. Task goal: +Modify the section title on the homepage or in the navigation bar. +The current file structure is organized by year subfolders, for example `_publications/2023/xxx.md`. +Create three new Markdown files in the specified format to display Elon Musk's technical white papers and visionary plans. +3. Specific steps and requirements: +1. Modify the section title + Please search globally for the string "Selected Publications" (it may appear in `index.html`, `_config.yml`, or `_pages/publications.md`). 
+ Replace it with: "Technical White Papers & Visionary Plans". +2. Rebuild the publication data (critical step) + Clear all old content under the `_publications` folder, including old year folders such as 2023 and 2024. + Create three new folders: `_publications/2013/`, `_publications/2017/`, and `_publications/2019/`. + In those folders, create the following three Markdown files. +3. Strictly follow this file format +Important: you must strictly follow the YAML Front Matter format below, and must not invent new field names: + - title: "paper title" + - date: YYYY-MM-DD HH:MM:SS +0800 + - selected: true + - pub: "venue / journal name" + - pub_date: "year" + - abstract: >- abstract content... + - cover: /assets/images/covers/cover_name.jpg + - authors: - Author1 - Author2 + - links: Paper: https://paper-link +4. Please generate the full code for the following three files (including the path descriptions): +(1) Path: `_publications/2013/2013-hyperloop.md` + Title: Hyperloop Alpha + Date: 2013-08-12 + Pub: Tesla Blog (Open Source) + Pub_date: "2013" + Abstract: A proposal for a fifth mode of transport, utilizing a low-pressure tube and air bearings to achieve subsonic speeds. + cover: /assets/images/covers/Hyperloop_Alpha_sketch.jpg + Authors: Elon Musk, SpaceX & Tesla Teams + Link: https://www.tesla.com/sites/default/files/blog_images/hyperloop-alpha.pdf +(2) Path: `_publications/2017/2017-mars.md` + Title: Making Humans a Multi-Planetary Species + Date: 2017-06-01 + Pub: New Space + Pub_date: "2017" + Abstract: Detailed architecture of the Starship system designed to colonize Mars. This paper outlines the technical challenges to establish a self-sustaining city. 
+ cover: /assets/images/covers/SpaceX_Starship.jpg + Authors: Elon Musk + Link: https://www.liebertpub.com/doi/10.1089/space.2017.29009.emu +(3) Path: `_publications/2019/2019-neuralink.md` + Title: An Integrated Brain-Machine Interface Platform + Date: 2019-10-16 + Pub: Journal of Medical Internet Research + Pub_date: "2019" + Abstract: We have built arrays of small and flexible electrode threads, with as many as 3,072 electrodes per array, and a neurosurgical robot. + cover: /assets/images/covers/Neuralink_sewing_machine_robot.jpg + Authors: Elon Musk, Neuralink + Link: https://www.jmir.org/2019/10/e16194/ +Execution requirement: +Please directly provide the complete content of these three files, and also provide the modification code for the file where you changed the title. +``` + +**Step 3: Refresh the local browser** + +When the build completes, you will find that the originally dull publication list has turned into a futuristic black-tech showcase. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image33.png) + +### 4.2.3 Final polish: social links and avatar + +This is the key step for moving from a score of 90 to a score of 100. The sidebar may still contain the template's original GitHub link or an incorrect email. We need to point them to Musk's real social accounts, mainly X.com. + +**Step 1: Preparation** + +Search Google for a good-looking photo of Musk, save it as `portrait.png`, and drag it into the `images/photo` folder in Trae to replace the original image. + +**Step 2: Copy the following prompt into Trae** + +```text +1. Role setting: you are a detail-oriented Jekyll website development expert. +2. Task goal: complete the final update of the website sidebar and personal information configuration. We need to update the author's avatar, intro, and social links to Elon Musk's real information. +Please first scan the project structure and find the configuration file that controls the author information. +3. 
Please make the following modifications: +1. Avatar path fix + I have already uploaded a new image named `portrait.png` into the `images/` or `assets/images/` folder. + Please modify the avatar path in the configuration file to point to this image, and ensure the relative path is correct, for example `/images/portrait.png`. +2. Social link cleanup + Please update or remove the social icon links in the sidebar: + Email: change it to `elon@spacex.com`, or if the field allows, comment it out or remove it to avoid harassment. + Twitter / X: change it to `https://x.com/elonmusk` (this is the core link). + GitHub: change it to `https://github.com/tesla` to point to the Tesla open-source repository, or remove it directly. + Google Scholar: must be removed, because he does not maintain it. + LinkedIn / ResearchGate: if they exist, remove them all. +Output requirement: +Please directly provide the complete modified configuration code snippet. +``` + +**Step 3: Refresh the local browser** + +1. Look at the sidebar. Is it now using that handsome photo? Does clicking the Twitter icon take you to X.com? + +At this point, locally, you already have a complete, professional, and distinctly Musk-style personal academic homepage. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image34.png) + +## 4.3 Injecting soul through UI customization, the style part + +Right now the content is correct, but the page still looks like a printed resume. It lacks the sense of technology. In Vibe Coding mode, we do not need to understand CSS. We only need to describe to the AI the **feeling** we want. + +**Example scenario**: +If you think the gray background is too dull and want to change it to **Mars red**, just ask Trae: +*"I want to change the background color of the sidebar to dark red (#8B0000) to reflect the feeling of Mars. Which CSS or SCSS file should I modify? 
Please give me the code directly."* + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image35.png) + +If you like the **SpaceX Dashboard** style in the example image above, you can directly copy the following designer-level prompt: + +```text +1. Role setting: you are a top UI designer who admires “Swiss International Style” and is good at interfaces like Notion, Linear, or Apple. +2. Task goal: please completely rewrite the CSS / SCSS to create a “SpaceX Dashboard” style minimalist academic homepage. The core keywords are: transparent, restrained, precise. +3. Please apply the following concrete style overrides: +1. Global typography + Font: abandon the original serif font. Force the whole site to use the system-level sans-serif stack: + 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif. + Line height: increase breathing room in the body text with `line-height: 1.75`. + Colors: + Main title: #111111 + Body text: #333333 + Secondary information such as dates or citations: #666666 +2. Clean header + Background: remove the previous black background and use pure white (#FFFFFF), or translucent white with blur if supported, for example `rgba(255, 255, 255, 0.9)` plus `backdrop-filter: blur(10px)`. + Border: keep only a very thin bottom border, `border-bottom: 1px solid #EAEAEA`. + Text: navigation links should use dark gray #333333, and only become black and bold on hover. +3. Remove cards and return to content + Remove the background and shadow of the left sidebar and the About me cards (`box-shadow: none`, `background: transparent`). + Great minimalism lets the text float directly on the page background. + Increase spacing: significantly increase `margin-bottom`, for example 80px, between sections and use whitespace instead of borders to separate content. +4. Restrained use of brand color + Use Tesla Red (#E82127) only on links and important buttons. 
+ Link style: remove underline and only change color. On hover, add a light red background block such as `background: rgba(232, 33, 39, 0.05)`. +5. Avatar tuning + Keep it circular with `border-radius: 50%`. + Remove the border. + Keep only a very light shadow, such as `box-shadow: 0 10px 30px rgba(0,0,0,0.08)`. +Execution requirement: +Please analyze the `_sass` or CSS files. Do not patch the old code. Instead, directly provide the code that resets and overrides the styles above. +``` + +## 4.4 Replace it with your own information, the customization part + +Congratulations. After going through the Musk homepage flow above, you have already mastered the core mindset of Vibe Coding for site building. Turning this sample room into your own home is actually easy now. + +You do not need to start over. You only need to repeat the steps above, but with slightly more flexible strategy: + +**Step 1: Physical replacement, avatar and basic information** + +This is the easiest step: + +1. **Change the photo**: in the file panel on the left side of Trae, find `assets/images/` and drag your own headshot there, replacing `portrait.png`. +2. **Change the name**: tell Trae, "Replace all instances of Elon Musk across the entire site with [your name]." + +**Step 2: AI preprocessing, let ChatGPT / Gemini help organize the content** + +Trae is good at writing code, but if you directly throw a messy PDF resume at it, it may get confused. + +**So a more efficient approach is this**: +first use an AI that is strong at handling long text, such as ChatGPT, Gemini, or Kimi, to help you **cleanly format** the resume. + +You can send ChatGPT a prompt like this: + +```text +Role setting: you are a professional academic website content planner. +Task goal: +I will send you my personal resume / CV. Please help me extract key information from it and organize it into a clear Markdown structure suitable for filling directly into a static website. 
+Please strictly organize and refine it into the following five modules. If some content does not exist, leave it blank. +1. Profile +Name: my full name. +Tagline: a one-line professional tag, for example “CS Student @ XX Univ | AI Enthusiast”. +Bio: a 50 to 100 word third-person introduction summarizing my background and core skills, in a professional academic tone. +Socials: extract email, GitHub, LinkedIn, blog links, and so on. +2. Education +Please list: school name, degree such as B.S. in CS, and time range. +Optional: if GPA or core courses are available, add them on a separate line. +3. Selected Projects — important +Please extract 2 to 3 strongest projects, and for each include: +Title: project name. +Tech Stack: technologies used, such as Python, React, PyTorch. +TL;DR: a one-line summary of what the project does. +Description: 2 to 3 core contributions, refined using STAR style. +Image Placeholder: reserve an image filename such as `project_name.jpg`. +4. Publications / Articles +If there are papers or technical articles, please extract: +Title +Venue +Date, year is enough +Abstract, one-sentence summary +5. Skills +Please organize them into categories: programming languages, frameworks / tools, and other skills. +Output requirement: +Do not explain the process. Directly output the cleaned Markdown content. +``` + +Once you get this cleaned text, feed it into Trae, and the accuracy will improve dramatically. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image36.png) +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image37.png) + +**Step 3: Replace the core content, with two possible routes** + +At this step, depending on your preference, you can choose two different Vibe Coding modes: + +1. **Mode A: let AI navigate, then edit manually** + +If you want to know exactly where everything is changed, you can ask Trae: + +```markdown +I want to modify the “Education” section. 
Please tell me where the corresponding file path is and which lines contain the code. +``` + +Trae will tell you in the chat something like: +"The file you need to modify is `_pages/about.md`, and the relevant code is around line XX..." + +You can then open that file yourself from the file tree on the left and fill in the cleaned content from ChatGPT like a structured editing exercise. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image38.png) + +2. **Mode B: fully managed automation** + +If you think finding files is too troublesome, directly paste your cleaned information into Trae: + +```markdown +Here is the cleaned content for my “Education” and “Project Experience” sections (paste the Markdown content). +Please directly replace the corresponding content in the current site and preserve the existing layout format. +``` + +# 5. Deploy Online + +## 5.1 Deploy to GitHub Pages + +**Step 1: Enable GitHub Actions, the cloud build** + +Back on GitHub in the browser: + +1. Click **Settings** at the top of the repository. +2. In the left sidebar, click **Pages**. +3. Under **Build and deployment**, change **Source** from `Deploy from a branch` to **`GitHub Actions`**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image39.png) + +**Step 2: Automatically configure the Jekyll workflow** + +After switching, the page layout changes. GitHub will automatically recognize that this is a Jekyll project. + +1. Find the **Jekyll (By GitHub Actions)** card. +2. Click **Configure** on that card. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image40.png) + +**Step 3: Commit the configuration file** + +After clicking, you will be taken to a page full of code. This is a `.yml` configuration file already written by GitHub for building a Jekyll site. + +1. **Do not modify any code**. +2. Click the green **Commit changes...** button in the upper right corner. +3. 
In the pop-up confirmation box, click **Commit changes** again. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image41.png) + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image42.png) + +**Step 4: Wait and verify** + +After the commit, GitHub's servers start working automatically. + +1. Click the **Actions** tab in the top menu. +2. You will see a task named `Deploy Jekyll site to Pages` spinning. +3. Wait one to two minutes until the yellow circle turns into a **green check mark**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image43.png) + +**Step 5: Visit your website** + +Once the circle turns green, you can access the default version of the template through an address like: +**`https://your-username.github.io/`** + +Congratulations. You have now successfully deployed a personal academic homepage that is globally accessible. + +## 5.2 Commit changes and update the homepage + +Now we will push all the local modifications we made earlier to GitHub, so this Musk-style personal homepage can be seen by the world. + +1. Click **Source Control** on the left. +2. Add all the **changes** into **staged changes**. +3. Let Trae help generate a commit message, then click **Commit**. +4. Click **Sync Changes** or **Push** to push to the `main` branch. +5. Wait a moment until all processes under the **Actions** tab complete. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image44.png) + +Now, congratulations. Open **`https://your-username.github.io/`**, and you already have a complete, professional, and strongly Musk-flavored academic homepage. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image45.png) + +# 6. 
Advanced Play: Hand-build a Personal Homepage from Scratch + +If you think academic templates are too rigid, or if you want to make a one-page website as cool as *The Matrix*, welcome to the **DIY section**. + +Here, we do not fork anyone else's code. We will use Trae, starting from an empty folder, and generate a complete website with a single instruction, then deploy it online. + +## 6.1 Why build it by hand + +* **Absolute freedom**: no template constraints. If you want the navigation bar on the right, or fireworks in the background, you only need to tell the AI. +* **Minimalism**: templates often contain hundreds of files, while a hand-built website may need only one `index.html`. +* **Technical control**: this is the best way to understand how a webpage actually runs. + +We will demonstrate the classic **pure HTML flow**: +no compilation required, and GitHub Pages supports it natively, which makes it ideal for building a personal landing page. + +## 6.2 Practical example: ask AI to write a "Mars command center" homepage + +This time we are not doing the academic route. Suppose Musk wants an extremely minimal, futuristic personal homepage to present his Mars plan. + +**Step 1: Create an empty project** + +Create a new folder on your computer and open it with Trae. At that moment, the file tree on the left is completely empty. + +*(Tip: you can prepare a photo of Musk in advance and name it `portrait.png`.)* + +**Step 2: Build the framework** + +Enter the following prompt in Trae's chat panel. Note that we require AI to write all code into a single file so that it is easy for beginners to manage: + +```text +I want to build a minimalist personal homepage for Elon Musk from scratch, without any complex framework, using only HTML + CSS + JS. +Design style: SpaceX dashboard style. + Background: use deep space black (#000000), with starlight animation. + Main accent color: use “Mars red” (#E82127). 
+ Font: use a monospace font stack to imitate the feel of a code terminal. +Page content: + Place Elon Musk's avatar in the center, circular, with a rotating border. The image path is `portrait.png`. + Name: Elon Musk (Technoking of Tesla) + Intro: "Occupying Mars... 99% Loading." + At the bottom, put three glowing buttons linking to X (Twitter), SpaceX, and Tesla. +Technical requirement: +Please put all CSS styles and HTML structure inside a single `index.html` file. +Please generate the full code directly. +``` + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image46.png) + +**Step 3: Generate and preview** + +In the previous step, Trae already helped us generate an `index.html` file. So how do we see its current effect? + +Tell Trae in the chat: + +```markdown +Please help me start a local service to preview this webpage. +``` + +You will receive a link such as `http://localhost:8000`. Copy and open it in the browser, and you will see a cool "Mars homepage," perhaps with stars twinkling in the background. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image47.png) + +But we will notice that the current page is only a very cool landing page. As a complete personal homepage, it still has too little information and lacks the depth expected of an academic homepage. So based on this visual framework, we now continue to enrich it with academic-style information about Elon Musk. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image48.png) + +**Step 4: Further improve the information** + +We want Trae to keep the current Mars style, but restructure the page into something more like the academic template. We need to clearly tell it to move the existing elements to the left and create a new content area on the right for profile text and white papers, while keeping all newly added content in the same black-and-red cyberpunk style. 
+ +Copy the following prompt and send it to Trae: + +```text +Core principle: +You must strictly preserve the current “SpaceX / Mars” design style, including pure black background, starlight decorations, red neon accent color, and monospace code-style font. Do not use the white background from the reference image. + +Specific modification steps: +1. Create a two-column layout +Split the page into left and right columns. The left sidebar should take about 30% to 35% width, and the right content area should take about 65% to 70%. + +2. Left sidebar - move the existing information +Move all current elements from the original hero screen into the fixed left sidebar: + - Avatar: keep Elon Musk's circular avatar. + - Name and title: keep the red neon text “ELON MUSK” and “Technoking of Tesla”. + - Loading bar: keep “Occupying Mars... 99% Loading” as the personal signature. + - Social buttons: move the three red buttons, X, SPACE X, and TESLA, to the bottom of the left sidebar. + +3. Right content area - add detailed information +Add detailed personal introduction and achievements in the right area. All new body text should use white or light gray, while titles should use red neon emphasis. Please create the following sections: +- About Me: + Write a short introduction, for example: “Technology entrepreneur and engineer focused on multi-planetary expansion, sustainable energy, and artificial intelligence.” +- Focus Areas: + List Space Systems Engineering, Mars Colonization Architecture, Brain-Machine Interfaces. +- Visionary Plans & White Papers: + This is the key section. Refer to the list style in the example image, but convert it into a black-background style. + Create a list displaying his important technical plans, using red borders or glow effects to distinguish each item. + Item 1: “Making Humans a Multi-Planetary Species” (Starship Architecture, 2017). + Item 2: “Hyperloop Alpha” (High-speed transportation proposal, 2013). 
+ Item 3: “Neuralink: An Integrated Brain-Machine Interface Platform” (2019). +- Notable Achievements: + Briefly list milestones such as: + First private liquid-propellant rocket to reach orbit (Falcon 1) + First reusable orbital class rocket (Falcon 9) + +4. Style detail requirements +All section titles on the right, such as “About Me,” should use the same red glowing style as the “ELON MUSK” text on the left. +Make sure the whole page remains responsive and preserves a good two-column layout on different screen sizes. +``` + +Refresh the browser after that, and your cyberpunk academic page is complete. Of course, you can keep improving it according to your own preferences. As in the previous steps, you only need to tell Trae the goal clearly, and it will handle the tedious coding process for you. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image49.png) + +## 6.3 How to deploy the hand-built site + +Unlike the previous forked template, which came from someone else's repository, this project is newly created by you and does not yet have a corresponding GitHub location. We therefore need to bind it manually. + +**Step 1: Create a new repository on GitHub** + +1. Log in to GitHub in the browser. +2. Click the **+** icon in the upper right, then **New repository**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image50.png) + +3. **Repository name**: enter `mars-link`, or any other name you like. + +**Note**: +If you have already used **`your-username.github.io`**, you cannot reuse that name here. You can choose another name, and GitHub will then generate a URL like **`your-username.github.io/mars-link`**. + +4. **Public / Private**: choose **Public**. +5. **Do not check "Add a README file"!** + Leave the other options at their defaults. +6. Click **Create repository**. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image51.png) + +**Step 2: Push the local code to the cloud** + +After creation, GitHub will take you to a page with a lot of code-looking content. Do not worry. We just need to copy the repository link shown on that page. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image52.png) + +Go back to Trae and type in the chat: + +```markdown +I have created an empty repository on GitHub. The address is: https://github.com/your-username/mars-link.git (please replace this with the actual repository address you just created). +Now please help me initialize the current local project as a Git repository and push the code to the `main` branch of this remote address. +``` + +Trae will usually help execute the standard sequence below, and you may only need to click to run them: + +1. `git init` +2. `git add .` and `git commit -m "First commit"` +3. `git branch -M main` and `git remote add origin [your address]` +4. `git push -u origin main` + +After Trae completes the push, go back to GitHub and refresh the page. Click the **Code** tab, and you will see that the code written in Trae has been successfully pushed into the repository. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image53.png) + +**Step 3: Enable GitHub Pages** + +After the code is pushed, the webpage will not appear automatically. We still need to turn on the switch manually: + +1. Go back to the GitHub repository page and click **Settings** at the top. +2. Click **Pages** in the left sidebar. +3. Under **Build and deployment**: + 1. Set **Source** to `Deploy from a branch`. + 2. Set **Branch** to `main`, and choose `/(root)` as the folder. +4. Click **Save**. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image54.png) + +After you click Save, the webpage will not appear instantly. GitHub's backend works like a small robot factory. 
It needs around **1 to 2 minutes** to package your code, build it, and publish it to global servers. + +Wait patiently and refresh the page. Under the big **GitHub Pages** heading, you will see a line with a URL similar to: +**"Your site is live at `https://your-username.github.io/mars-link/`"** + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image55.png) + +Click it, and your Mars command center is online. + +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image56.png) + +# 7. Final words + +The tutorial is over. Now, when you look at the `.github.io` glowing in your browser's address bar, do you feel a little like you have planted a flag on the internet? + +In this tutorial, we borrowed Elon Musk's persona and built a website like a Lego project that looks quite impressive. But this is only the beginning. The most charming part of Vibe Coding is not how much typing time it saves. It is that it **completely smashes the wall between “idea” and “reality.”** + +In the past, you might have given up on showing a project because **you could not write CSS**. +Now, the only limits left are your **imagination** and your **taste**. + +**Do not let this site stay a “Musk-inspired clone.”** +That Tesla link you used for practice and that Mars-colonization white paper are ultimately someone else's story. Your homepage should be your own name card in the digital world. + +Go and put your first real project experience there. +Go and publish your own unique thoughts on a technical topic. +You can even put your favorite book list or your own photos on it. +Thoughts that would get buried on WeChat Moments can stay here permanently. +Passion that does not fit inside a resume can spread freely here. + +Do not leave this plot empty. +Go experiment. Go break it. Go rebuild it. +Keep doing that until it grows into the shape you like most. 
+ +![](../../../../zh-cn/stage-3/personal-brand/3.7-personal-website-blog/images/image57.png) + +***Go ahead, and let the world see you.*** + +# References + +CSDN: [2025 latest nanny-level tutorial: step by step on using GitHub to build a personal homepage](https://blog.csdn.net/qq_45743991/article/details/145505150?ops_request_misc=&request_id=&biz_id=102&utm_term=github%E6%9E%84%E5%BB%BA%E4%B8%AA%E4%BA%BA%E4%B8%BB%E9%A1%B5&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduweb~default-0-145505150.142^v102^pc_search_result_base4&spm=1018.2226.3001.4187) + +CSDN: [Git download and installation tutorial](https://blog.csdn.net/weixin_41293671/article/details/144255269?ops_request_misc=elastic_search_misc&request_id=63236900b52320a7beb177787ba97f07&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~baidu_landing_v2~default-5-144255269-null-null.142^v102^pc_search_result_base4&utm_term=git%E4%B8%8B%E8%BD%BD%E5%AE%89%E8%A3%85&spm=1018.2226.3001.4187) + +CSDN: [Ruby installation tutorial under Windows](https://blog.csdn.net/alive_tree/article/details/103043158?ops_request_misc=elastic_search_misc&request_id=ad7e29ea7f702554d785c2fc82ec6e95&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~ElasticSearch~search_v2-11-103043158-null-null.142^v102^pc_search_result_base4&utm_term=ruby%E5%AE%89%E8%A3%85%E6%95%99%E7%A8%8B&spm=1018.2226.3001.4187) diff --git a/docs/public/sitemap.xml b/docs/public/sitemap.xml index fce0178..b0289e1 100644 --- a/docs/public/sitemap.xml +++ b/docs/public/sitemap.xml @@ -900,122 +900,139 @@ https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/ai-capabilities/2.1-dify-knowledge-base/ - 2026-02-27T21:58:38+08:00 + 2026-03-24T08:44:49.470Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/assignments/2.1-fullstack-app/ - 2026-01-13T20:09:45+08:00 + 2026-03-24T07:00:25.007Z weekly 0.8 + 
https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/assignments/2.2-modern-frontend-trae/ - 2026-01-13T20:09:45+08:00 + 2026-03-24T07:00:25.008Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.2-database-supabase/ - 2026-03-18T09:34:06+08:00 + 2026-03-24T10:17:02.326Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.3-ai-interface-code/ - 2026-03-02T16:17:11+08:00 + 2026-03-23T15:51:59.282Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.4-git-workflow/ - 2026-02-27T18:45:52+08:00 + 2026-03-24T10:08:14.884Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.5-zeabur-deployment/ - 2026-02-27T18:26:49+08:00 + 2026-03-24T01:20:30.534Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.6-modern-cli/ - 2026-03-23T17:36:13+08:00 + 2026-03-24T09:29:56.836Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/backend/2.7-stripe-payment/ - 2026-02-10T17:23:27+08:00 + 2026-03-24T07:00:25.008Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.0-lovart-assets/ - 2026-03-16T12:42:29+08:00 + 2026-03-24T08:28:32.292Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.1-figma-mastergo/ - 2026-03-16T12:42:29+08:00 + 2026-03-23T15:48:55.095Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.2-ui-design/ - 2026-02-27T18:46:11+08:00 + 2026-03-24T07:00:25.008Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.3-multi-product-ui/ - 2026-02-27T18:46:11+08:00 + 2026-03-24T05:49:47.526Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.4-llm-skills-beautiful/ - 2026-03-01T12:28:47+08:00 + 2026-03-24T09:24:24.080Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.5-hogwarts-portraits/ - 2026-02-28T19:13:23+08:00 + 
2026-03-24T01:18:35.782Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.6-design-to-code/ - 2026-02-28T19:12:09+08:00 + 2026-03-23T15:51:59.282Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/frontend/2.7-modern-component-library/ - 2026-03-02T20:11:08+08:00 + 2026-03-24T10:12:29.711Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-2/ @@ -1035,164 +1052,187 @@ https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/ai-advanced/3.a1-rag-introduction/ - 2026-02-16T09:21:33+08:00 + 2026-03-24T07:30:22.563Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/ai-advanced/3.a2-langgraph-advanced-rag/ - 2026-03-02T02:20:17+08:00 + 2026-03-24T07:00:25.008Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/agent-teams/ - 2026-03-14T22:17:12+08:00 + 2026-03-24T16:19:04.205Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/basics/ - 2026-03-23T17:36:13+08:00 + 2026-03-24T08:39:26.659Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/claude-agent-sdk/ - 2026-03-02T12:52:38+08:00 + 2026-03-24T08:26:09.452Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/long-running-tasks/ - 2026-03-18T17:22:35+08:00 + 2026-03-24T08:29:07.591Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/mcp/ - 2026-03-01T12:26:02+08:00 + 2026-03-24T08:06:58.426Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/mobile-development/ - 2026-03-02T14:26:06+08:00 + 2026-03-24T08:26:46.026Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/skills/ - 2026-03-02T10:29:08+08:00 + 2026-03-24T08:27:00.403Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/spec-coding/ - 2026-03-02T12:42:07+08:00 + 2026-03-24T08:15:15.558Z weekly 0.8 + 
https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/superpowers/ - 2026-03-01T12:26:02+08:00 + 2026-03-24T08:27:46.387Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/core-skills/workflow/ - 2026-03-02T16:17:11+08:00 + 2026-03-24T10:34:29.162Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.1-choose-platform/ - 2026-03-08T12:17:04+08:00 + 2026-03-24T06:11:17.435Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.10-electron-voice-to-text/ - 2026-02-28T15:00:32+08:00 + 2026-03-24T06:30:57.078Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.11-nft-minting/ - 2026-02-28T15:00:32+08:00 + 2026-03-24T06:30:57.087Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.12-vscode-extension/ - 2026-02-28T15:00:32+08:00 + 2026-03-24T06:30:57.099Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.13-qt-industrial-hmi/ - 2026-02-28T15:00:32+08:00 + 2026-03-24T06:30:57.106Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.3-wechat-miniprogram/ - 2026-03-14T15:42:54+08:00 + 2026-03-24T06:30:57.025Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.4-wechat-miniprogram-backend/ - 2026-03-02T02:20:17+08:00 + 2026-03-24T06:08:37.178Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.5-android-app/ - 2026-02-15T09:24:02+08:00 + 2026-03-24T06:30:57.033Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.6-ios-app/ - 2026-02-15T09:24:02+08:00 + 2026-03-24T06:30:57.049Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.8-pwa-local-app/ - 2026-03-07T23:02:24+08:00 + 2026-03-24T06:30:57.061Z weekly 0.8 + 
https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/cross-platform/3.9-browser-ai-extension/ - 2026-03-05T01:27:15+08:00 + 2026-03-24T06:30:57.068Z weekly 0.8 + https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/ @@ -1212,9 +1252,10 @@ https://datawhalechina.github.io/easy-vibe/zh-cn/stage-3/personal-brand/3.7-personal-website-blog/ - 2026-02-15T09:24:02+08:00 + 2026-03-24T08:04:35.919Z weekly 0.8 +