Skip to content

Commit 0e455d5

Browse files
committed
Merge branch 'rebuild-pages-whitepaper'
2 parents 2363313 + 2b04093 commit 0e455d5

35 files changed

Lines changed: 1840 additions & 532 deletions
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<script setup lang="ts">
// Figure component that shows a different image depending on whether the
// VitePress site is currently rendered in dark or light mode.
import { computed } from 'vue'
import { useData, withBase } from 'vitepress'

const props = defineProps<{
  /** Image path used while the site is in light mode. */
  light: string
  /** Image path used while the site is in dark mode. */
  dark: string
  /** Alternative text forwarded to the <img> element. */
  alt: string
  /** Optional text; rendered in a paragraph after the image when provided. */
  caption?: string
  /** When truthy, the wrapper also receives the `figure-frame-wide` class. */
  wide?: boolean
}>()

const { isDark } = useData()

// Choose the variant for the active theme first, then let VitePress
// resolve it against the configured site base.
const src = computed(() => {
  const themedPath = isDark.value ? props.dark : props.light
  return withBase(themedPath)
})
</script>

<template>
  <div :class="['figure-frame', { 'figure-frame-wide': wide }]">
    <img class="hero-figure" :src="src" :alt="alt" />
    <p v-if="caption" class="figure-note">{{ caption }}</p>
  </div>
</template>

docs/.vitepress/config.ts

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ const base = rawBase
99
: `/${rawBase}/`
1010
: '/sgemm-optimization/'
1111

12+
/**
 * Builds the URL of a static asset by prefixing `relativePath` with `base`.
 * Any leading slashes on `relativePath` are removed before the two are joined.
 * NOTE(review): assumes `base` already ends with "/" — confirm against its definition.
 */
const asset = (relativePath: string) => {
  const withoutLeadingSlashes = relativePath.replace(/^\/+/, '')
  return `${base}${withoutLeadingSlashes}`
}
13+
1214
function localeNav(prefix: '/en/' | '/zh/') {
1315
if (prefix === '/en/') {
1416
return [
@@ -37,6 +39,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
3739
text: 'Overview',
3840
items: [
3941
{ text: 'Project Guide', link: '/en/overview/' },
42+
{ text: 'Reader Map', link: '/en/overview/reader-map' },
4043
{ text: 'Getting Started', link: '/en/overview/getting-started' },
4144
],
4245
},
@@ -46,6 +49,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
4649
text: 'Architecture',
4750
items: [
4851
{ text: 'Architecture Overview', link: '/en/architecture/' },
52+
{ text: 'System Blueprint', link: '/en/architecture/system-blueprint' },
4953
{ text: 'Kernel Ladder', link: '/en/architecture/kernel-ladder' },
5054
{ text: 'Memory Flow', link: '/en/architecture/memory-flow' },
5155
{ text: 'Tensor Core Path', link: '/en/architecture/tensor-core-path' },
@@ -75,6 +79,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
7579
text: 'Validation',
7680
items: [
7781
{ text: 'Validation Overview', link: '/en/validation/' },
82+
{ text: 'Performance Model', link: '/en/validation/performance-model' },
7883
{ text: 'Correctness Policy', link: '/en/validation/correctness-policy' },
7984
{ text: 'Benchmark Scope', link: '/en/validation/benchmark-scope' },
8085
{ text: 'Reproducibility', link: '/en/validation/reproducibility' },
@@ -87,6 +92,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
8792
text: 'Research',
8893
items: [
8994
{ text: 'Research Desk', link: '/en/research/' },
95+
{ text: 'Reference Map', link: '/en/research/reference-map' },
9096
{ text: 'Curated References', link: '/en/research/references' },
9197
{ text: 'Related Projects', link: '/en/research/related-projects' },
9298
{ text: 'Evolution Notes', link: '/en/research/evolution' },
@@ -105,6 +111,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
105111
text: '导读',
106112
items: [
107113
{ text: '项目导读', link: '/zh/overview/' },
114+
{ text: '阅读地图', link: '/zh/overview/reader-map' },
108115
{ text: '快速上手', link: '/zh/overview/getting-started' },
109116
],
110117
},
@@ -114,6 +121,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
114121
text: '架构',
115122
items: [
116123
{ text: '架构概述', link: '/zh/architecture/' },
124+
{ text: '系统蓝图', link: '/zh/architecture/system-blueprint' },
117125
{ text: 'Kernel 阶梯', link: '/zh/architecture/kernel-ladder' },
118126
{ text: 'Memory Flow', link: '/zh/architecture/memory-flow' },
119127
{ text: 'Tensor Core 路径', link: '/zh/architecture/tensor-core-path' },
@@ -143,6 +151,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
143151
text: '验证',
144152
items: [
145153
{ text: '验证概览', link: '/zh/validation/' },
154+
{ text: '性能模型', link: '/zh/validation/performance-model' },
146155
{ text: '正确性策略', link: '/zh/validation/correctness-policy' },
147156
{ text: 'Benchmark 范围', link: '/zh/validation/benchmark-scope' },
148157
{ text: '可复现性', link: '/zh/validation/reproducibility' },
@@ -155,6 +164,7 @@ function localeSidebar(prefix: '/en/' | '/zh/') {
155164
text: '研究',
156165
items: [
157166
{ text: '研究总览', link: '/zh/research/' },
167+
{ text: '参考文献地图', link: '/zh/research/reference-map' },
158168
{ text: '参考资料清单', link: '/zh/research/references' },
159169
{ text: '相关开源项目', link: '/zh/research/related-projects' },
160170
{ text: '演进思考', link: '/zh/research/evolution' },
@@ -175,9 +185,9 @@ export default withMermaid(defineConfig({
175185
head: [
176186
['meta', { name: 'theme-color', content: '#76b900' }],
177187
['meta', { property: 'og:type', content: 'website' }],
178-
['link', { rel: 'icon', type: 'image/svg+xml', href: '/sgemm-optimization/favicon.svg' }],
179-
['link', { rel: 'icon', type: 'image/png', sizes: '32x32', href: '/sgemm-optimization/favicon-32x32.png' }],
180-
['link', { rel: 'apple-touch-icon', sizes: '180x180', href: '/sgemm-optimization/apple-touch-icon.png' }],
188+
['link', { rel: 'icon', type: 'image/svg+xml', href: asset('favicon.svg') }],
189+
['link', { rel: 'icon', type: 'image/png', sizes: '32x32', href: asset('favicon-32x32.png') }],
190+
['link', { rel: 'apple-touch-icon', sizes: '180x180', href: asset('apple-touch-icon.png') }],
181191
],
182192

183193
ignoreDeadLinks: [

docs/.vitepress/theme/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { watch, onMounted } from 'vue'
44
import { useData } from 'vitepress'
55
import Citation from '../components/Citation.vue'
66
import PerfCompare from '../components/PerfCompare.vue'
7+
import ThemedFigure from '../components/ThemedFigure.vue'
78
// 导出断点模块供组件使用
89
export { useBreakpoint, BREAKPOINTS, minQuery, maxQuery } from './breakpoints'
910

@@ -12,6 +13,7 @@ export default {
1213
enhanceApp({ app }) {
1314
app.component('Citation', Citation)
1415
app.component('PerfCompare', PerfCompare)
16+
app.component('ThemedFigure', ThemedFigure)
1517
},
1618
setup() {
1719
const { isDark } = useData()

docs/en/academy/index.md

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,48 +4,62 @@ title: Academy
44

55
# Academy
66

7-
The academy is the ordered learning surface of this repository. Architecture gives the map. The academy gives the teaching sequence.
7+
The academy is the ordered learning surface of this repository. Architecture gives the system map. The academy gives the teaching sequence — the order in which each kernel stage is explained, and why that order is non-negotiable.
88

9-
## The rule of this section
9+
## The structuring principle
1010

11-
Read kernels as a progression of bottleneck shifts:
11+
Read kernels as a progression of bottleneck shifts, not as a list of tricks:
1212

13-
1. establish the cost model
14-
2. change data reuse
15-
3. stabilize shared-memory behavior
16-
4. overlap staging and compute
17-
5. introduce guarded mixed precision
13+
| Stage | Bottleneck exposed | Structural change introduced |
14+
|---|---|---|
15+
| Naive FP32 | Unlimited DRAM traffic | Establishes the cost model |
16+
| Tiled FP32 | Redundant global reads | Shared-memory staging |
17+
| Bank-Free FP32 | Shared-memory bank conflicts | Tile padding |
18+
| Double Buffer | Memory latency in critical path | Overlap staging and compute |
19+
| Tensor Core WMMA | FP32 throughput ceiling | Hardware fragment accumulation |
1820

19-
That order matters because every later page assumes the previous page already explained why its extra complexity is justified.
21+
Each later page assumes the previous page already explained why its extra complexity is justified. Reading out of order makes the causal chain invisible.
2022

2123
## Academy map
2224

2325
| Track | Purpose | Start here |
2426
|---|---|---|
25-
| Orientation | Learn the route through the ladder | [Learning Path](./learning-path) |
27+
| Orientation | Learn the route through the ladder before opening any kernel page | [Learning Path](./learning-path) |
2628
| Experiment discipline | Avoid drawing conclusions from sloppy measurements | [Benchmark Discipline](./benchmark-discipline) |
27-
| Bottleneck reasoning | Turn symptoms into the next defendable change | [Diagnosis Loop](./diagnosis-loop) |
28-
| Kernel deep dives | Inspect the actual optimization stages | [Naive Kernel](./kernel-naive) |
29-
| Retention aids | Refresh memory and tuning heuristics quickly | [CUDA Memory Cheat Sheet](./cuda-memory-cheatsheet) |
29+
| Bottleneck reasoning | Turn symptoms into the next defensible architectural change | [Diagnosis Loop](./diagnosis-loop) |
30+
| Kernel deep dives | Inspect the actual optimization stages in sequence | [Naive Kernel](./kernel-naive) |
31+
| Retention aids | Refresh memory hierarchy and tuning heuristics quickly | [CUDA Memory Cheat Sheet](./cuda-memory-cheatsheet) |
3032

31-
## Recommended order
33+
## Recommended reading order
3234

33-
1. [Learning Path](./learning-path)
34-
2. [Naive Kernel](./kernel-naive)
35-
3. [Tiled Kernel](./kernel-tiled)
36-
4. [Bank Conflict Free](./kernel-bank-free)
37-
5. [Double Buffer](./kernel-double-buffer)
38-
6. [Tensor Core WMMA](./kernel-tensor-core)
39-
7. [Diagnosis Loop](./diagnosis-loop)
40-
8. [Optimization Playbook](./optimization-playbook)
35+
1. [Learning Path](./learning-path) — orientation before any kernel
36+
2. [Naive Kernel](./kernel-naive) — cost model baseline
37+
3. [Tiled Kernel](./kernel-tiled) — shared-memory reuse
38+
4. [Bank Conflict Free](./kernel-bank-free) — stable shared-memory behavior under conflict-prone shapes
39+
5. [Double Buffer](./kernel-double-buffer) — latency hiding
40+
6. [Tensor Core WMMA](./kernel-tensor-core) — guarded mixed precision beyond the FP32 throughput ceiling
41+
7. [Diagnosis Loop](./diagnosis-loop) — turn measurements into decisions
42+
8. [Optimization Playbook](./optimization-playbook) — structured tuning process
4143

4244
## Interview-ready framing
4345

44-
When you need to explain the project quickly:
46+
When defending any kernel stage under review, use this four-part structure:
47+
48+
1. **Name the current bottleneck** — what resource is saturated or wastefully used?
49+
2. **Name the specific structural change** — what does this kernel do differently at the hardware level?
50+
3. **State the evidence requirement** — what measurement would confirm the change helped?
51+
4. **State the constraint** — what assumption or shape condition limits this improvement?
52+
53+
That sequence keeps the discussion at the level of engineering reasoning rather than benchmark screenshots. The academy is designed to give you a defensible answer for each of the five stages.
54+
55+
## What the academy is not
56+
57+
The academy is not a reference manual for CUDA programming. For reference, use the [CUDA C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) and the [CUDA Memory Cheat Sheet](./cuda-memory-cheatsheet) in this section.
58+
59+
The academy is not a substitute for reading the source code. Each kernel page explains the architectural reasoning; the code itself contains the implementation. Both are necessary to give a complete account of any stage.
4560

46-
1. name the current bottleneck
47-
2. name the specific structural change
48-
3. say what evidence would prove that change helped
49-
4. say what constraint still limits the design
61+
## Related resources
5062

51-
That sequence keeps the discussion technical and keeps you out of vague “it got faster” claims.
63+
- [Architecture Overview](../architecture/) — the system map that contextualizes the ladder
64+
- [Validation Overview](../validation/) — the trust boundary for any number produced during academy study
65+
- [Performance Model](../validation/performance-model) — analytical cost model behind each ladder stage

0 commit comments

Comments
 (0)