refactor(arch): improve codebase architecture depth

LessUp · LessUp · commit f5fe22959f23 · 2026-05-15T03:40:35.000+08:00
Centralize configuration and unify algorithms:

1. Tiling Configuration:
   - Add ForwardTilingConfig and BackwardTilingConfig structs
   - Replace hardcoded block sizes with centralized config
   - Document why forward/backward use different tiling

2. Online Softmax:
   - Add OnlineSoftmaxState::update_with_rescale() method
   - Refactor forward kernel to use unified algorithm
   - Ensure numerical consistency across forward/backward

3. Validation:
   - Add is_supported_head_dim() for centralized checking
   - Update API validation to use single source of truth

4. Documentation:
   - Update CONTEXT.md with new architecture details
   - Add test file comments explaining test vs production paths
diff --git a/CONTEXT.md b/CONTEXT.md
@@ -17,6 +17,8 @@ FlashAttention 中的计算单位。一块 Q、K 或 V 的行数据，大小由
 
 支持增量式更新，无需存储完整 softmax matrix。
 
+使用 `impl::OnlineSoftmaxState` 结构体统一实现，确保前向和反向传播使用相同的数值算法。
+
 ### Matmul Operations
 Tile 级别的矩阵乘法原语：
 - `matmul_ABt`: C = A @ Bᵀ（attention score 计算）
@@ -36,22 +38,32 @@ CUDA 设备函数，在 GPU 上并行执行。FlashAttention 的 kernel 是参
 ┌─────────────────────────────────────────────────┐
 │  cuflash::flash_attention_forward/backward      │  ← 高级 API
 ├─────────────────────────────────────────────────┤
-│  cuflash::kernels::*                            │  ← Kernel 原语（公开）
+│  cuflash::kernels::*                            │  ← Kernel 原语（公开，用于测试）
 │    - online_softmax_init/update/finalize        │
 │    - matmul_ABt, matmul_AB, matmul_AB_acc, AtB  │
 │    - load_tile, store_tile                      │
 ├─────────────────────────────────────────────────┤
 │  src/kernels/impl/*                             │  ← 实现细节（内部）
 │    - OnlineSoftmaxState (device struct)         │
+│    - ForwardTilingConfig, BackwardTilingConfig  │
 │    - __device__ matmul functions                │
 │    - __device__ tile I/O functions              │
 └─────────────────────────────────────────────────┘
 ```
 
+## Tiling Configuration
+
+Tiling 参数集中定义在 `src/kernels/impl/tile_io.cuh` 中：
+
+- **ForwardTilingConfig**: 前向传播默认使用 64x64 的 block，head_dim=128 时降为 32x32
+- **BackwardTilingConfig**: 反向传播默认同样使用 64x64 的 block，head_dim=128 时降为更小的 16x32
+
+差异原因：反向传播需要在 shared memory 中存储更多梯度张量（dQ, dK, dV），因此需要更保守的 tiling 策略。
+
 ## Key Invariants
 
 1. **Tensor Layout**: `[batch_size, num_heads, seq_len, head_dim]` — 不可变
-2. **Supported head_dim**: 32, 64, 128 — 由 kernel 模板实例化决定
+2. **Supported head_dim**: 32, 64, 128 — 由 `impl::is_supported_head_dim()` 集中检查
 3. **Data Types**: FP32 (float) and FP16 (half) — 内部计算始终用 float
 4. **Stream Safety**: 所有 CUDA 操作使用显式 stream 参数
 
@@ -60,3 +72,4 @@ CUDA 设备函数，在 GPU 上并行执行。FlashAttention 的 kernel 是参
 1. **Depth over Shallow**: Kernel utilities 有公开接口，测试不穿透实现细节
 2. **Primitive Decomposition**: 复杂操作分解为可组合的原语
 3. **Template for Performance**: M, N, K 作为编译期模板参数，确保 kernel 优化
+4. **Centralized Configuration**: Tiling 配置和支持的 head_dim 值集中定义，避免分散
diff --git a/src/api/flash_attention_api.cu b/src/api/flash_attention_api.cu
@@ -1,4 +1,5 @@
 #include "cuflash/flash_attention.h"
+#include "impl/tile_io.cuh"  // For is_supported_head_dim
 
 namespace cuflash {
 
@@ -55,8 +56,8 @@ static FlashAttentionError validate_common_params(const void* Q, const void* K,
         return FlashAttentionError::INVALID_DIMENSION;
     }
 
-    // Check supported head_dim values
-    if (head_dim != 32 && head_dim != 64 && head_dim != 128) {
+    // Check supported head_dim values (centralized check)
+    if (!impl::is_supported_head_dim(head_dim)) {
         return FlashAttentionError::UNSUPPORTED_HEAD_DIM;
     }
 
diff --git a/src/backward/flash_attention_backward_typed.cu b/src/backward/flash_attention_backward_typed.cu
@@ -4,6 +4,7 @@
 #include <float.h>
 
 #include "cuflash/flash_attention.h"
+#include "impl/online_softmax.cuh"
 #include "impl/tile_io.cuh"
 #include "kernel_launch_utils.cuh"
 #include "workspace_utils.cuh"
@@ -371,10 +372,11 @@ FlashAttentionError launch_flash_attention_backward_typed<float>(
     const float* Q, const float* K, const float* V, const float* O, const float* L, const float* dO,
     float* dQ, float* dK, float* dV, int batch_size, int num_heads, int seq_len, int head_dim,
     float scale, bool causal, cudaStream_t stream) {
-    constexpr int BLOCK_M = 64;
-    constexpr int BLOCK_N = 64;
-    constexpr int BLOCK_M_HD128 = 16;
-    constexpr int BLOCK_N_HD128 = 32;
+    using Config = impl::BackwardTilingConfig;
+    constexpr int BLOCK_M = Config::BLOCK_M;
+    constexpr int BLOCK_N = Config::BLOCK_N;
+    constexpr int BLOCK_M_HD128 = Config::BLOCK_M_HD128;
+    constexpr int BLOCK_N_HD128 = Config::BLOCK_N_HD128;
 
     int batch_heads = batch_size * num_heads;
 
@@ -526,10 +528,11 @@ FlashAttentionError launch_flash_attention_backward_typed<half>(
     const half* Q, const half* K, const half* V, const half* O, const half* L, const half* dO,
     half* dQ, half* dK, half* dV, int batch_size, int num_heads, int seq_len, int head_dim,
     float scale, bool causal, cudaStream_t stream) {
-    constexpr int BLOCK_M = 64;
-    constexpr int BLOCK_N = 64;
-    constexpr int BLOCK_M_HD128 = 16;
-    constexpr int BLOCK_N_HD128 = 32;
+    using Config = impl::BackwardTilingConfig;
+    constexpr int BLOCK_M = Config::BLOCK_M;
+    constexpr int BLOCK_N = Config::BLOCK_N;
+    constexpr int BLOCK_M_HD128 = Config::BLOCK_M_HD128;
+    constexpr int BLOCK_N_HD128 = Config::BLOCK_N_HD128;
 
     int batch_heads = batch_size * num_heads;
 
diff --git a/src/forward/flash_attention_forward_typed.cu b/src/forward/flash_attention_forward_typed.cu
@@ -4,6 +4,7 @@
 #include <float.h>
 
 #include "cuflash/flash_attention.h"
+#include "impl/online_softmax.cuh"
 #include "impl/tile_io.cuh"
 #include "kernel_launch_utils.cuh"
 
@@ -105,13 +106,15 @@ __global__ void __launch_bounds__(128)
             if (q_start + row >= seq_len)
                 continue;
 
+            // Compute row max for this KV block
             float row_max = -INFINITY;
             for (int j = 0; j < BLOCK_N; j++) {
                 if (kv_start + j < seq_len) {
                     row_max = fmaxf(row_max, S_tile[row * BLOCK_N + j]);
                 }
             }
 
+            // Compute row sum (exp) and convert scores to probabilities
             float row_sum = 0.0f;
             for (int j = 0; j < BLOCK_N; j++) {
                 if (kv_start + j < seq_len) {
@@ -122,30 +125,31 @@ __global__ void __launch_bounds__(128)
                 }
             }
 
-            // Update online softmax state
-            float m_old = m_tile[row];
-            float l_old = l_tile[row];
-            float m_new = fmaxf(m_old, row_max);
-            float l_new = l_old * expf(m_old - m_new) + row_sum * expf(row_max - m_new);
+            // Update online softmax state using unified algorithm
+            impl::OnlineSoftmaxState state;
+            state.m = m_tile[row];
+            state.l = l_tile[row];
+
+            float rescale_existing, scale_new;
+            state.update_with_rescale(row_max, row_sum, rescale_existing, scale_new);
 
             // Rescale existing O
-            float rescale = expf(m_old - m_new);
             for (int d = 0; d < HEAD_DIM; d++) {
-                O_tile[row * HEAD_DIM + d] *= rescale;
+                O_tile[row * HEAD_DIM + d] *= rescale_existing;
             }
 
             // Add contribution from this block: P @ V
-            float p_scale = expf(row_max - m_new);
             for (int d = 0; d < HEAD_DIM; d++) {
                 float sum = 0.0f;
                 for (int j = 0; j < BLOCK_N; j++) {
                     sum += S_tile[row * BLOCK_N + j] * V_tile[j * HEAD_DIM + d];
                 }
-                O_tile[row * HEAD_DIM + d] += sum * p_scale;
+                O_tile[row * HEAD_DIM + d] += sum * scale_new;
             }
 
-            m_tile[row] = m_new;
-            l_tile[row] = l_new;
+            // Store updated state
+            m_tile[row] = state.m;
+            l_tile[row] = state.l;
         }
         __syncthreads();
     }
@@ -199,10 +203,11 @@ template<>
 FlashAttentionError launch_flash_attention_forward_typed<float>(
     const float* Q, const float* K, const float* V, float* O, float* L, int batch_size,
     int num_heads, int seq_len, int head_dim, float scale, bool causal, cudaStream_t stream) {
-    constexpr int BLOCK_M = 64;
-    constexpr int BLOCK_N = 64;
-    constexpr int BLOCK_M_HD128 = 32;
-    constexpr int BLOCK_N_HD128 = 32;
+    using Config = impl::ForwardTilingConfig;
+    constexpr int BLOCK_M = Config::BLOCK_M;
+    constexpr int BLOCK_N = Config::BLOCK_N;
+    constexpr int BLOCK_M_HD128 = Config::BLOCK_M_HD128;
+    constexpr int BLOCK_N_HD128 = Config::BLOCK_N_HD128;
 
     const int batch_heads = batch_size * num_heads;
     const int num_q_blocks = (seq_len + BLOCK_M - 1) / BLOCK_M;
@@ -271,10 +276,11 @@ template<>
 FlashAttentionError launch_flash_attention_forward_typed<half>(
     const half* Q, const half* K, const half* V, half* O, half* L, int batch_size, int num_heads,
     int seq_len, int head_dim, float scale, bool causal, cudaStream_t stream) {
-    constexpr int BLOCK_M = 64;
-    constexpr int BLOCK_N = 64;
-    constexpr int BLOCK_M_HD128 = 32;
-    constexpr int BLOCK_N_HD128 = 32;
+    using Config = impl::ForwardTilingConfig;
+    constexpr int BLOCK_M = Config::BLOCK_M;
+    constexpr int BLOCK_N = Config::BLOCK_N;
+    constexpr int BLOCK_M_HD128 = Config::BLOCK_M_HD128;
+    constexpr int BLOCK_N_HD128 = Config::BLOCK_N_HD128;
 
     const int batch_heads = batch_size * num_heads;
     const int num_q_blocks = (seq_len + BLOCK_M - 1) / BLOCK_M;
diff --git a/src/kernels/impl/online_softmax.cuh b/src/kernels/impl/online_softmax.cuh
@@ -37,6 +37,25 @@ struct OnlineSoftmaxState {
 
     /// Get the normalization factor for final output
     __device__ __forceinline__ float get_normalizer() const { return 1.0f / l; }
+
+    /// Fold one block's (max, exp-sum) into the running state and report both O scaling factors.
+    /// After the call: m = max(m_old, new_m), l = l_old * exp(m_old - m) + new_l * exp(new_m - m).
+    /// @param new_m max value in the new block
+    /// @param new_l sum of exp(x - new_m) in the new block
+    /// @param rescale_existing output: exp(m_old - m), factor to multiply existing O by
+    /// @param scale_new output: exp(new_m - m), factor to multiply new P@V by
+    __device__ __forceinline__ void update_with_rescale(float new_m, float new_l,
+                                                        float& rescale_existing, float& scale_new) {
+        float m_old = m;
+        float l_old = l;
+        float m_new = fmaxf(m_old, new_m);
+        l = l_old * expf(m_old - m_new) + new_l * expf(new_m - m_new);
+        m = m_new;
+
+        // NOTE(review): if m_old and new_m are both -INFINITY these differences are NaN - confirm callers exclude it
+        rescale_existing = expf(m_old - m_new);  // For existing O
+        scale_new = expf(new_m - m_new);         // For new P @ V contribution
+    }
 };
 
 // =============================================================================
diff --git a/src/kernels/impl/tile_io.cuh b/src/kernels/impl/tile_io.cuh
@@ -422,14 +422,54 @@ __device__ __forceinline__ void matmul_AtB(const float* __restrict__ A,  // KxM
     }
 }
 
-// Tiling configuration
-struct TilingConfig {
+// =============================================================================
+// Tiling Configuration
+// =============================================================================
+// Centralized tiling configuration for FlashAttention kernels.
+// Forward and backward passes have separate configurations because the
+// backward pass needs smaller blocks at head_dim 128 to fit in shared memory.
+
+/// Tiling configuration for forward pass.
+/// Tiles are 64x64 by default and 32x32 for head_dim 128.
+struct ForwardTilingConfig {
+    // Standard block sizes for head_dim 32 and 64
+    static constexpr int BLOCK_M = 64;  // Q block rows
+    static constexpr int BLOCK_N = 64;  // K/V block rows
+
+    // Smaller blocks for head_dim 128 (shared memory constraint)
+    static constexpr int BLOCK_M_HD128 = 32;
+    static constexpr int BLOCK_N_HD128 = 32;
+
+    static constexpr int NUM_THREADS = 128;
+    static constexpr int WARP_SIZE = 32;
+};
+
+/// Tiling configuration for backward pass.
+/// Same 64x64 default as the forward pass; only the head_dim 128 tile is smaller (16x32 vs. 32x32).
+struct BackwardTilingConfig {
+    // Standard block sizes for head_dim 32 and 64
     static constexpr int BLOCK_M = 64;  // Q block rows
     static constexpr int BLOCK_N = 64;  // K/V block rows
-    static constexpr int BLOCK_K = 64;  // Head dimension tile
+
+    // Smaller blocks for head_dim 128 (reduced to leave shared memory for dQ, dK, dV)
+    static constexpr int BLOCK_M_HD128 = 16;
+    static constexpr int BLOCK_N_HD128 = 32;
+
     static constexpr int NUM_THREADS = 128;
     static constexpr int WARP_SIZE = 32;
 };
 
+/// Supported head dimensions - intended single source of truth for validation and kernel dispatch.
+/// NOTE(review): is_supported_head_dim() below repeats these literals instead of reading this array.
+inline constexpr int SUPPORTED_HEAD_DIMS[] = {32, 64, 128};
+
+/// Returns true when head_dim is 32, 64, or 128 (keep in sync with SUPPORTED_HEAD_DIMS).
+inline constexpr bool is_supported_head_dim(int head_dim) {
+    return head_dim == 32 || head_dim == 64 || head_dim == 128;
+}
+
+// Legacy alias for backward compatibility
+using TilingConfig = ForwardTilingConfig;
+
 }  // namespace impl
 }  // namespace cuflash
diff --git a/tests/unit/test_matmul.cu b/tests/unit/test_matmul.cu
@@ -1,5 +1,13 @@
 // Matmul Unit Tests
 // Tests for cuflash::kernels::matmul_* operations
+//
+// NOTE: These tests validate the standalone kernel API for matmul operations.
+// The production forward/backward kernels use impl::matmul_* directly in
+// shared memory for better performance. The kernels::matmul_* API provides
+// a testable interface for the same underlying algorithms.
+//
+// Test coverage: The numerical correctness of impl::matmul_* is indirectly
+// validated through the end-to-end tests in tests/integration/.
 
 #include <gtest/gtest.h>
 #if CUFLASH_ENABLE_RAPIDCHECK
diff --git a/tests/unit/test_tile_io.cu b/tests/unit/test_tile_io.cu
@@ -1,5 +1,14 @@
 // Tile I/O Unit Tests
 // Tests for cuflash::kernels::load_tile and store_tile operations
+//
+// NOTE: These tests validate the standalone kernel API for tile I/O operations.
+// The production forward/backward kernels use impl::load_tile_to_shared and
+// impl::store_tile_from_shared directly for better performance (avoiding
+// kernel launch overhead for each tile). The kernels::* API provides a
+// testable interface for the same underlying algorithms.
+//
+// Test coverage: The correctness of impl::* tile functions is validated through
+// end-to-end tests in tests/integration/.
 
 #include <gtest/gtest.h>
 #if CUFLASH_ENABLE_RAPIDCHECK

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`#include "cuflash/flash_attention.h"`
	`2`	`+#include "impl/tile_io.cuh" // For is_supported_head_dim`
`2`	`3`
`3`	`4`	`namespace cuflash {`
`4`	`5`
`@@ -55,8 +56,8 @@ static FlashAttentionError validate_common_params(const void* Q, const void* K,`
`55`	`56`	`return FlashAttentionError::INVALID_DIMENSION;`
`56`	`57`	`}`
`57`	`58`
`58`		`- // Check supported head_dim values`
`59`		`- if (head_dim != 32 && head_dim != 64 && head_dim != 128) {`
	`59`	`+ // Check supported head_dim values (centralized check)`
	`60`	`+ if (!impl::is_supported_head_dim(head_dim)) {`
`60`	`61`	`return FlashAttentionError::UNSUPPORTED_HEAD_DIM;`
`61`	`62`	`}`
`62`	`63`