From 416db71f5ec0cd242a9f83c1efb9cf118af33ebc Mon Sep 17 00:00:00 2001
From: Andrew Chan
Date: Tue, 3 Jun 2025 07:21:33 +0000
Subject: [PATCH 1/3] remove unused rope args
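
All call sites pass c.qk_rope_head_dim for both the d and head_dim
arguments, so j_head = i % head_dim is always just i and the
j_head >= rotary_dim branch in rope_v3 can never be taken. Drop the
redundant head_dim/rotary_dim plumbing and compute the frequency
directly from d.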
---
 src/infer.cpp | 48 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/src/infer.cpp b/src/infer.cpp
index d084d60..3d5c8ef 100644
--- a/src/infer.cpp
+++ b/src/infer.cpp
@@ -645,14 +645,13 @@ inline float clip(float x, float v) {
   return x < -v ? -v : (x > v ? v : x);
 }
 
-static void rope(float* buf, float* vec, int d, int head_dim, int pos, float theta) {
+static void rope(float* buf, float* vec, int d, int pos, float theta) {
   // For some reason, DeepSeek-V2 was trained using rope output
   // layout transposed compared to the input. This means we need a buffer
   // to hold intermediate results.
   assert(d % 2 == 0);
   for (int i = 0; i < d; i += 2) {
-    int j_head = i % head_dim;
-    float freq = 1.0f / powf(theta, (float)j_head / (float)head_dim);
+    float freq = 1.0f / powf(theta, (float)i / (float)d);
     float val = pos * freq;
     float fcr = cosf(val);
     float fci = sinf(val);
@@ -667,12 +666,9 @@ static void rope(float* buf, float* vec, int d, int head_dim, int pos, float the
   }
 }
 
-static void rope_v3(float* vec, int d, int head_dim, int pos, float theta) {
-  int rotary_dim = head_dim;
-
+static void rope_v3(float* vec, int d, int pos, float theta) {
   for (int i = 0; i < d; i += 2) {
-    int j_head = i % head_dim;
-    float freq = j_head >= rotary_dim ? 0.f : 1.0f / powf(theta, (float)j_head / (float)rotary_dim);
+    float freq = 1.0f / powf(theta, (float)i / (float)d);
     float val = pos * freq;
     float fcr = cosf(val);
     float fci = sinf(val);
@@ -684,14 +680,13 @@ static void rope_v3(float* vec, int d, int head_dim, int pos, float theta) {
   }
 }
 
-static void rope(float* buf, f16_t* vec, int d, int head_dim, int pos, float theta) {
+static void rope(float* buf, f16_t* vec, int d, int pos, float theta) {
   // For some reason, DeepSeek-V2 was trained using rope output
   // layout transposed compared to the input. This means we need a buffer
   // to hold intermediate results.
   assert(d % 2 == 0);
   for (int i = 0; i < d; i += 2) {
-    int j_head = i % head_dim;
-    float freq = 1.0f / powf(theta, (float)j_head / (float)head_dim);
+    float freq = 1.0f / powf(theta, (float)i / (float)d);
     float val = pos * freq;
     float fcr = cosf(val);
     float fci = sinf(val);
@@ -706,12 +701,9 @@ static void rope(float* buf, f16_t* vec, int d, int head_dim, int pos, float the
   }
 }
 
-static void rope_v3(f16_t* vec, int d, int head_dim, int pos, float theta) {
-  int rotary_dim = head_dim;
-
+static void rope_v3(f16_t* vec, int d, int pos, float theta) {
   for (int i = 0; i < d; i += 2) {
-    int j_head = i % head_dim;
-    float freq = j_head >= rotary_dim ? 0.f : 1.0f / powf(theta, (float)j_head / (float)rotary_dim);
+    float freq = 1.0f / powf(theta, (float)i / (float)d);
     float val = pos * freq;
     float fcr = cosf(val);
     float fci = sinf(val);
@@ -958,17 +950,17 @@ void BlockMHA::_attention_impl(
   bool is_v3 = c.has_moegate_bias;
   for (int h = 0; h < c.n_heads; h++) {
     if (is_v3) {
-      rope_v3(s.q(h) + q_pe_offset, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+      rope_v3(s.q(h) + q_pe_offset, c.qk_rope_head_dim, pos, c.rope_theta);
     } else {
-      rope(s.ropebuf(), s.q(h) + q_pe_offset, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+      rope(s.ropebuf(), s.q(h) + q_pe_offset, c.qk_rope_head_dim, pos, c.rope_theta);
     }
   }
   int kv_pe_offset = c.kv_lora_rank;
   float* k_rope = s.kv_a() + kv_pe_offset;
   if (is_v3) {
-    rope_v3(k_rope, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope_v3(k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   } else {
-    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   }
   // rms norm to non-pe chunk of kv_a
   rmsnorm(s.kv_a(), s.kv_a(), this->rms_kv_a_weight(), c.kv_lora_rank, c.norm_eps);
@@ -1012,9 +1004,9 @@
     for (int h = 0; h < c.n_heads; h++) {
       f16_t* kh = key + h * c.head_dim;
       if (is_v3) {
-        rope_v3(kh + q_pe_offset, c.qk_rope_head_dim, c.qk_rope_head_dim, 1, c.rope_theta);
+        rope_v3(kh + q_pe_offset, c.qk_rope_head_dim, 1, c.rope_theta);
      } else {
-        rope(s.ropebuf(), kh + q_pe_offset, c.qk_rope_head_dim, c.qk_rope_head_dim, 1, c.rope_theta);
+        rope(s.ropebuf(), kh + q_pe_offset, c.qk_rope_head_dim, 1, c.rope_theta);
      }
    }
  }
@@ -1073,17 +1065,17 @@ void BlockMLA::_attention_impl(
   bool is_v3 = c.has_moegate_bias;
   for (int h = 0; h < c.n_heads; h++) {
     if (is_v3) {
-      rope_v3(s.q_rope(h), c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+      rope_v3(s.q_rope(h), c.qk_rope_head_dim, pos, c.rope_theta);
     } else {
-      rope(s.ropebuf(), s.q_rope(h), c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+      rope(s.ropebuf(), s.q_rope(h), c.qk_rope_head_dim, pos, c.rope_theta);
     }
   }
   int kv_pe_offset = c.kv_lora_rank;
   float* k_rope = s.kv_a() + kv_pe_offset;
   if (is_v3) {
-    rope_v3(k_rope, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope_v3(k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   } else {
-    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   }
   // rms norm to non-pe chunk of kv_a (compressed latent kv)
   rmsnorm(s.kv_a(), s.kv_a(), this->rms_kv_a_weight(), c.kv_lora_rank, c.norm_eps);
@@ -1103,9 +1095,9 @@
   for (int r = 0; r < kv_sink; r++) {
     f16_t* kv = this->kv_rope_cache(r);
     if (is_v3) {
-      rope_v3(kv, c.qk_rope_head_dim, c.qk_rope_head_dim, 1, c.rope_theta);
+      rope_v3(kv, c.qk_rope_head_dim, 1, c.rope_theta);
     } else {
-      rope(s.ropebuf(), kv, c.qk_rope_head_dim, c.qk_rope_head_dim, 1, c.rope_theta);
+      rope(s.ropebuf(), kv, c.qk_rope_head_dim, 1, c.rope_theta);
     }
   }
 

From 7ac6d2a558d0a50abc901860e3646a132797ecd1 Mon Sep 17 00:00:00 2001
From: Andrew Chan
Date: Tue, 3 Jun 2025 07:58:44 +0000
Subject: [PATCH 2/3] add yarn for v2
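
DeepSeek-V2 is trained with yarn rope scaling, which the non-v3 rope()
path did not apply. Add the yarn helpers (yarn_find_correction_dim,
yarn_linear_ramp, yarn_get_mscale) and use them in rope(), plumbing the
rope-scaling config (rs_factor, rs_beta_fast, rs_beta_slow, rs_mscale,
rs_mscale_all_dim, rs_original_max_pos) through from Config; the field
rs_original_max_position_embeddings is shortened to rs_original_max_pos
along the way. Roughly, for each rotary pair i the code now computes

  freq_extra = theta^(-i/d)             (original, extrapolated freq)
  freq_inter = freq_extra / rs_factor   (interpolated freq)
  ramp       = clamp((i/2 - low) / (high - low), 0, 1)
  inv_freq   = ramp * freq_inter + (1 - ramp) * freq_extra

so pairs below the low correction dim keep the original frequency,
pairs above the high one use the interpolated frequency, and cos/sin
are scaled by yarn_get_mscale(rs_factor, rs_mscale) /
yarn_get_mscale(rs_factor, rs_mscale_all_dim).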
---
 src/infer.cpp | 119 ++++++++++++++++++++++++++++++++++++++++++--------
 src/model.cpp |   2 +-
 src/model.h   |   2 +-
 3 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/src/infer.cpp b/src/infer.cpp
index 3d5c8ef..68c90ac 100644
--- a/src/infer.cpp
+++ b/src/infer.cpp
@@ -645,16 +645,44 @@ inline float clip(float x, float v) {
   return x < -v ? -v : (x > v ? v : x);
 }
 
-static void rope(float* buf, float* vec, int d, int pos, float theta) {
+inline float yarn_find_correction_dim(int num_rotations, int dim, int base, int max_pos) {
+  return (dim * logf(max_pos / (num_rotations * 2 * M_PI))) / (2 * logf(base));
+}
+
+inline float yarn_linear_ramp(float min, float max, int i) {
+  float lin = (i - min) / (max - min);
+  return std::clamp(lin, 0.0f, 1.0f);
+}
+
+inline float yarn_get_mscale(float scale, float mscale) {
+  return scale <= 1.0f ? 1.0f : 0.1 * mscale * logf(scale) + 1.0f;
+}
+
+static void rope(
+  float* buf, float* vec,
+  int d, int pos,
+  float theta, float scaling_factor,
+  int beta_fast, int beta_slow,
+  float mscale, float mscale_all_dim,
+  int original_max_pos
+) {
   // For some reason, DeepSeek-V2 was trained using rope output
   // layout transposed compared to the input. This means we need a buffer
   // to hold intermediate results.
   assert(d % 2 == 0);
   for (int i = 0; i < d; i += 2) {
-    float freq = 1.0f / powf(theta, (float)i / (float)d);
-    float val = pos * freq;
-    float fcr = cosf(val);
-    float fci = sinf(val);
+    // TODO: cache these values
+    float freq_extra = 1.0f / powf(theta, (float)i / (float)d);
+    float freq_inter = freq_extra / scaling_factor;
+    float low = std::max(0.0f, floorf(yarn_find_correction_dim(beta_fast, d, theta, original_max_pos)));
+    float high = std::min(d - 1.0f, floorf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
+    float inv_freq_mask = 1.0f - yarn_linear_ramp(low, high, i / 2);
+    float inv_freq = freq_inter * (1.0f - inv_freq_mask) + freq_extra * inv_freq_mask;
+    float val = pos * inv_freq;
+    float _mscale =
+        yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim);
+    float fcr = _mscale * cosf(val);
+    float fci = _mscale * sinf(val);
 
     float v0 = vec[i];
     float v1 = vec[i + 1];
@@ -680,16 +708,31 @@ static void rope_v3(float* vec, int d, int pos, float theta) {
   }
 }
 
-static void rope(float* buf, f16_t* vec, int d, int pos, float theta) {
+static void rope(
+  float* buf, f16_t* vec,
+  int d, int pos,
+  float theta, float scaling_factor,
+  int beta_fast, int beta_slow,
+  float mscale, float mscale_all_dim,
+  int original_max_pos
+) {
   // For some reason, DeepSeek-V2 was trained using rope output
   // layout transposed compared to the input. This means we need a buffer
   // to hold intermediate results.
   assert(d % 2 == 0);
   for (int i = 0; i < d; i += 2) {
-    float freq = 1.0f / powf(theta, (float)i / (float)d);
-    float val = pos * freq;
-    float fcr = cosf(val);
-    float fci = sinf(val);
+    // TODO: cache these values
+    float freq_extra = 1.0f / powf(theta, (float)i / (float)d);
+    float freq_inter = freq_extra / scaling_factor;
+    float low = std::max(0.0f, floorf(yarn_find_correction_dim(beta_fast, d, theta, original_max_pos)));
+    float high = std::min(d - 1.0f, floorf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
+    float inv_freq_mask = 1.0f - yarn_linear_ramp(low, high, i / 2);
+    float inv_freq = freq_inter * (1.0f - inv_freq_mask) + freq_extra * inv_freq_mask;
+    float val = pos * inv_freq;
+    float _mscale =
+        yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim);
+    float fcr = _mscale * cosf(val);
+    float fci = _mscale * sinf(val);
 
     float v0 = half_to_float(vec[i]);
     float v1 = half_to_float(vec[i + 1]);
@@ -952,7 +995,14 @@ void BlockMHA::_attention_impl(
     if (is_v3) {
       rope_v3(s.q(h) + q_pe_offset, c.qk_rope_head_dim, pos, c.rope_theta);
     } else {
-      rope(s.ropebuf(), s.q(h) + q_pe_offset, c.qk_rope_head_dim, pos, c.rope_theta);
+      rope(
+        s.ropebuf(), s.q(h) + q_pe_offset,
+        c.qk_rope_head_dim, pos,
+        c.rope_theta, c.rs_factor,
+        c.rs_beta_fast, c.rs_beta_slow,
+        c.rs_mscale, c.rs_mscale_all_dim,
+        c.rs_original_max_pos
+      );
     }
   }
   int kv_pe_offset = c.kv_lora_rank;
@@ -960,7 +1010,14 @@ void BlockMHA::_attention_impl(
   if (is_v3) {
     rope_v3(k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   } else {
-    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope(
+      s.ropebuf(), k_rope,
+      c.qk_rope_head_dim, pos,
+      c.rope_theta, c.rs_factor,
+      c.rs_beta_fast, c.rs_beta_slow,
+      c.rs_mscale, c.rs_mscale_all_dim,
+      c.rs_original_max_pos
+    );
   }
   // rms norm to non-pe chunk of kv_a
   rmsnorm(s.kv_a(), s.kv_a(), this->rms_kv_a_weight(), c.kv_lora_rank, c.norm_eps);
@@ -1006,7 +1063,14 @@ void BlockMHA::_attention_impl(
       if (is_v3) {
         rope_v3(kh + q_pe_offset, c.qk_rope_head_dim, 1, c.rope_theta);
       } else {
-        rope(s.ropebuf(), kh + q_pe_offset, c.qk_rope_head_dim, 1, c.rope_theta);
+        rope(
+          s.ropebuf(), kh + q_pe_offset,
+          c.qk_rope_head_dim, 1,
+          c.rope_theta, c.rs_factor,
+          c.rs_beta_fast, c.rs_beta_slow,
+          c.rs_mscale, c.rs_mscale_all_dim,
+          c.rs_original_max_pos
+        );
       }
     }
   }
@@ -1067,7 +1131,14 @@ void BlockMLA::_attention_impl(
     if (is_v3) {
       rope_v3(s.q_rope(h), c.qk_rope_head_dim, pos, c.rope_theta);
     } else {
-      rope(s.ropebuf(), s.q_rope(h), c.qk_rope_head_dim, pos, c.rope_theta);
+      rope(
+        s.ropebuf(), s.q_rope(h),
+        c.qk_rope_head_dim, pos,
+        c.rope_theta, c.rs_factor,
+        c.rs_beta_fast, c.rs_beta_slow,
+        c.rs_mscale, c.rs_mscale_all_dim,
+        c.rs_original_max_pos
+      );
     }
   }
   int kv_pe_offset = c.kv_lora_rank;
@@ -1075,7 +1146,14 @@ void BlockMLA::_attention_impl(
   if (is_v3) {
     rope_v3(k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
   } else {
-    rope(s.ropebuf(), k_rope, c.qk_rope_head_dim, pos, c.rope_theta);
+    rope(
+      s.ropebuf(), k_rope,
+      c.qk_rope_head_dim, pos,
+      c.rope_theta, c.rs_factor,
+      c.rs_beta_fast, c.rs_beta_slow,
+      c.rs_mscale, c.rs_mscale_all_dim,
+      c.rs_original_max_pos
+    );
   }
   // rms norm to non-pe chunk of kv_a (compressed latent kv)
   rmsnorm(s.kv_a(), s.kv_a(), this->rms_kv_a_weight(), c.kv_lora_rank, c.norm_eps);
@@ -1097,7 +1175,14 @@ void BlockMLA::_attention_impl(
     if (is_v3) {
       rope_v3(kv, c.qk_rope_head_dim, 1, c.rope_theta);
     } else {
-      rope(s.ropebuf(), kv, c.qk_rope_head_dim, 1, c.rope_theta);
+      rope(
+        s.ropebuf(), kv,
+        c.qk_rope_head_dim, 1,
+        c.rope_theta, c.rs_factor,
+        c.rs_beta_fast, c.rs_beta_slow,
+        c.rs_mscale, c.rs_mscale_all_dim,
+        c.rs_original_max_pos
+      );
     }
   }
 
@@ -1263,7 +1348,7 @@ void Model::_forward_cpu(InferenceState& s, int token, int pos, InferenceMode mo
   // When decoding past the context length, keep the first few tokens in the KV cache
   // untouched as "attention sinks" while replacing the rest in ring order.
   // See StreamingLLM (https://arxiv.org/pdf/2309.17453) for more.
-  int original_max_position = c.rs_original_max_position_embeddings;
+  int original_max_position = c.rs_original_max_pos;
   int kv_sink = pos >= original_max_position ? KV_SINKS : 0;
   int kv_pos = kv_sink + (pos - kv_sink) % (original_max_position - kv_sink);
   int kv_len = pos >= original_max_position ? original_max_position : pos + 1;
diff --git a/src/model.cpp b/src/model.cpp
index 422ef5e..034ea09 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -123,7 +123,7 @@ void Config::from_yalm(YALMData& yalm, int context) {
   rs_factor = std::stof(yalm.metadata.at("rope_scaling_factor").get());
   rs_mscale = std::stof(yalm.metadata.at("rope_scaling_mscale").get());
   rs_mscale_all_dim = std::stof(yalm.metadata.at("rope_scaling_mscale_all_dim").get());
-  rs_original_max_position_embeddings = std::stoi(yalm.metadata.at("rope_scaling_original_max_position_embeddings").get());
+  rs_original_max_pos = std::stoi(yalm.metadata.at("rope_scaling_original_max_position_embeddings").get());
 }
 
 std::optional check_tensor(const Tensor* tensor, Quant weight_quant, std::array shape, const int debug_line) {
diff --git a/src/model.h b/src/model.h
index b999d13..0de2a48 100644
--- a/src/model.h
+++ b/src/model.h
@@ -89,7 +89,7 @@ struct Config {
   float rs_factor;
   float rs_mscale;
   float rs_mscale_all_dim;
-  int rs_original_max_position_embeddings;
+  int rs_original_max_pos;
 
   // If nonzero `context` is supplied, max sequence length is limited to `context`.
   void from_yalm(YALMData& yalm, int context = 0);

From e9354fa1708604ff4f6a276e403df6cc5554a882 Mon Sep 17 00:00:00 2001
From: Andrew Chan
Date: Thu, 5 Jun 2025 01:08:39 +0000
Subject: [PATCH 3/3] fixes
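
Two fixes to the new yarn path: yarn_linear_ramp divides by
(max - min), so a collapsed correction range (low == high) would divide
by zero; nudge max by a small epsilon to avoid the singularity. Also
round the high end of the correction range with ceilf instead of
floorf, matching the usual floor(low)/ceil(high) yarn formulation.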
---
 src/infer.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/infer.cpp b/src/infer.cpp
index 68c90ac..13fdda8 100644
--- a/src/infer.cpp
+++ b/src/infer.cpp
@@ -650,6 +650,9 @@ inline float yarn_find_correction_dim(int num_rotations, int dim, int base, int
 }
 
 inline float yarn_linear_ramp(float min, float max, int i) {
+  if (min == max) {
+    max += 0.001; // Prevent singularity
+  }
   float lin = (i - min) / (max - min);
   return std::clamp(lin, 0.0f, 1.0f);
 }
@@ -675,7 +678,7 @@ static void rope(
     float freq_extra = 1.0f / powf(theta, (float)i / (float)d);
     float freq_inter = freq_extra / scaling_factor;
     float low = std::max(0.0f, floorf(yarn_find_correction_dim(beta_fast, d, theta, original_max_pos)));
-    float high = std::min(d - 1.0f, floorf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
+    float high = std::min(d - 1.0f, ceilf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
     float inv_freq_mask = 1.0f - yarn_linear_ramp(low, high, i / 2);
     float inv_freq = freq_inter * (1.0f - inv_freq_mask) + freq_extra * inv_freq_mask;
     float val = pos * inv_freq;
@@ -725,7 +728,7 @@ static void rope(
     float freq_extra = 1.0f / powf(theta, (float)i / (float)d);
     float freq_inter = freq_extra / scaling_factor;
     float low = std::max(0.0f, floorf(yarn_find_correction_dim(beta_fast, d, theta, original_max_pos)));
-    float high = std::min(d - 1.0f, floorf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
+    float high = std::min(d - 1.0f, ceilf(yarn_find_correction_dim(beta_slow, d, theta, original_max_pos)));
     float inv_freq_mask = 1.0f - yarn_linear_ramp(low, high, i / 2);
     float inv_freq = freq_inter * (1.0f - inv_freq_mask) + freq_extra * inv_freq_mask;
     float val = pos * inv_freq;