Skip to content

Commit 634590b

Browse files
Add Qwen Moe (#2163)
* qwen moe init commit
* wip
* wip
* weight conversion wip
* weight matching complete
* update the docstrings + configs
* remove incorrect import
* wip
* updates
* updates
* updates
* bug fix
* add aux loss
* address comments
* causal lm test
* add tests
* add docstrings
* small bug fix
* bug fix in aux loss
* update init.py
* address comments
* bug fixes
* address comments
* code format
* update
* update
* address comments
* update
* format

---------

Co-authored-by: Anshuman Mishra <[email protected]>
Co-authored-by: Anshuman Mishra <[email protected]>
1 parent ec0a914 commit 634590b

17 files changed

+2578 -1 lines changed (2578 additions, 1 deletion)

keras_hub/api/models/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,15 @@
420420
from keras_hub.src.models.qwen.qwen_tokenizer import (
421421
QwenTokenizer as QwenTokenizer,
422422
)
423+
from keras_hub.src.models.qwen_moe.qwen_moe_backbone import (
424+
QwenMoeBackbone as QwenMoeBackbone,
425+
)
426+
from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm import (
427+
QwenMoeCausalLM as QwenMoeCausalLM,
428+
)
429+
from keras_hub.src.models.qwen_moe.qwen_moe_causal_lm_preprocessor import (
430+
QwenMoeCausalLMPreprocessor as QwenMoeCausalLMPreprocessor,
431+
)
423432
from keras_hub.src.models.resnet.resnet_backbone import (
424433
ResNetBackbone as ResNetBackbone,
425434
)

keras_hub/api/tokenizers/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@
6868
from keras_hub.src.models.qwen.qwen_tokenizer import (
6969
QwenTokenizer as QwenTokenizer,
7070
)
71+
from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import (
72+
QwenMoeTokenizer as QwenMoeTokenizer,
73+
)
7174
from keras_hub.src.models.roberta.roberta_tokenizer import (
7275
RobertaTokenizer as RobertaTokenizer,
7376
)

keras_hub/src/models/qwen/qwen_attention.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,9 @@ def _compute_attention(
287287
if self.use_sliding_window_attention:
288288
attention_mask = self._mask_sliding_window(
289289
attention_mask,
290-
cache_update_index=cache_update_index,
290+
cache_update_index=cache_update_index
291+
if cache_update_index
292+
else 0,
291293
)
292294
attention_scores = self._masked_softmax(
293295
attention_scores, attention_mask

keras_hub/src/models/qwen_moe/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)