| from transformers import PretrainedConfig |
| from typing import List |
|
|
|
|
| |
|
|
class ModularStarEncoderConfig(PretrainedConfig):
    """Configuration container for the ModularStarEncoder model.

    Every constructor argument is stored on the instance under the same name.
    The special-token ids and any additional keyword arguments are forwarded
    to ``PretrainedConfig.__init__``.

    Args:
        _attn_implementation: Attention backend; must be
            ``"flash_attention_2"`` or ``"sdpa"``.
        matryoshka_layers: Layer indices associated with
            ``layer_matryoshka_loss`` (presumably the layers at which the
            loss is applied -- confirm against the modelling code). The
            sequence is copied so instances never share the default list.
        bos_token_id, eos_token_id, pad_token_id: Special-token ids, stored
            here and also forwarded to the base class.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

        All remaining arguments are stored verbatim as attributes of the
        same name.

    Raises:
        ValueError: If ``_attn_implementation`` is not one of the two
            supported values.
    """

    model_type = "ModularStarEncoder"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        attention_dropout=0.1,
        residual_dropout=0.1,
        embedding_dropout=0.1,
        bos_token_id=0,
        eos_token_id=0,
        hidden_act="gelu_pytorch_tanh",
        _attn_implementation="flash_attention_2",
        hidden_size=1024,
        conditional_size=4,
        initializer_range=0.018042,
        intermediate_size=12288,
        max_position_embeddings=2048,
        mlp_type="default",
        model_type="starcoder2",
        torch_dtype="bfloat16",
        layer_matryoshka_loss=True,
        matryoshka_layers=[4, 9, 18, 27, 36],
        norm_epsilon=1e-05,
        layer_norm_eps=1e-05,
        norm_type="layer_norm",
        num_attention_heads=16,
        num_hidden_layers=36,
        num_key_value_heads=4,
        rope_theta=999999.4420358813,
        sliding_window=None,
        transformers_version="4.39.3",
        use_bias=True,
        use_cache=False,
        vocab_size=49156,
        pad_token_id=0,
        **kwargs,
    ):
        if _attn_implementation not in ["flash_attention_2", "sdpa"]:
            raise ValueError(
                f"`_attn_implementation` must be 'flash_attention_2', 'sdpa', got {_attn_implementation}."
            )

        # NOTE: none of these assignments may end in a trailing comma;
        # `self.x = x,` stores the 1-tuple `(x,)` instead of the value.

        # Dropout rates.
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.embedding_dropout = embedding_dropout

        # Special tokens (also forwarded to the base class below).
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id

        # Model dimensions and layout.
        self.hidden_size = hidden_size
        self.conditional_size = conditional_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.vocab_size = vocab_size

        # Layer behaviour.
        self.hidden_act = hidden_act
        self.mlp_type = mlp_type
        self.norm_type = norm_type
        self.norm_epsilon = norm_epsilon
        self.layer_norm_eps = layer_norm_eps
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.rope_theta = rope_theta
        self.sliding_window = sliding_window
        self.use_cache = use_cache

        # Matryoshka settings. Copy the sequence so that mutating one
        # config's list cannot leak into the shared default argument or
        # into other instances.
        self.layer_matryoshka_loss = layer_matryoshka_loss
        self.matryoshka_layers = (
            None if matryoshka_layers is None else list(matryoshka_layers)
        )

        # This shadows the class-level `model_type` on the instance, as the
        # original code did.
        self.model_type = model_type

        # NOTE(review): `_attn_implementation`, `torch_dtype` and
        # `transformers_version` look like attributes the base class manages
        # itself; `PretrainedConfig.__init__` may overwrite them when they
        # are not passed through its kwargs -- confirm against the pinned
        # transformers version before relying on these values.
        self._attn_implementation = _attn_implementation
        self.torch_dtype = torch_dtype
        self.transformers_version = transformers_version

        # Forward all three special-token ids so the base initializer sees
        # the same values that were stored above.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )
|
|