Kilinskiy committed on
Commit a432d10 · verified · 1 Parent(s): e124a7f

Upload config.json.original

Files changed (1):
  config.json.original  +311 -0
config.json.original ADDED
@@ -0,0 +1,311 @@
{
  "architectures": [
    "Step3p5ForCausalLM"
  ],
  "att_impl_type": "GQA",
  "attention_other_setting": {
    "attention_type": "sliding_attention",
    "head_dim": 128,
    "num_attention_groups": 8,
    "num_attention_heads": 96,
    "true_head_dim": 128
  },
  "auto_map": {
    "AutoConfig": "configuration_step3p5.Step3p5Config",
    "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
  },
  "bos_token_id": 0,
  "dtype": "bfloat16",
  "eos_token_id": 1,
  "head_dim": 128,
  "hidden_size": 4096,
  "intermediate_size": 11264,
  "layer_types": [
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention"
  ],
  "max_position_embeddings": 262144,
  "max_seq_len": 262144,
  "model_type": "step3p5",
  "moe_every_n_layer": 1,
  "moe_intermediate_size": 1280,
  "moe_layer_offset": 0,
  "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
  "moe_num_experts": 288,
  "moe_router_activation": "sigmoid",
  "moe_router_scaling_factor": 3.0,
  "moe_top_k": 8,
  "need_fp32_gate": true,
  "norm_expert_weight": true,
  "num_attention_groups": 8,
  "num_attention_heads": 64,
  "num_hidden_layers": 45,
  "num_nextn_predict_layers": 3,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "partial_rotary_factor": 0.5,
  "partial_rotary_factors": [
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0,
    0.5,
    1.0,
    1.0,
    1.0
  ],
  "rms_norm_eps": 1e-05,
  "rope_parameters": null,
  "rope_theta": [
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0,
    5000000.0,
    10000.0,
    10000.0,
    10000.0
  ],
  "share_expert_dim": 1280,
  "sink": false,
  "sliding_window": 512,
  "swiglu_limits": [
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    7,
    7,
    0.0,
    0.0,
    0.0
  ],
  "swiglu_limits_shared": [
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    16,
    0.0,
    0.0,
    0.0
  ],
  "transformers_version": "5.1.0",
  "use_cache": false,
  "use_head_wise_attn_gate": true,
  "use_moe": true,
  "use_moe_router_bias": true,
  "use_qk_norm": true,
  "use_return_dict": true,
  "use_rope_layers": [],
  "vocab_size": 128896,
  "yarn_only_types": [
    "full_attention"
  ],
  "zero_centered": true
}
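
For reference, a minimal sketch of reading this file: the snippet below assumes the uploaded config.json.original sits in the current working directory and uses only the Python standard library to print a few of the fields shown above (model type, layer count, expert count, context length) and to tally how many layers use sliding-window versus full attention.

import json
from collections import Counter

# Parse the uploaded config file (assumed to be in the working directory).
with open("config.json.original", encoding="utf-8") as f:
    cfg = json.load(f)

print(cfg["model_type"])               # step3p5
print(cfg["num_hidden_layers"])        # 45
print(cfg["moe_num_experts"])          # 288
print(cfg["max_position_embeddings"])  # 262144

# Per-layer attention schedule: every fourth entry is full attention,
# the rest use the 512-token sliding window from "sliding_window".
print(Counter(cfg["layer_types"]))     # 12 full_attention, 36 sliding_attention

Loading the model itself would go through the custom classes referenced in "auto_map" (configuration_step3p5.Step3p5Config / modeling_step3p5.Step3p5ForCausalLM), which requires passing trust_remote_code=True to transformers.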