DatPySci committed on
Commit
3ef1fa4
·
verified ·
1 Parent(s): 324300d

Delete files models/OLMo-1B/config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. models/OLMo-1B/config.yaml +0 -413
models/OLMo-1B/config.yaml DELETED
@@ -1,413 +0,0 @@
1
- run_name: OLMo-1B-as_fm3_omi2
2
- seed: 6198
3
- epoch: null
4
- dry_run: false
5
- model:
6
- d_model: 2048
7
- n_heads: 16
8
- n_kv_heads: null
9
- clip_qkv: null
10
- n_layers: 16
11
- mlp_ratio: 8
12
- mlp_hidden_size: null
13
- activation_type: swiglu
14
- block_type: sequential
15
- block_group_size: 1
16
- alibi: false
17
- alibi_bias_max: 8.0
18
- rope: true
19
- rope_full_precision: true
20
- rope_theta: 10000
21
- flash_attention: false
22
- attention_dropout: 0.0
23
- multi_query_attention: false
24
- attention_layer_norm: false
25
- residual_dropout: 0.0
26
- embedding_dropout: 0.0
27
- embedding_layer_norm: false
28
- layer_norm_type: default
29
- layer_norm_with_affine: false
30
- layer_norm_eps: 1.0e-05
31
- attention_layer_norm_with_affine: false
32
- max_sequence_length: 2048
33
- include_bias: false
34
- bias_for_layer_norm: false
35
- scale_logits: false
36
- vocab_size: 32000
37
- embedding_size: 32000
38
- weight_tying: true
39
- eos_token_id: 0
40
- pad_token_id: 1
41
- init_device: cuda
42
- init_fn: normal
43
- init_std: 0.02
44
- init_cutoff_factor: 3.0
45
- precision: amp_bf16
46
- scale_emb_init: false
47
- emb_init_std: null
48
- norm_after: false
49
- optimizer:
50
- name: adamw
51
- learning_rate: 0.0005
52
- weight_decay: 0.1
53
- betas:
54
- - 0.9
55
- - 0.95
56
- eps: 1.0e-08
57
- no_decay_norm_and_bias: null
58
- selective_updates: false
59
- decay_norm_and_bias: true
60
- decay_embeddings: true
61
- metrics_log_interval: 10
62
- record_update_metrics: false
63
- scheduler:
64
- name: cosine_with_warmup
65
- units: steps
66
- t_warmup: 2000
67
- t_max: null
68
- alpha_f: 0.1
69
- grad_clip_warmup_steps: null
70
- grad_clip_warmup_factor: null
71
- warmup_min_lr: 0.0
72
- data:
73
- paths:
74
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
88
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00019_00000_doc_shuffled.ds
89
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
90
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
91
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
92
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
93
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
94
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
95
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
96
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
97
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
98
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
99
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
100
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
101
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00032_00000_doc_shuffled.ds
102
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00033_00000_doc_shuffled.ds
103
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00034_00000_doc_shuffled.ds
104
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00035_00000_doc_shuffled.ds
105
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00036_00000_doc_shuffled.ds
106
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00037_00000_doc_shuffled.ds
107
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00038_00000_doc_shuffled.ds
108
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00039_00000_doc_shuffled.ds
109
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00040_00000_doc_shuffled.ds
110
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00041_00000_doc_shuffled.ds
111
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00042_00000_doc_shuffled.ds
112
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00043_00000_doc_shuffled.ds
113
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00044_00000_doc_shuffled.ds
114
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00045_00000_doc_shuffled.ds
115
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00046_00000_doc_shuffled.ds
116
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00047_00000_doc_shuffled.ds
117
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00048_00000_doc_shuffled.ds
118
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00049_00000_doc_shuffled.ds
119
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00050_00000_doc_shuffled.ds
120
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00051_00000_doc_shuffled.ds
121
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00052_00000_doc_shuffled.ds
122
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00053_00000_doc_shuffled.ds
123
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00054_00000_doc_shuffled.ds
124
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00055_00000_doc_shuffled.ds
125
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00056_00000_doc_shuffled.ds
126
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00057_00000_doc_shuffled.ds
127
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00058_00000_doc_shuffled.ds
128
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00059_00000_doc_shuffled.ds
129
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00060_00000_doc_shuffled.ds
130
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00061_00000_doc_shuffled.ds
131
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00062_00000_doc_shuffled.ds
132
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00063_00000_doc_shuffled.ds
133
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00064_00000_doc_shuffled.ds
134
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00065_00000_doc_shuffled.ds
135
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00066_00000_doc_shuffled.ds
136
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00067_00000_doc_shuffled.ds
137
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00068_00000_doc_shuffled.ds
138
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00069_00000_doc_shuffled.ds
139
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00070_00000_doc_shuffled.ds
140
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00071_00000_doc_shuffled.ds
141
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00072_00000_doc_shuffled.ds
142
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00073_00000_doc_shuffled.ds
143
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00074_00000_doc_shuffled.ds
144
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00075_00000_doc_shuffled.ds
145
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00076_00000_doc_shuffled.ds
146
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00077_00000_doc_shuffled.ds
147
- - data_token/as_fm3_omi2/algebraic-stack-tokenized/00078_00000_doc_shuffled.ds
148
- - data_token/as_fm3_omi2/finemath3-tokenized/00000_00000_doc_shuffled.ds
149
- - data_token/as_fm3_omi2/finemath3-tokenized/00001_00000_doc_shuffled.ds
150
- - data_token/as_fm3_omi2/finemath3-tokenized/00002_00000_doc_shuffled.ds
151
- - data_token/as_fm3_omi2/finemath3-tokenized/00003_00000_doc_shuffled.ds
152
- - data_token/as_fm3_omi2/finemath3-tokenized/00004_00000_doc_shuffled.ds
153
- - data_token/as_fm3_omi2/finemath3-tokenized/00005_00000_doc_shuffled.ds
154
- - data_token/as_fm3_omi2/finemath3-tokenized/00006_00000_doc_shuffled.ds
155
- - data_token/as_fm3_omi2/finemath3-tokenized/00007_00000_doc_shuffled.ds
156
- - data_token/as_fm3_omi2/finemath3-tokenized/00008_00000_doc_shuffled.ds
157
- - data_token/as_fm3_omi2/finemath3-tokenized/00009_00000_doc_shuffled.ds
158
- - data_token/as_fm3_omi2/finemath3-tokenized/00010_00000_doc_shuffled.ds
159
- - data_token/as_fm3_omi2/finemath3-tokenized/00011_00000_doc_shuffled.ds
160
- - data_token/as_fm3_omi2/finemath3-tokenized/00012_00000_doc_shuffled.ds
161
- - data_token/as_fm3_omi2/finemath3-tokenized/00013_00000_doc_shuffled.ds
162
- - data_token/as_fm3_omi2/finemath3-tokenized/00014_00000_doc_shuffled.ds
163
- - data_token/as_fm3_omi2/finemath3-tokenized/00015_00000_doc_shuffled.ds
164
- - data_token/as_fm3_omi2/finemath3-tokenized/00016_00000_doc_shuffled.ds
165
- - data_token/as_fm3_omi2/finemath3-tokenized/00017_00000_doc_shuffled.ds
166
- - data_token/as_fm3_omi2/finemath3-tokenized/00018_00000_doc_shuffled.ds
167
- - data_token/as_fm3_omi2/finemath3-tokenized/00019_00000_doc_shuffled.ds
168
- - data_token/as_fm3_omi2/finemath3-tokenized/00020_00000_doc_shuffled.ds
169
- - data_token/as_fm3_omi2/finemath3-tokenized/00021_00000_doc_shuffled.ds
170
- - data_token/as_fm3_omi2/finemath3-tokenized/00022_00000_doc_shuffled.ds
171
- - data_token/as_fm3_omi2/finemath3-tokenized/00023_00000_doc_shuffled.ds
172
- - data_token/as_fm3_omi2/finemath3-tokenized/00024_00000_doc_shuffled.ds
173
- - data_token/as_fm3_omi2/finemath3-tokenized/00025_00000_doc_shuffled.ds
174
- - data_token/as_fm3_omi2/finemath3-tokenized/00026_00000_doc_shuffled.ds
175
- - data_token/as_fm3_omi2/finemath3-tokenized/00027_00000_doc_shuffled.ds
176
- - data_token/as_fm3_omi2/finemath3-tokenized/00028_00000_doc_shuffled.ds
177
- - data_token/as_fm3_omi2/finemath3-tokenized/00029_00000_doc_shuffled.ds
178
- - data_token/as_fm3_omi2/finemath3-tokenized/00030_00000_doc_shuffled.ds
179
- - data_token/as_fm3_omi2/finemath3-tokenized/00031_00000_doc_shuffled.ds
180
- - data_token/as_fm3_omi2/finemath3-tokenized/00032_00000_doc_shuffled.ds
181
- - data_token/as_fm3_omi2/finemath3-tokenized/00033_00000_doc_shuffled.ds
182
- - data_token/as_fm3_omi2/finemath3-tokenized/00034_00000_doc_shuffled.ds
183
- - data_token/as_fm3_omi2/finemath3-tokenized/00035_00000_doc_shuffled.ds
184
- - data_token/as_fm3_omi2/finemath3-tokenized/00036_00000_doc_shuffled.ds
185
- - data_token/as_fm3_omi2/finemath3-tokenized/00037_00000_doc_shuffled.ds
186
- - data_token/as_fm3_omi2/finemath3-tokenized/00038_00000_doc_shuffled.ds
187
- - data_token/as_fm3_omi2/finemath3-tokenized/00039_00000_doc_shuffled.ds
188
- - data_token/as_fm3_omi2/finemath3-tokenized/00040_00000_doc_shuffled.ds
189
- - data_token/as_fm3_omi2/finemath3-tokenized/00041_00000_doc_shuffled.ds
190
- - data_token/as_fm3_omi2/finemath3-tokenized/00042_00000_doc_shuffled.ds
191
- - data_token/as_fm3_omi2/finemath3-tokenized/00043_00000_doc_shuffled.ds
192
- - data_token/as_fm3_omi2/finemath3-tokenized/00044_00000_doc_shuffled.ds
193
- - data_token/as_fm3_omi2/finemath3-tokenized/00045_00000_doc_shuffled.ds
194
- - data_token/as_fm3_omi2/finemath3-tokenized/00046_00000_doc_shuffled.ds
195
- - data_token/as_fm3_omi2/finemath3-tokenized/00047_00000_doc_shuffled.ds
196
- - data_token/as_fm3_omi2/finemath3-tokenized/00048_00000_doc_shuffled.ds
197
- - data_token/as_fm3_omi2/finemath3-tokenized/00049_00000_doc_shuffled.ds
198
- - data_token/as_fm3_omi2/finemath3-tokenized/00050_00000_doc_shuffled.ds
199
- - data_token/as_fm3_omi2/finemath3-tokenized/00051_00000_doc_shuffled.ds
200
- - data_token/as_fm3_omi2/finemath3-tokenized/00052_00000_doc_shuffled.ds
201
- - data_token/as_fm3_omi2/finemath3-tokenized/00053_00000_doc_shuffled.ds
202
- - data_token/as_fm3_omi2/finemath3-tokenized/00054_00000_doc_shuffled.ds
203
- - data_token/as_fm3_omi2/finemath3-tokenized/00055_00000_doc_shuffled.ds
204
- - data_token/as_fm3_omi2/finemath3-tokenized/00056_00000_doc_shuffled.ds
205
- - data_token/as_fm3_omi2/finemath3-tokenized/00057_00000_doc_shuffled.ds
206
- - data_token/as_fm3_omi2/finemath3-tokenized/00058_00000_doc_shuffled.ds
207
- - data_token/as_fm3_omi2/finemath3-tokenized/00059_00000_doc_shuffled.ds
208
- - data_token/as_fm3_omi2/finemath3-tokenized/00060_00000_doc_shuffled.ds
209
- - data_token/as_fm3_omi2/finemath3-tokenized/00061_00000_doc_shuffled.ds
210
- - data_token/as_fm3_omi2/finemath3-tokenized/00062_00000_doc_shuffled.ds
211
- - data_token/as_fm3_omi2/finemath3-tokenized/00063_00000_doc_shuffled.ds
212
- - data_token/as_fm3_omi2/finemath3-tokenized/00064_00000_doc_shuffled.ds
213
- - data_token/as_fm3_omi2/finemath3-tokenized/00065_00000_doc_shuffled.ds
214
- - data_token/as_fm3_omi2/finemath3-tokenized/00066_00000_doc_shuffled.ds
215
- - data_token/as_fm3_omi2/finemath3-tokenized/00067_00000_doc_shuffled.ds
216
- - data_token/as_fm3_omi2/finemath3-tokenized/00068_00000_doc_shuffled.ds
217
- - data_token/as_fm3_omi2/finemath3-tokenized/00069_00000_doc_shuffled.ds
218
- - data_token/as_fm3_omi2/finemath3-tokenized/00070_00000_doc_shuffled.ds
219
- - data_token/as_fm3_omi2/finemath3-tokenized/00071_00000_doc_shuffled.ds
220
- - data_token/as_fm3_omi2/finemath3-tokenized/00072_00000_doc_shuffled.ds
221
- - data_token/as_fm3_omi2/finemath3-tokenized/00073_00000_doc_shuffled.ds
222
- - data_token/as_fm3_omi2/finemath3-tokenized/00074_00000_doc_shuffled.ds
223
- - data_token/as_fm3_omi2/finemath3-tokenized/00075_00000_doc_shuffled.ds
224
- - data_token/as_fm3_omi2/finemath3-tokenized/00076_00000_doc_shuffled.ds
225
- - data_token/as_fm3_omi2/finemath3-tokenized/00077_00000_doc_shuffled.ds
226
- - data_token/as_fm3_omi2/finemath3-tokenized/00078_00000_doc_shuffled.ds
227
- - data_token/as_fm3_omi2/finemath3-tokenized/00079_00000_doc_shuffled.ds
228
- - data_token/as_fm3_omi2/finemath3-tokenized/00080_00000_doc_shuffled.ds
229
- - data_token/as_fm3_omi2/finemath3-tokenized/00081_00000_doc_shuffled.ds
230
- - data_token/as_fm3_omi2/finemath3-tokenized/00082_00000_doc_shuffled.ds
231
- - data_token/as_fm3_omi2/finemath3-tokenized/00083_00000_doc_shuffled.ds
232
- - data_token/as_fm3_omi2/finemath3-tokenized/00084_00000_doc_shuffled.ds
233
- - data_token/as_fm3_omi2/finemath3-tokenized/00085_00000_doc_shuffled.ds
234
- - data_token/as_fm3_omi2/finemath3-tokenized/00086_00000_doc_shuffled.ds
235
- - data_token/as_fm3_omi2/finemath3-tokenized/00087_00000_doc_shuffled.ds
236
- - data_token/as_fm3_omi2/finemath3-tokenized/00088_00000_doc_shuffled.ds
237
- - data_token/as_fm3_omi2/finemath3-tokenized/00089_00000_doc_shuffled.ds
238
- - data_token/as_fm3_omi2/finemath3-tokenized/00090_00000_doc_shuffled.ds
239
- - data_token/as_fm3_omi2/finemath3-tokenized/00091_00000_doc_shuffled.ds
240
- - data_token/as_fm3_omi2/finemath3-tokenized/00092_00000_doc_shuffled.ds
241
- - data_token/as_fm3_omi2/finemath3-tokenized/00093_00000_doc_shuffled.ds
242
- - data_token/as_fm3_omi2/finemath3-tokenized/00094_00000_doc_shuffled.ds
243
- - data_token/as_fm3_omi2/finemath3-tokenized/00095_00000_doc_shuffled.ds
244
- - data_token/as_fm3_omi2/finemath3-tokenized/00096_00000_doc_shuffled.ds
245
- - data_token/as_fm3_omi2/finemath3-tokenized/00097_00000_doc_shuffled.ds
246
- - data_token/as_fm3_omi2/finemath3-tokenized/00098_00000_doc_shuffled.ds
247
- - data_token/as_fm3_omi2/finemath3-tokenized/00099_00000_doc_shuffled.ds
248
- - data_token/as_fm3_omi2/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
249
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
250
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
251
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
252
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
253
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
254
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
255
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
256
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
257
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
258
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
259
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
260
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
261
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
262
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
263
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
264
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
265
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
266
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
267
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
268
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
269
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
270
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
271
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
272
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
273
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
274
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
275
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
276
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
277
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
278
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
279
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
280
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
281
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00032_00000_doc_shuffled.ds
282
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00033_00000_doc_shuffled.ds
283
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00034_00000_doc_shuffled.ds
284
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00035_00000_doc_shuffled.ds
285
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00036_00000_doc_shuffled.ds
286
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00037_00000_doc_shuffled.ds
287
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00038_00000_doc_shuffled.ds
288
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00039_00000_doc_shuffled.ds
289
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00040_00000_doc_shuffled.ds
290
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00041_00000_doc_shuffled.ds
291
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00042_00000_doc_shuffled.ds
292
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00043_00000_doc_shuffled.ds
293
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00044_00000_doc_shuffled.ds
294
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00045_00000_doc_shuffled.ds
295
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00046_00000_doc_shuffled.ds
296
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00047_00000_doc_shuffled.ds
297
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00048_00000_doc_shuffled.ds
298
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00049_00000_doc_shuffled.ds
299
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00050_00000_doc_shuffled.ds
300
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00051_00000_doc_shuffled.ds
301
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00052_00000_doc_shuffled.ds
302
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00053_00000_doc_shuffled.ds
303
- - data_token/as_fm3_omi2/openmathinstruct2-tokenized/00054_00000_doc_shuffled.ds
304
- - data_token/as_fm3_omi2/tinygsm-tokenized/00000_00000_doc_shuffled.ds
305
- - data_token/as_fm3_omi2/tinygsm-tokenized/00001_00000_doc_shuffled.ds
306
- - data_token/as_fm3_omi2/tinygsm-tokenized/00002_00000_doc_shuffled.ds
307
- - data_token/as_fm3_omi2/tinygsm-tokenized/00003_00000_doc_shuffled.ds
308
- - data_token/as_fm3_omi2/tinygsm-tokenized/00004_00000_doc_shuffled.ds
309
- - data_token/as_fm3_omi2/tinygsm-tokenized/00005_00000_doc_shuffled.ds
310
- - data_token/as_fm3_omi2/tinygsm-tokenized/00006_00000_doc_shuffled.ds
311
- - data_token/as_fm3_omi2/tinygsm-tokenized/00007_00000_doc_shuffled.ds
312
- - data_token/as_fm3_omi2/tinygsm-tokenized/00008_00000_doc_shuffled.ds
313
- - data_token/as_fm3_omi2/tinygsm-tokenized/00009_00000_doc_shuffled.ds
314
- - data_token/as_fm3_omi2/tinygsm-tokenized/00010_00000_doc_shuffled.ds
315
- - data_token/as_fm3_omi2/tinygsm-tokenized/00011_00000_doc_shuffled.ds
316
- - data_token/as_fm3_omi2/tinygsm-tokenized/00012_00000_doc_shuffled.ds
317
- - data_token/as_fm3_omi2/tinygsm-tokenized/00013_00000_doc_shuffled.ds
318
- - data_token/as_fm3_omi2/tinygsm-tokenized/00014_00000_doc_shuffled.ds
319
- - data_token/as_fm3_omi2/tinygsm-tokenized/00015_00000_doc_shuffled.ds
320
- - data_token/as_fm3_omi2/tinygsm-tokenized/00016_00000_doc_shuffled.ds
321
- memmap_dtype: uint16
322
- datasets: null
323
- label_mask_paths: null
324
- pad_direction: right
325
- generate_attention_mask: false
326
- generate_doc_lengths: false
327
- num_workers: 32
328
- drop_last: true
329
- pin_memory: true
330
- prefetch_factor: 8
331
- persistent_workers: true
332
- timeout: 0
333
- seed: null
334
- instance_filter: null
335
- custom_dataset: null
336
- restore_dataloader: true
337
- fast_forward_batches: null
338
- evaluators: []
339
- eval_interval: 5000
340
- tokenizer:
341
- identifier: meta-llama/Llama-2-7b-hf
342
- truncate_direction: right
343
- save_folder: checkpoints/OLMo-1B-as_fm3_omi2
344
- remote_save_folder: null
345
- canceled_check_interval: 6000
346
- save_interval: 3000
347
- save_interval_unsharded: 3000
348
- save_interval_ephemeral: null
349
- save_num_checkpoints_to_keep: -1
350
- save_num_unsharded_checkpoints_to_keep: -1
351
- save_overwrite: true
352
- force_save_unsharded: false
353
- no_pre_train_checkpoint: false
354
- load_path: step9000-unsharded
355
- load_path_sharded_checkpointer: null
356
- try_load_latest_save: false
357
- reset_optimizer_state: false
358
- reset_trainer_state: false
359
- sharded_checkpointer: torch_legacy
360
- new_style_checkpoints: null
361
- max_duration: 1ep
362
- global_train_batch_size: 512
363
- device_train_batch_size: 128
364
- device_train_microbatch_size: 16
365
- device_eval_batch_size: 16
366
- eval_subset_num_batches: -1
367
- eval_on_load: false
368
- device_train_grad_accum: 8
369
- max_grad_norm: 1.0
370
- max_grad_norm_ratio: null
371
- precision: amp_bf16
372
- wandb:
373
- project: olmo-debug
374
- entity: null
375
- group: null
376
- name: OLMo-1B-as_fm3_omi2
377
- tags:
378
- - watching
379
- log_artifacts: false
380
- rank_zero_only: true
381
- log_interval: 1
382
- speed_monitor:
383
- window_size: 20
384
- gpu_flops_available: null
385
- console_log_interval: 1
386
- gen1_gc_interval: 1
387
- compile: null
388
- distributed_strategy: fsdp
389
- fsdp:
390
- use_orig_params: true
391
- sharding_strategy: FULL_SHARD
392
- wrapping_strategy: null
393
- precision: mixed
394
- hybrid_sharding_num_model_replicas: null
395
- ddp:
396
- grad_sync_mode: batch
397
- find_unused_params: false
398
- single:
399
- device: auto
400
- softmax_auxiliary_loss: false
401
- auxiliary_loss_multiplier: 0.0001
402
- time_limit: null
403
- extra_steps_after_cancel: 10
404
- early_stopping_factor: null
405
- save_data_indices: true
406
- python_profiling: false
407
- torch_profiling: false
408
- stop_at: null
409
- stop_after: null
410
- activation_checkpointing: null
411
- fused_loss: null
412
- hf_datasets_cache_dir: null
413
- module_outputs_save_steps: null