ashkihotah committed on
Commit
fd3b89f
·
1 Parent(s): d6e6119

refactor(data augmentation): changed the tracking strategy of the data augmentation pipeline and its source code

Browse files
data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 810fc90fdb20cda56f1e20b589003f98
3
+ size: 20647734
4
+ hash: md5
5
+ path: gemini-2.5-flash.csv
data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 3bbf67ae914c8c59277683c671224433
3
+ size: 15235640
4
+ hash: md5
5
+ path: gemini-2.5-flash.csv
data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 08fda2e3a09e5ef4f2f1386e7fd160d8
3
+ size: 7017011
4
+ hash: md5
5
+ path: gemini-2.5-flash.csv
data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/easy.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 22b83d918fe8304d12861b9b42d6fc70
3
+ size: 607787
4
+ hash: md5
5
+ path: easy.csv
data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/hard.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 910e201f4bb25c88bcff7dce1ae51f0b
3
+ size: 2311768
4
+ hash: md5
5
+ path: hard.csv
data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/medium.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: dfd102bbce2ae8719ad33776272bf33e
3
+ size: 1621288
4
+ hash: md5
5
+ path: medium.csv
data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 4b65cf5eaa84c8868395727819e1501e
3
+ size: 8443153
4
+ hash: md5
5
+ path: gemini-2.5-flash.csv
data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/easy.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: e62e5ae07e4f2611280006dd0dba7961
3
+ size: 586850
4
+ hash: md5
5
+ path: easy.csv
data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/hard.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 467982c597b82fd5df60400027b80161
3
+ size: 2460914
4
+ hash: md5
5
+ path: hard.csv
data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/medium.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 8aaeb6866bce768ee5d64042814e9016
3
+ size: 1978713
4
+ hash: md5
5
+ path: medium.csv
data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 1f761ca5d6c85c8726a234a542184970
3
+ size: 519291
4
+ hash: md5
5
+ path: cfs_test.csv
data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 39f4dae2567b2a2c4e9e0edf2534bad7
3
+ size: 1969469
4
+ hash: md5
5
+ path: cfs_train.csv
data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: c8b5bdc4006fbbdb555c3236c9a6d462
3
+ size: 3454159
4
+ hash: md5
5
+ path: issues_test.csv
data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: f21545a0f19ff9fad6568549a5a02181
3
+ size: 3686102
4
+ hash: md5
5
+ path: issues_train.csv
dvc.lock DELETED
@@ -1,434 +0,0 @@
1
- schema: '2.0'
2
- stages:
3
- analyze_nlbse24:
4
- cmd: python syntetic_issue_report_data_generation/augmentation/analyze.py
5
- --input-csv
6
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
7
- --input-column issue --label-column label --context-file
8
- rsc/prompts/contexts/nlbse24.md --output-file
9
- data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
10
- --provider google --model gemini-2.5-flash
11
- deps:
12
- - path:
13
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
14
- hash: md5
15
- md5: f21545a0f19ff9fad6568549a5a02181
16
- size: 3686102
17
- - path: rsc/prompts/analize.md
18
- hash: md5
19
- md5: ccacb11a2da7672e990f389e6359e8d4
20
- size: 7832
21
- - path: rsc/prompts/contexts/nlbse24.md
22
- hash: md5
23
- md5: 50984698bf834f05052caf60e91e04dd
24
- size: 2454
25
- - path: syntetic_issue_report_data_generation/augmentation/analyze.py
26
- hash: md5
27
- md5: 5f0f30547aaf9eb63a5acf2ccaff49c4
28
- size: 10973
29
- outs:
30
- - path:
31
- data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
32
- hash: md5
33
- md5: 3bbf67ae914c8c59277683c671224433
34
- size: 15235640
35
- analyze_nasa_cfs:
36
- cmd: python syntetic_issue_report_data_generation/augmentation/analyze.py
37
- --input-csv
38
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
39
- --input-column issue --label-column label --context-file
40
- rsc/prompts/contexts/nasa-cfs.md --output-file
41
- data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
42
- --provider google --model gemini-2.5-flash
43
- deps:
44
- - path:
45
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
46
- hash: md5
47
- md5: 39f4dae2567b2a2c4e9e0edf2534bad7
48
- size: 1969469
49
- - path: rsc/prompts/analize.md
50
- hash: md5
51
- md5: ccacb11a2da7672e990f389e6359e8d4
52
- size: 7832
53
- - path: rsc/prompts/contexts/nasa-cfs.md
54
- hash: md5
55
- md5: 27fc32ef713da793f3d373b4c69b1f32
56
- size: 5515
57
- - path: syntetic_issue_report_data_generation/augmentation/analyze.py
58
- hash: md5
59
- md5: 5f0f30547aaf9eb63a5acf2ccaff49c4
60
- size: 10973
61
- outs:
62
- - path:
63
- data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
64
- hash: md5
65
- md5: 810fc90fdb20cda56f1e20b589003f98
66
- size: 20647734
67
- transform_nlbse24:
68
- cmd: python syntetic_issue_report_data_generation/augmentation/generate.py
69
- --input-csv
70
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
71
- --analysis-csv
72
- data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
73
- --input-column issue --label-column label --context-file
74
- rsc/prompts/contexts/nlbse24.md --method one_to_one_transform_all_3
75
- --output-file
76
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
77
- --provider google --model gemini-2.5-flash
78
- deps:
79
- - path:
80
- data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
81
- hash: md5
82
- md5: 3bbf67ae914c8c59277683c671224433
83
- size: 15235640
84
- - path:
85
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
86
- hash: md5
87
- md5: f21545a0f19ff9fad6568549a5a02181
88
- size: 3686102
89
- - path: rsc/prompts/contexts/nlbse24.md
90
- hash: md5
91
- md5: 50984698bf834f05052caf60e91e04dd
92
- size: 2454
93
- - path: rsc/prompts/transform/one-to-one-transform-all-3.md
94
- hash: md5
95
- md5: 12b1947ed4d4aead9942b9bbc7c49465
96
- size: 2546
97
- - path: syntetic_issue_report_data_generation/augmentation/analyze.py
98
- hash: md5
99
- md5: 5f0f30547aaf9eb63a5acf2ccaff49c4
100
- size: 10973
101
- - path: syntetic_issue_report_data_generation/augmentation/generate.py
102
- hash: md5
103
- md5: 06d0bae579db0fcb6aa2c35aec6f91c2
104
- size: 7960
105
- - path: syntetic_issue_report_data_generation/augmentation/utils.py
106
- hash: md5
107
- md5: b7205414ff7e3b58b05180e215b52ffa
108
- size: 1982
109
- outs:
110
- - path:
111
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
112
- hash: md5
113
- md5: 4b65cf5eaa84c8868395727819e1501e
114
- size: 8443153
115
- split_nlbse24_by_difficulty:
116
- cmd: python
117
- syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
118
- --file
119
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
120
- --output-dir
121
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash
122
- --original-dataset
123
- ./data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
124
- deps:
125
- - path:
126
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
127
- hash: md5
128
- md5: 4b65cf5eaa84c8868395727819e1501e
129
- size: 8443153
130
- - path:
131
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
132
- hash: md5
133
- md5: f21545a0f19ff9fad6568549a5a02181
134
- size: 3686102
135
- - path:
136
- syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
137
- hash: md5
138
- md5: e02f350ea800033d339b380ad2b4363b
139
- size: 2299
140
- outs:
141
- - path:
142
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/easy.csv
143
- hash: md5
144
- md5: e62e5ae07e4f2611280006dd0dba7961
145
- size: 586850
146
- - path:
147
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/hard.csv
148
- hash: md5
149
- md5: 467982c597b82fd5df60400027b80161
150
- size: 2460914
151
- - path:
152
- data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/medium.csv
153
- hash: md5
154
- md5: 8aaeb6866bce768ee5d64042814e9016
155
- size: 1978713
156
- transform_nasa_cfs:
157
- cmd: python syntetic_issue_report_data_generation/augmentation/generate.py
158
- --input-csv
159
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
160
- --analysis-csv
161
- data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
162
- --input-column issue --label-column label --context-file
163
- rsc/prompts/contexts/nasa-cfs.md --method one_to_one_transform_all_3
164
- --output-file
165
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
166
- --provider google --model gemini-2.5-flash
167
- deps:
168
- - path:
169
- data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
170
- hash: md5
171
- md5: 810fc90fdb20cda56f1e20b589003f98
172
- size: 20647734
173
- - path:
174
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
175
- hash: md5
176
- md5: 39f4dae2567b2a2c4e9e0edf2534bad7
177
- size: 1969469
178
- - path: rsc/prompts/contexts/nasa-cfs.md
179
- hash: md5
180
- md5: 27fc32ef713da793f3d373b4c69b1f32
181
- size: 5515
182
- - path: rsc/prompts/transform/one-to-one-transform-all-3.md
183
- hash: md5
184
- md5: 12b1947ed4d4aead9942b9bbc7c49465
185
- size: 2546
186
- - path: syntetic_issue_report_data_generation/augmentation/analyze.py
187
- hash: md5
188
- md5: 5f0f30547aaf9eb63a5acf2ccaff49c4
189
- size: 10973
190
- - path: syntetic_issue_report_data_generation/augmentation/generate.py
191
- hash: md5
192
- md5: 06d0bae579db0fcb6aa2c35aec6f91c2
193
- size: 7960
194
- - path: syntetic_issue_report_data_generation/augmentation/utils.py
195
- hash: md5
196
- md5: b7205414ff7e3b58b05180e215b52ffa
197
- size: 1982
198
- outs:
199
- - path:
200
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
201
- hash: md5
202
- md5: 08fda2e3a09e5ef4f2f1386e7fd160d8
203
- size: 7017011
204
- preprocess_nlbse24:
205
- cmd: python syntetic_issue_report_data_generation/augmentation/preprocess.py
206
- --dataset nlbse24_train --output-dir
207
- data/interim/issue-report-classification/pre-analysis
208
- deps:
209
- - path: data/raw/issue-report-classification/nlbse24/issues_train.csv
210
- hash: md5
211
- md5: 1a1a99191592d38b3ec54a51d8d673c1
212
- size: 3680400
213
- - path: syntetic_issue_report_data_generation/augmentation/preprocess.py
214
- hash: md5
215
- md5: 0fc37014f5a08ef1c290898110a67c6e
216
- size: 1574
217
- outs:
218
- - path:
219
- data/interim/issue-report-classification/pre-analysis/nlbse24_train.csv
220
- hash: md5
221
- md5: 1ca462f7b317236d8e540d953564ee73
222
- size: 3630886
223
- preprocess_nasa_cfs:
224
- cmd: python syntetic_issue_report_data_generation/augmentation/preprocess.py
225
- --dataset nasa_cfs_train --output-dir
226
- data/interim/issue-report-classification/pre-analysis
227
- deps:
228
- - path: data/raw/issue-report-classification/nasa/cfs_train.csv
229
- hash: md5
230
- md5: 01b5946e58649c0afa0613716c2259ef
231
- size: 1961092
232
- - path: syntetic_issue_report_data_generation/augmentation/preprocess.py
233
- hash: md5
234
- md5: 0fc37014f5a08ef1c290898110a67c6e
235
- size: 1574
236
- outs:
237
- - path:
238
- data/interim/issue-report-classification/pre-analysis/nasa_cfs_train.csv
239
- hash: md5
240
- md5: 5094cc3998129fc469a6ca3aac967694
241
- size: 1883320
242
- split_nasa_cfs_by_difficulty:
243
- cmd: python
244
- syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
245
- --file
246
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
247
- --output-dir
248
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash
249
- --original-dataset
250
- ./data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
251
- deps:
252
- - path:
253
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
254
- hash: md5
255
- md5: 08fda2e3a09e5ef4f2f1386e7fd160d8
256
- size: 7017011
257
- - path:
258
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
259
- hash: md5
260
- md5: 39f4dae2567b2a2c4e9e0edf2534bad7
261
- size: 1969469
262
- - path:
263
- syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
264
- hash: md5
265
- md5: e02f350ea800033d339b380ad2b4363b
266
- size: 2299
267
- outs:
268
- - path:
269
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/easy.csv
270
- hash: md5
271
- md5: 22b83d918fe8304d12861b9b42d6fc70
272
- size: 607787
273
- - path:
274
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/hard.csv
275
- hash: md5
276
- md5: 910e201f4bb25c88bcff7dce1ae51f0b
277
- size: 2311768
278
- - path:
279
- data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/medium.csv
280
- hash: md5
281
- md5: dfd102bbce2ae8719ad33776272bf33e
282
- size: 1621288
283
- soft_clean:
284
- cmd: echo "hi"
285
- deps:
286
- - path: data/raw/issue-report-classification/nasa/cfs_test.csv
287
- hash: md5
288
- md5: 25fc494a76ff8b7e0bde317bc21aee11
289
- size: 517219
290
- - path: data/raw/issue-report-classification/nasa/cfs_train.csv
291
- hash: md5
292
- md5: 01b5946e58649c0afa0613716c2259ef
293
- size: 1961092
294
- - path: data/raw/issue-report-classification/nasa/fprime_test.csv
295
- hash: md5
296
- md5: 1765403a9cda44565be66f5099548285
297
- size: 154541
298
- - path: data/raw/issue-report-classification/nasa/fprime_train.csv
299
- hash: md5
300
- md5: e7dbfad07b298d9e69dd438a8d3cbff4
301
- size: 638050
302
- - path: data/raw/issue-report-classification/nasa/nasa_test_sample.csv
303
- hash: md5
304
- md5: c9574f535c664cce4c8e58b8745b3fa6
305
- size: 7844
306
- - path: data/raw/issue-report-classification/nasa/nasa_train_sample.csv
307
- hash: md5
308
- md5: 2c3199c3c33df94c3ef8363d04ae4d97
309
- size: 15962
310
- - path: data/raw/issue-report-classification/nlbse24/issues_test.csv
311
- hash: md5
312
- md5: 44e171e338c727cbc93bb8240e88aa1e
313
- size: 3449390
314
- - path: data/raw/issue-report-classification/nlbse24/issues_train.csv
315
- hash: md5
316
- md5: 1a1a99191592d38b3ec54a51d8d673c1
317
- size: 3680400
318
- outs:
319
- - path:
320
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv
321
- hash: md5
322
- md5: 1d7c921b78807befb300f877e5ccad23
323
- size: 517765
324
- - path:
325
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
326
- hash: md5
327
- md5: 919adb09810a708c708007585ccc5763
328
- size: 1963260
329
- - path:
330
- data/interim/issue-report-classification/soft-cleaned/nasa/fprime_test.csv
331
- hash: md5
332
- md5: 8786472104c9aa23b60381cdc9f97861
333
- size: 154611
334
- - path:
335
- data/interim/issue-report-classification/soft-cleaned/nasa/fprime_train.csv
336
- hash: md5
337
- md5: a4852e1657d3e84d602eacefe233833e
338
- size: 638585
339
- - path:
340
- data/interim/issue-report-classification/soft-cleaned/nasa/nasa_test_sample.csv
341
- hash: md5
342
- md5: e8a9304e5632243977a9ccb38df22fee
343
- size: 7855
344
- - path:
345
- data/interim/issue-report-classification/soft-cleaned/nasa/nasa_train_sample.csv
346
- hash: md5
347
- md5: 5bccd9369e16384130509c166cd3cbdf
348
- size: 15973
349
- - path:
350
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv
351
- hash: md5
352
- md5: 5083327fc87e50cc1ef926c01527ed07
353
- size: 3449960
354
- - path:
355
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
356
- hash: md5
357
- md5: 6124c859150e47b0940346880f55a2f6
358
- size: 3681901
359
- preprocess_nlbse24_train:
360
- cmd: python syntetic_issue_report_data_generation/preprocess.py --dataset
361
- nlbse24_train --output
362
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
363
- deps:
364
- - path: data/raw/issue-report-classification/nlbse24/issues_train.csv
365
- hash: md5
366
- md5: 1a1a99191592d38b3ec54a51d8d673c1
367
- size: 3680400
368
- - path: syntetic_issue_report_data_generation/preprocess.py
369
- hash: md5
370
- md5: ea154c395ecb7bd6f18f0fdf99b8e532
371
- size: 4973
372
- outs:
373
- - path:
374
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
375
- hash: md5
376
- md5: f21545a0f19ff9fad6568549a5a02181
377
- size: 3686102
378
- preprocess_nlbse24_test:
379
- cmd: python syntetic_issue_report_data_generation/preprocess.py --dataset
380
- nlbse24_test --output
381
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv
382
- deps:
383
- - path: data/raw/issue-report-classification/nlbse24/issues_test.csv
384
- hash: md5
385
- md5: 44e171e338c727cbc93bb8240e88aa1e
386
- size: 3449390
387
- - path: syntetic_issue_report_data_generation/preprocess.py
388
- hash: md5
389
- md5: ea154c395ecb7bd6f18f0fdf99b8e532
390
- size: 4973
391
- outs:
392
- - path:
393
- data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv
394
- hash: md5
395
- md5: c8b5bdc4006fbbdb555c3236c9a6d462
396
- size: 3454159
397
- preprocess_nasa_cfs_train:
398
- cmd: python syntetic_issue_report_data_generation/preprocess.py --dataset
399
- nasa_cfs_train --output
400
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
401
- deps:
402
- - path: data/raw/issue-report-classification/nasa/cfs_train.csv
403
- hash: md5
404
- md5: 01b5946e58649c0afa0613716c2259ef
405
- size: 1961092
406
- - path: syntetic_issue_report_data_generation/preprocess.py
407
- hash: md5
408
- md5: ea154c395ecb7bd6f18f0fdf99b8e532
409
- size: 4973
410
- outs:
411
- - path:
412
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
413
- hash: md5
414
- md5: 39f4dae2567b2a2c4e9e0edf2534bad7
415
- size: 1969469
416
- preprocess_nasa_cfs_test:
417
- cmd: python syntetic_issue_report_data_generation/preprocess.py --dataset
418
- nasa_cfs_test --output
419
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv
420
- deps:
421
- - path: data/raw/issue-report-classification/nasa/cfs_test.csv
422
- hash: md5
423
- md5: 25fc494a76ff8b7e0bde317bc21aee11
424
- size: 517219
425
- - path: syntetic_issue_report_data_generation/preprocess.py
426
- hash: md5
427
- md5: ea154c395ecb7bd6f18f0fdf99b8e532
428
- size: 4973
429
- outs:
430
- - path:
431
- data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv
432
- hash: md5
433
- md5: 1f761ca5d6c85c8726a234a542184970
434
- size: 519291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dvc.yaml DELETED
@@ -1,177 +0,0 @@
1
- stages:
2
-
3
- # ==================== PREPROCESSING STAGES ====================
4
-
5
- preprocess_nlbse24_train:
6
- cmd: >-
7
- python syntetic_issue_report_data_generation/preprocess.py
8
- --dataset nlbse24_train
9
- --output data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
10
- deps:
11
- - data/raw/issue-report-classification/nlbse24/issues_train.csv
12
- - syntetic_issue_report_data_generation/preprocess.py
13
- outs:
14
- - data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
15
-
16
- preprocess_nlbse24_test:
17
- cmd: >-
18
- python syntetic_issue_report_data_generation/preprocess.py
19
- --dataset nlbse24_test
20
- --output data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv
21
- deps:
22
- - data/raw/issue-report-classification/nlbse24/issues_test.csv
23
- - syntetic_issue_report_data_generation/preprocess.py
24
- outs:
25
- - data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv
26
-
27
- preprocess_nasa_cfs_train:
28
- cmd: >-
29
- python syntetic_issue_report_data_generation/preprocess.py
30
- --dataset nasa_cfs_train
31
- --output data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
32
- deps:
33
- - data/raw/issue-report-classification/nasa/cfs_train.csv
34
- - syntetic_issue_report_data_generation/preprocess.py
35
- outs:
36
- - data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
37
-
38
- preprocess_nasa_cfs_test:
39
- cmd: >-
40
- python syntetic_issue_report_data_generation/preprocess.py
41
- --dataset nasa_cfs_test
42
- --output data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv
43
- deps:
44
- - data/raw/issue-report-classification/nasa/cfs_test.csv
45
- - syntetic_issue_report_data_generation/preprocess.py
46
- outs:
47
- - data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv
48
-
49
- # ==================== ANALYSIS STAGES ====================
50
-
51
- analyze_nlbse24:
52
- cmd: >-
53
- python syntetic_issue_report_data_generation/augmentation/analyze.py
54
- --input-csv data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
55
- --input-column issue
56
- --label-column label
57
- --context-file rsc/prompts/contexts/nlbse24.md
58
- --output-file data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
59
- --provider google
60
- --model gemini-2.5-flash
61
- deps:
62
- - data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
63
- - rsc/prompts/contexts/nlbse24.md
64
- - rsc/prompts/analize.md
65
- - syntetic_issue_report_data_generation/augmentation/analyze.py
66
- # - syntetic_issue_report_data_generation/augmentation/utils.py
67
- outs:
68
- - data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv:
69
- persist: true
70
-
71
- analyze_nasa_cfs:
72
- cmd: >-
73
- python syntetic_issue_report_data_generation/augmentation/analyze.py
74
- --input-csv data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
75
- --input-column issue
76
- --label-column label
77
- --context-file rsc/prompts/contexts/nasa-cfs.md
78
- --output-file data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
79
- --provider google
80
- --model gemini-2.5-flash
81
- deps:
82
- - data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
83
- - rsc/prompts/contexts/nasa-cfs.md
84
- - rsc/prompts/analize.md
85
- - syntetic_issue_report_data_generation/augmentation/analyze.py
86
- # - syntetic_issue_report_data_generation/augmentation/utils.py
87
- outs:
88
- - data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv:
89
- persist: true
90
-
91
- # ==================== TRANSFORM STAGES ====================
92
-
93
- transform_nlbse24:
94
- cmd: >-
95
- python syntetic_issue_report_data_generation/augmentation/generate.py
96
- --input-csv data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
97
- --analysis-csv data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
98
- --input-column issue
99
- --label-column label
100
- --context-file rsc/prompts/contexts/nlbse24.md
101
- --method one_to_one_transform_all_3
102
- --output-file data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
103
- --provider google
104
- --model gemini-2.5-flash
105
- deps:
106
- - data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
107
- - data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv
108
- - rsc/prompts/contexts/nlbse24.md
109
- - rsc/prompts/transform/one-to-one-transform-all-3.md
110
- - syntetic_issue_report_data_generation/augmentation/generate.py
111
- # - syntetic_issue_report_data_generation/augmentation/analyze.py
112
- # - syntetic_issue_report_data_generation/augmentation/utils.py
113
- outs:
114
- - data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv:
115
- persist: true
116
-
117
- transform_nasa_cfs:
118
- cmd: >-
119
- python syntetic_issue_report_data_generation/augmentation/generate.py
120
- --input-csv data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
121
- --analysis-csv data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
122
- --input-column issue
123
- --label-column label
124
- --context-file rsc/prompts/contexts/nasa-cfs.md
125
- --method one_to_one_transform_all_3
126
- --output-file data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
127
- --provider google
128
- --model gemini-2.5-flash
129
- deps:
130
- - data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
131
- - data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv
132
- - rsc/prompts/contexts/nasa-cfs.md
133
- - rsc/prompts/transform/one-to-one-transform-all-3.md
134
- - syntetic_issue_report_data_generation/augmentation/generate.py
135
- # - syntetic_issue_report_data_generation/augmentation/analyze.py
136
- # - syntetic_issue_report_data_generation/augmentation/utils.py
137
- outs:
138
- - data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv:
139
- persist: true
140
-
141
- # ==================== SPLIT BY DIFFICULTY STAGES ====================
142
-
143
- split_nlbse24_by_difficulty:
144
- cmd: >-
145
- python syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
146
- --file data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
147
- --output-dir data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash
148
- --original-dataset ./data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
149
- deps:
150
- - data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv
151
- - syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
152
- - data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv
153
- outs:
154
- - data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/easy.csv:
155
- persist: true
156
- - data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/medium.csv:
157
- persist: true
158
- - data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash/hard.csv:
159
- persist: true
160
-
161
- split_nasa_cfs_by_difficulty:
162
- cmd: >-
163
- python syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
164
- --file data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
165
- --output-dir data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash
166
- --original-dataset ./data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
167
- deps:
168
- - data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv
169
- - syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py
170
- - data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv
171
- outs:
172
- - data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/easy.csv:
173
- persist: true
174
- - data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/medium.csv:
175
- persist: true
176
- - data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash/hard.csv:
177
- persist: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipeline.ps1 ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DVC Pipeline Script for Capibara
2
+ # This script runs the entire data processing pipeline
3
+
4
param(
    [string]$Stage = "all",
    [switch]$Help
)

# Usage text printed for -Help. A literal (single-quoted) here-string is used
# because the text contains nothing that needs expansion.
$usageText = @'
Capibara Pipeline Script
========================

Usage: .\pipeline.ps1 [-Stage <stage_name>] [-Help]

Stages:
all - Run all stages (default)
preprocess - Run all preprocessing stages
preprocess_nlbse24_train - Preprocess NLBSE24 training data
preprocess_nlbse24_test - Preprocess NLBSE24 test data
preprocess_nasa_cfs_train - Preprocess NASA CFS training data
preprocess_nasa_cfs_test - Preprocess NASA CFS test data
analyze - Run all analysis stages
analyze_nlbse24 - Analyze NLBSE24 data
analyze_nasa_cfs - Analyze NASA CFS data
transform - Run all transform stages
transform_nlbse24 - Transform NLBSE24 data
transform_nasa_cfs - Transform NASA CFS data
split - Run all split by difficulty stages
split_nlbse24_by_difficulty - Split NLBSE24 by difficulty
split_nasa_cfs_by_difficulty - Split NASA CFS by difficulty

Examples:
.\pipeline.ps1 # Run all stages
.\pipeline.ps1 -Stage preprocess # Run only preprocessing
.\pipeline.ps1 -Stage analyze_nlbse24 # Run specific stage
'@

# -Help only prints the usage text and exits successfully; no stage is run.
if ($Help) {
    Write-Host $usageText
    exit 0
}

# Treat cmdlet errors as terminating so they can be caught by try/catch.
$ErrorActionPreference = "Stop"
43
+
44
+ # Helper functions for colored console output
45
function Write-StageHeader {
    # Prints $Message as a cyan banner line with an empty line before and after it.
    param([string]$Message)

    $banner = "==================== $Message ===================="
    Write-Host ""
    Write-Host $banner -ForegroundColor Cyan
    Write-Host ""
}
51
+
52
function Write-StageInfo {
    # Prints $Message in green, prefixed with "[INFO] ".
    param([string]$Message)

    $line = "[INFO] $Message"
    Write-Host $line -ForegroundColor Green
}
56
+
57
function Write-StageError {
    # Prints $Message in red, prefixed with "[ERROR] ".
    param([string]$Message)

    $line = "[ERROR] $Message"
    Write-Host $line -ForegroundColor Red
}
61
+
62
+ # ==================== PREPROCESSING STAGES ====================
63
+
64
function Invoke-PreprocessNlbse24Train {
    # Runs preprocess.py for the nlbse24_train dataset and throws when the
    # Python process exits with a non-zero code.
    Write-StageInfo "Running: preprocess_nlbse24_train"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/preprocess.py'
        '--dataset', 'nlbse24_train'
        '--output', 'data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "preprocess_nlbse24_train failed"
    }
}
71
+
72
function Invoke-PreprocessNlbse24Test {
    # Runs preprocess.py for the nlbse24_test dataset and throws when the
    # Python process exits with a non-zero code.
    Write-StageInfo "Running: preprocess_nlbse24_test"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/preprocess.py'
        '--dataset', 'nlbse24_test'
        '--output', 'data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_test.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "preprocess_nlbse24_test failed"
    }
}
79
+
80
function Invoke-PreprocessNasaCfsTrain {
    # Runs preprocess.py for the nasa_cfs_train dataset and throws when the
    # Python process exits with a non-zero code.
    Write-StageInfo "Running: preprocess_nasa_cfs_train"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/preprocess.py'
        '--dataset', 'nasa_cfs_train'
        '--output', 'data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "preprocess_nasa_cfs_train failed"
    }
}
87
+
88
function Invoke-PreprocessNasaCfsTest {
    # Runs preprocess.py for the nasa_cfs_test dataset and throws when the
    # Python process exits with a non-zero code.
    Write-StageInfo "Running: preprocess_nasa_cfs_test"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/preprocess.py'
        '--dataset', 'nasa_cfs_test'
        '--output', 'data/interim/issue-report-classification/soft-cleaned/nasa/cfs_test.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "preprocess_nasa_cfs_test failed"
    }
}
95
+
96
function Invoke-AllPreprocess {
    # Prints the preprocessing banner, then runs the four preprocessing stages
    # in order. A throw from any stage stops the remaining ones.
    Write-StageHeader "PREPROCESSING STAGES"

    $steps = @(
        'Invoke-PreprocessNlbse24Train'
        'Invoke-PreprocessNlbse24Test'
        'Invoke-PreprocessNasaCfsTrain'
        'Invoke-PreprocessNasaCfsTest'
    )
    foreach ($step in $steps) {
        & $step
    }
}
103
+
104
+ # ==================== ANALYSIS STAGES ====================
105
+
106
function Invoke-AnalyzeNlbse24 {
    # Runs augmentation/analyze.py on the soft-cleaned NLBSE24 training CSV
    # and throws when the Python process exits with a non-zero code.
    Write-StageInfo "Running: analyze_nlbse24"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/analyze.py'
        '--input-csv', 'data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv'
        '--input-column', 'issue'
        '--label-column', 'label'
        '--context-file', 'rsc/prompts/contexts/nlbse24.md'
        '--output-file', 'data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv'
        '--provider', 'google'
        '--model', 'gemini-2.5-flash'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "analyze_nlbse24 failed"
    }
}
118
+
119
function Invoke-AnalyzeNasaCfs {
    # Runs augmentation/analyze.py on the soft-cleaned NASA CFS training CSV
    # and throws when the Python process exits with a non-zero code.
    Write-StageInfo "Running: analyze_nasa_cfs"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/analyze.py'
        '--input-csv', 'data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv'
        '--input-column', 'issue'
        '--label-column', 'label'
        '--context-file', 'rsc/prompts/contexts/nasa-cfs.md'
        '--output-file', 'data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv'
        '--provider', 'google'
        '--model', 'gemini-2.5-flash'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "analyze_nasa_cfs failed"
    }
}
131
+
132
function Invoke-AllAnalyze {
    # Prints the analysis banner, then runs both analysis stages in order.
    # A throw from the first stage prevents the second from running.
    Write-StageHeader "ANALYSIS STAGES"

    foreach ($step in 'Invoke-AnalyzeNlbse24', 'Invoke-AnalyzeNasaCfs') {
        & $step
    }
}
137
+
138
+ # ==================== TRANSFORM STAGES ====================
139
+
140
function Invoke-TransformNlbse24 {
    # Runs augmentation/generate.py for NLBSE24, giving it the soft-cleaned
    # training CSV and the post-analysis CSV, and throws when the Python
    # process exits with a non-zero code.
    Write-StageInfo "Running: transform_nlbse24"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/generate.py'
        '--input-csv', 'data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv'
        '--analysis-csv', 'data/interim/issue-report-classification/post-analysis/nlbse24/gemini-2.5-flash.csv'
        '--input-column', 'issue'
        '--label-column', 'label'
        '--context-file', 'rsc/prompts/contexts/nlbse24.md'
        '--method', 'one_to_one_transform_all_3'
        '--output-file', 'data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv'
        '--provider', 'google'
        '--model', 'gemini-2.5-flash'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "transform_nlbse24 failed"
    }
}
154
+
155
function Invoke-TransformNasaCfs {
    # Runs augmentation/generate.py for NASA CFS, giving it the soft-cleaned
    # training CSV and the post-analysis CSV, and throws when the Python
    # process exits with a non-zero code.
    Write-StageInfo "Running: transform_nasa_cfs"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/generate.py'
        '--input-csv', 'data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv'
        '--analysis-csv', 'data/interim/issue-report-classification/post-analysis/nasa-cfs/gemini-2.5-flash.csv'
        '--input-column', 'issue'
        '--label-column', 'label'
        '--context-file', 'rsc/prompts/contexts/nasa-cfs.md'
        '--method', 'one_to_one_transform_all_3'
        '--output-file', 'data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv'
        '--provider', 'google'
        '--model', 'gemini-2.5-flash'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "transform_nasa_cfs failed"
    }
}
169
+
170
function Invoke-AllTransform {
    # Prints the transform banner, then runs both transform stages in order.
    # A throw from the first stage prevents the second from running.
    Write-StageHeader "TRANSFORM STAGES"

    foreach ($step in 'Invoke-TransformNlbse24', 'Invoke-TransformNasaCfs') {
        & $step
    }
}
175
+
176
+ # ==================== SPLIT BY DIFFICULTY STAGES ====================
177
+
178
function Invoke-SplitNlbse24ByDifficulty {
    # Runs augmentation/split_by_difficulty.py on the NLBSE24 post-transform
    # CSV and throws when the Python process exits with a non-zero code.
    Write-StageInfo "Running: split_nlbse24_by_difficulty"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py'
        '--file', 'data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash.csv'
        '--output-dir', 'data/interim/issue-report-classification/post-transform/nlbse24/gemini-2.5-flash'
        '--original-dataset', './data/interim/issue-report-classification/soft-cleaned/nlbse24/issues_train.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "split_nlbse24_by_difficulty failed"
    }
}
186
+
187
function Invoke-SplitNasaCfsByDifficulty {
    # Runs augmentation/split_by_difficulty.py on the NASA CFS post-transform
    # CSV and throws when the Python process exits with a non-zero code.
    Write-StageInfo "Running: split_nasa_cfs_by_difficulty"

    # Script path followed by its command-line options, passed via splatting.
    $pythonArgs = @(
        'syntetic_issue_report_data_generation/augmentation/split_by_difficulty.py'
        '--file', 'data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash.csv'
        '--output-dir', 'data/interim/issue-report-classification/post-transform/nasa-cfs/gemini-2.5-flash'
        '--original-dataset', './data/interim/issue-report-classification/soft-cleaned/nasa/cfs_train.csv'
    )
    & python @pythonArgs

    if ($LASTEXITCODE -ne 0) {
        throw "split_nasa_cfs_by_difficulty failed"
    }
}
195
+
196
function Invoke-AllSplit {
    # Prints the split banner, then runs both split-by-difficulty stages in
    # order. A throw from the first stage prevents the second from running.
    Write-StageHeader "SPLIT BY DIFFICULTY STAGES"

    foreach ($step in 'Invoke-SplitNlbse24ByDifficulty', 'Invoke-SplitNasaCfsByDifficulty') {
        & $step
    }
}
201
+
202
+ # ==================== MAIN EXECUTION ====================
203
+
204
function Invoke-AllStages {
    # Runs the four stage groups in pipeline order: preprocess, analyze,
    # transform, split. A throw from any group stops the remaining ones.
    $groups = @(
        'Invoke-AllPreprocess'
        'Invoke-AllAnalyze'
        'Invoke-AllTransform'
        'Invoke-AllSplit'
    )
    foreach ($group in $groups) {
        & $group
    }
}
210
+
211
+ # Main execution block
212
# Dispatches -Stage to its handler and reports overall success or failure.
#
# Every stage command uses relative paths (data/..., rsc/...,
# syntetic_issue_report_data_generation/...), which resolve against the current
# location. The location is therefore pinned to the folder containing this
# script for the duration of the run and restored afterwards.
# NOTE(review): assumes this script sits at the repository root, as its
# relative paths suggest - confirm.
#
# Exit code: 1 for an unknown stage or any failed stage; otherwise the script
# ends normally without calling exit.
$exitCode = 0
$locationPushed = $false

try {
    # $PSScriptRoot is empty when the code is not run from a script file; in
    # that case the caller's location is left untouched.
    if ($PSScriptRoot) {
        Push-Location -LiteralPath $PSScriptRoot
        $locationPushed = $true
    }

    Write-Host "Capibara Pipeline" -ForegroundColor Yellow
    Write-Host "=================" -ForegroundColor Yellow
    Write-Host "Stage: $Stage" -ForegroundColor Yellow

    switch ($Stage.ToLower()) {
        "all" { Invoke-AllStages }

        # Preprocessing stages
        "preprocess" { Invoke-AllPreprocess }
        "preprocess_nlbse24_train" { Invoke-PreprocessNlbse24Train }
        "preprocess_nlbse24_test" { Invoke-PreprocessNlbse24Test }
        "preprocess_nasa_cfs_train" { Invoke-PreprocessNasaCfsTrain }
        "preprocess_nasa_cfs_test" { Invoke-PreprocessNasaCfsTest }

        # Analysis stages
        "analyze" { Invoke-AllAnalyze }
        "analyze_nlbse24" { Invoke-AnalyzeNlbse24 }
        "analyze_nasa_cfs" { Invoke-AnalyzeNasaCfs }

        # Transform stages
        "transform" { Invoke-AllTransform }
        "transform_nlbse24" { Invoke-TransformNlbse24 }
        "transform_nasa_cfs" { Invoke-TransformNasaCfs }

        # Split by difficulty stages
        "split" { Invoke-AllSplit }
        "split_nlbse24_by_difficulty" { Invoke-SplitNlbse24ByDifficulty }
        "split_nasa_cfs_by_difficulty" { Invoke-SplitNasaCfsByDifficulty }

        default {
            Write-StageError "Unknown stage: $Stage"
            Write-Host "Use -Help to see available stages"
            # Recorded instead of exiting here so the location is restored first.
            $exitCode = 1
        }
    }

    # The success banner is only printed when a known stage ran to completion.
    if ($exitCode -eq 0) {
        Write-Host ""
        Write-Host "==================== PIPELINE COMPLETED ====================" -ForegroundColor Green
        Write-Host ""
    }

} catch {
    Write-StageError $_.Exception.Message
    Write-Host ""
    Write-Host "==================== PIPELINE FAILED ====================" -ForegroundColor Red
    Write-Host ""
    $exitCode = 1
} finally {
    # Hand the caller back the location they started from.
    if ($locationPushed) {
        Pop-Location
    }
}

if ($exitCode -ne 0) {
    exit $exitCode
}
syntetic_issue_report_data_generation/config.py CHANGED
@@ -11,7 +11,7 @@ PROJ_ROOT = Path(__file__).resolve().parents[1]
11
  DATA_DIR = PROJ_ROOT / "data"
12
  RAW_DATA_DIR = DATA_DIR / "raw"
13
  INTERIM_DATA_DIR = DATA_DIR / "interim"
14
- SOFT_CLEANED_DATA_DIR = DATA_DIR / "soft_cleaned"
15
  PROCESSED_DATA_DIR = DATA_DIR / "processed"
16
  EXTERNAL_DATA_DIR = DATA_DIR / "external"
17
 
 
11
  DATA_DIR = PROJ_ROOT / "data"
12
  RAW_DATA_DIR = DATA_DIR / "raw"
13
  INTERIM_DATA_DIR = DATA_DIR / "interim"
14
+ SOFT_CLEANED_DATA_DIR = INTERIM_DATA_DIR / "soft-cleaned"
15
  PROCESSED_DATA_DIR = DATA_DIR / "processed"
16
  EXTERNAL_DATA_DIR = DATA_DIR / "external"
17