Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ For data ordering, we devise **Folding Ordering (FO)** method, which addresses i
## 📢 News and Updates

Done
- [x] 2026/02/28: 💥 The **Data Ordering** module is officially integrated into DELT, supporting various data organization strategies including **Folding**, **Shuffle**, **Sorting**, **Zig-zag**, **Segment**, **Stair**, and **Saw Ordering**.
- [x] 2026/01/05: 💥Our paper **"Demystifying Data Organization for Enhanced LLM Training"** was submitted to ACL ARR January 2026.
- [x] 2025/06/28: 💥The [Arxiv paper](https://arxiv.org/abs/2506.21545) released.
- [x] 2025/08/31: 💥The DELT code released for pre-training on general domain.

Expand Down Expand Up @@ -137,7 +139,7 @@ bash data_selection/entry.sh $INPUT_DATA_PATH $OUTPUT_DATA_PATH $METHOD $CONFIG_
<details open>
<summary>Data Ordering</summary>

Existing ordering method: **Folding Ordering (FO)** (`folding`), Shuffle (`shuffle`), and Sorting (`sorting`).
Existing ordering methods: **Folding Ordering (FO)** (`folding`), Shuffle (`shuffle`), Sorting (`sorting`), Zig-zag Ordering (`zigzag`), Segment Ordering (`segment`), Stair Ordering (`str`), and Saw Ordering (`saw`).

```bash
bash data_ordering/entry.sh $INPUT_DATA_PATH $OUTPUT_DATA_PATH $METHOD $CONFIG_PATH
Expand Down
2 changes: 2 additions & 0 deletions data_ordering/config/folding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ description: Config of folding method in data ordering.

score_field: score
folding_layer: 3
window_size: 100000
seed: 42
13 changes: 13 additions & 0 deletions data_ordering/config/saw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: saw
version: 1.0
description: Config of saw (hybrid sorting/folding) method in data ordering.


score_field: score
ascending: True
reverse_even_layers: True
folding_layer: 3
num_sections: 3
folding_ratio: 0.1
window_size: 100000
seed: 42
13 changes: 13 additions & 0 deletions data_ordering/config/segment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: segment
version: 1.0
description: Config of segment-based ordering method.


score_field: score

x_pct: 30
y_pct: 30
front_is_high: false
back_is_high: true

seed: 42
2 changes: 1 addition & 1 deletion data_ordering/config/shuffle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ version: 1.0
description: Config of shuffle method in data ordering.

score_field: score
seed: 10
seed: 42
8 changes: 6 additions & 2 deletions data_ordering/config/sorting.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
name: sorting
version: 1.0
description: Config of sorting method in data ordering.
version: 1.1
description: Config of sorting method with local window shuffling.

score_field: score
ascending: true
use_gumbel: true
temperature: 0
window_size: 100000
seed: 42
13 changes: 13 additions & 0 deletions data_ordering/config/str.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: str
version: 1.0
description: Config of stair (hybrid sorting/folding) method in data ordering.


score_field: score
ascending: True
reverse_even_layers: False
folding_layer: 2
num_sections: 3
folding_ratio: 0.1
window_size: 100000
seed: 42
10 changes: 10 additions & 0 deletions data_ordering/config/zigzag.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: zigzag
version: 2.0
description: Config of zigzag method in data ordering.

score_field: score
zigzag_layer: 2
use_gumbel: true
temperature: 1000
seed: 42
window_size: 100000
44 changes: 42 additions & 2 deletions data_ordering/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
import shuffle
import sorting
import folding
import zigzag
import segment
import str
import saw
from utils import load_yaml, load_jsonl, add_args, write_jsonl


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Data ordering.")
parser.add_argument("--input_data_path", type=str, help="Path to the input .jsonl file.")
parser.add_argument("--output_data_path", type=str, help="Path to the output .jsonl file.")
parser.add_argument("--method", type=str, choices=["shuffle", "sorting", "folding"], default="folding",
help="Ordering method: 'shuffle', 'sorting', and 'folding'. Defaults to 'folding'.")
parser.add_argument("--method", type=str, choices=["shuffle", "sorting", "folding", "zigzag", "segment", "str", "saw"], default="folding",
help="Ordering method: 'shuffle', 'sorting', and 'folding','zigzag','segment','str','saw'. Defaults to 'folding'.")
parser.add_argument("--config_path", type=str, default="./config/folding.yaml", help="Config file for additional parameters (YAML format).")

args = parser.parse_args()
Expand All @@ -33,10 +37,46 @@
if args.method == "sorting":
out_data = sorting.order(in_data, args)
print(f" Ascending: {args.ascending}")
print(f" Temperature: {args.temperature}")
print(f" Use gumbel: {args.use_gumbel}")
print(f" Window size: {args.window_size}")

if args.method == "folding":
out_data = folding.order(in_data, args)
print(f" Folding layer: {args.folding_layer}")
print(f" Window size: {args.window_size}")

if args.method == "zigzag":
out_data = zigzag.order(in_data, args)
print(f" Zigzag layer: {args.zigzag_layer}")
print(f" Temperature: {args.temperature}")
print(f" Use gumbel: {args.use_gumbel}")
print(f" Window size: {args.window_size}")

if args.method == "segment":
out_data = segment.order(in_data, args)
print(f" Front percentage: {args.x_pct}%")
print(f" Back percentage: {args.y_pct}%")
print(f" Front is high: {args.front_is_high}")
print(f" Back is high: {args.back_is_high}")
if hasattr(args, 'seed'):
print(f" Random seed: {args.seed}")

if args.method == "str":
out_data = str.order(in_data, args)
print(f" Global Ascending: {args.ascending}")
print(f" Num sections: {args.num_sections}")
print(f" Folding ratio: {args.folding_ratio}")
print(f" Folding layer (in section): {args.folding_layer}")
print(f" Window size: {args.window_size}")

if args.method == "saw":
out_data = saw.order(in_data, args)
print(f" Global Ascending: {args.ascending}")
print(f" Num sections: {args.num_sections}")
print(f" Folding ratio: {args.folding_ratio}")
print(f" Folding layer (in section): {args.folding_layer}")
print(f" Window size: {args.window_size}")


write_jsonl(args.output_data_path, out_data)
54 changes: 52 additions & 2 deletions data_ordering/folding.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,62 @@
import numpy as np

def window_based_shuffle(data, window_size, seed=42):
    """
    Jittering Ordering: shuffle items only inside consecutive fixed-size
    windows, so the sequence keeps its coarse order but is random locally.

    Args:
        data: Input list of items.
        window_size: Size of each local shuffle window. A value of 1 or less
            disables shuffling and the input is returned as-is.
        seed: Seed for the NumPy random state shared by all windows.

    Returns:
        list: The re-ordered items (the input object itself when shuffling
        is disabled).
    """
    if window_size <= 1:
        return data

    total = len(data)
    generator = np.random.RandomState(seed)
    jittered = []

    start = 0
    while start < total:
        # For a list input the slice is a copy, so the shuffle below does
        # not touch the caller's sequence.
        window = data[start:start + window_size]
        generator.shuffle(window)
        jittered.extend(window)
        start += window_size

    return jittered


def order(in_data, args):
    """
    Folding Ordering: sort the input by score in ascending order, split the
    sorted sequence into ``folding_layer`` buckets by index modulo the layer
    count, and concatenate the buckets one after another. Each bucket thus
    sweeps the score range from low to high, giving a "folded" distribution.
    A local window shuffle is optionally applied at the end.

    Args:
        in_data (list): Input records; each one is indexed with
            ``score_field`` to obtain its score.
        args: Object carrying the configuration attributes.
            - score_field: Name of the score field.
            - folding_layer: Number of folding layers (must be >= 1).
            - window_size: Local shuffle window size (optional, defaults to
              0; values of 1 or less skip the shuffle).
            - seed: Random seed for the local shuffle (optional, defaults
              to 42).

    Returns:
        list: The re-ordered records.

    Raises:
        ValueError: If ``folding_layer`` is smaller than 1. Without this
            check the bucket loop would not run and every record would be
            silently dropped.
    """
    score_field = args.score_field
    layers = args.folding_layer
    window_size = getattr(args, "window_size", 0)
    seed = getattr(args, "seed", 42)

    if layers < 1:
        raise ValueError(f"folding_layer must be >= 1, got {layers!r}")

    sorted_data = sorted(in_data, key=lambda x: x[score_field])

    out_data = []
    for offset in range(layers):
        # Extended slice == every index i with i % layers == offset, but in
        # a single pass instead of rescanning the whole list per layer.
        out_data.extend(sorted_data[offset::layers])

    if window_size > 1:
        out_data = window_based_shuffle(out_data, window_size, seed)

    return out_data
139 changes: 139 additions & 0 deletions data_ordering/saw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import numpy as np


def _apply_interleave_fold(data_segment, score_field, layers, reverse_even_layers=False):
    """
    Apply the "Folding" re-arrangement to one data segment.

    The segment is sorted by score in ascending order and dealt into
    ``layers`` buckets by index modulo ``layers``; the buckets are then
    concatenated in order.

    Args:
        data_segment (list): Records to re-arrange.
        score_field (str): Name of the score field.
        layers (int): Number of interleaved layers (must be >= 1).
        reverse_even_layers (bool): When True, every second bucket (the
            2nd, 4th, ... i.e. odd 0-based offsets) is reversed, which
            produces a zigzag pattern.

    Returns:
        list: A new list with the re-arranged records.

    Raises:
        ValueError: If ``layers`` is smaller than 1. Without this check the
            bucket loop would not run and the segment's records would be
            silently dropped.
    """
    if layers < 1:
        raise ValueError(f"layers must be >= 1, got {layers!r}")

    if not data_segment:
        return []

    sorted_data = sorted(data_segment, key=lambda x: x[score_field])

    out_data = []
    for offset in range(layers):
        # Same elements as filtering on i % layers == offset, taken in one
        # slice instead of rescanning the segment for each layer.
        bucket = sorted_data[offset::layers]
        if reverse_even_layers and offset % 2 == 1:
            bucket.reverse()
        out_data.extend(bucket)

    return out_data


def window_based_shuffle(data, window_size, seed=42):
    """
    Jittering Ordering: randomize items within consecutive local windows
    while leaving the window-to-window order untouched (globally ordered,
    locally unordered).

    Args:
        data: Input list of items.
        window_size: Size of each local shuffle window. With 0 or 1 no
            local shuffling is performed and the input is returned as-is.
        seed: Random seed.

    Returns:
        list: The re-ordered items.
    """
    if window_size <= 1:
        return data

    rng = np.random.RandomState(seed)

    # Cut the sequence into back-to-back windows first ...
    windows = [data[lo:lo + window_size] for lo in range(0, len(data), window_size)]

    # ... then shuffle them front to back so the random stream is consumed
    # in window order ...
    for window in windows:
        rng.shuffle(window)

    # ... and stitch the windows back together.
    return [item for window in windows for item in window]


def order(in_data, args):
    """
    Saw Ordering: sort all records by score, re-arrange a window of records
    around each of the ``num_sections - 1`` split points with a local
    interleaved fold, and finally apply an optional local window shuffle.

    Args:
        in_data (list): Input records; each one is indexed with
            ``score_field`` to obtain its score.
        args: Object carrying the configuration attributes.
            - score_field: Name of the score field.
            - ascending: Whether the global sort is ascending.
            - reverse_even_layers: Whether every second fold layer is
              reversed (optional; treated as False when absent).
            - folding_layer: Number of layers used by the local fold.
            - num_sections: Number of sections the sorted data is split
              into; values of 1 or less skip the folding step.
            - folding_ratio: Fraction of the total record count taken on
              each side of a split point for folding (e.g. 0.10 means 10%
              before and 10% after).
            - window_size: Local shuffle window size (optional, defaults to
              0; values of 1 or less skip the shuffle).
            - seed: Random seed for the local shuffle (optional, defaults
              to 42).

    Returns:
        list: The re-ordered records.
    """
    score_field = args.score_field
    ascending = args.ascending
    section_count = args.num_sections
    ratio = args.folding_ratio
    fold_layers = args.folding_layer
    flip_alternate = getattr(args, 'reverse_even_layers', False)

    window_size = getattr(args, "window_size", 0)
    seed = getattr(args, "seed", 42)

    ranked = sorted(in_data, key=lambda item: item[score_field], reverse=not ascending)
    total = len(ranked)

    if total == 0:
        return ranked

    if section_count <= 1:
        arranged = ranked
    else:
        pivots = [int(round(total * k / section_count)) for k in range(1, section_count)]
        radius = int(round(total * ratio))

        arranged = []
        cursor = 0
        for pivot in pivots:
            # Clamp the fold window to the data and keep it from reaching
            # back into records that were already emitted.
            lo = max(0, pivot - radius, cursor)
            hi = max(min(total, pivot + radius), lo)

            if lo > cursor:
                # Untouched stretch between the previous window and this one.
                arranged.extend(ranked[cursor:lo])

            if hi > lo:
                # NOTE(review): the fold helper re-sorts this window in
                # ascending score order regardless of `ascending` - confirm
                # that is intended for descending runs.
                arranged.extend(
                    _apply_interleave_fold(
                        ranked[lo:hi],
                        score_field,
                        fold_layers,
                        flip_alternate,
                    )
                )

            cursor = hi

        if cursor < total:
            arranged.extend(ranked[cursor:total])

    if window_size > 1:
        arranged = window_based_shuffle(arranged, window_size, seed)

    return arranged
Loading