|
|
| """
|
| Download LMSYS-Chat-1M and transform it into plain text for LLM completion models
|
| in the format:
|
| <|im_start|>role
|
| message<|endoftext|>
|
| <|im_stop|>
|
|
|
| with 6 newlines written after each conversation.
|
| """
|
|
|
| from datasets import load_dataset
|
| import sys
|
|
|
def main(output_path="lmsys_chat_1m.txt", split="train"):
    """Dump one split of LMSYS-Chat-1M to a plain-text file.

    Each message of each conversation is written as::

        <|im_start|>{role}
        {content}<|endoftext|>
        <|im_stop|>

    where ``content`` has its surrounding whitespace stripped. Six newline
    characters are written after every conversation. A progress line is
    printed to stderr every 10,000 conversations, and a confirmation line
    is printed to stdout at the end.

    Args:
        output_path: Path of the UTF-8 text file to create or overwrite.
        split: Dataset split name forwarded to ``load_dataset``.
    """
    dataset = load_dataset("lmsys/lmsys-chat-1m", split=split)
    conversation_gap = "\n" * 6

    with open(output_path, "w", encoding="utf-8") as sink:
        # Count from 1 so the progress check and message need no "+ 1".
        for count, record in enumerate(dataset, start=1):
            # writelines() consumes the generator lazily, so each turn is
            # written as soon as it has been formatted.
            sink.writelines(
                f"<|im_start|>{turn['role']}\n"
                f"{turn['content'].strip()}<|endoftext|>\n<|im_stop|>\n"
                for turn in record["conversation"]
            )
            sink.write(conversation_gap)

            if count % 10000 == 0:
                print(f"Processed {count} conversations", file=sys.stderr)

    print(f"✔ Saved plain-text to: {output_path}")
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line front end: parse the output path and the dataset split,
    # then hand both to main().
    parser = argparse.ArgumentParser(
        description="Convert LMSYS-Chat-1M to LLM-friendly text format"
    )
    parser.add_argument(
        "--output",
        "-o",
        default="lmsys_chat_1m.txt",
        help="Output file path",
    )
    parser.add_argument(
        "--split",
        "-s",
        default="train",
        help="Dataset split (e.g. 'train')",
    )
    options = parser.parse_args()

    main(output_path=options.output, split=options.split)
|
|
|