"""Concatenate every ``*.py`` file in a dataset folder into one corpus file.

Each input file's text is appended to the output file, preceded by a short
comment-line separator. Files that cannot be opened or read are not fatal:
a ``# Skipping ...`` comment is written to the output in their place.
"""

import glob
import os

from tqdm import tqdm

folder = os.path.expanduser("~/torch_datasets/github-python/mega_licensed_all_files")
output_file = os.path.expanduser(
    "~/torch_datasets/github-python/mega_licensed_corpus/concatenated.py"
)

# The corpus directory may not exist yet on a fresh machine; opening the
# output file for writing would otherwise fail with FileNotFoundError.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# glob.glob() yields entries in arbitrary (filesystem-dependent) order, so
# sort the paths to make the concatenated corpus reproducible across runs.
source_files = sorted(glob.glob(os.path.join(folder, "*.py")))

with open(output_file, "w", encoding="utf-8") as out_f:
    for file in tqdm(source_files):
        # Separator written before every file (including skipped ones).
        out_f.write("\n# \n")
        try:
            # errors="ignore" drops undecodable bytes instead of raising,
            # so only OS-level I/O failures are expected here.
            with open(file, "r", encoding="utf-8", errors="ignore") as in_f:
                content = in_f.read()
        except OSError as e:
            # Best effort: record the failure in the corpus and carry on.
            out_f.write(f"\n# Skipping {file} due to error: {e}\n")
        else:
            out_f.write(content)

print(f"Concatenation complete: {output_file}")