| | |
| | import os |
| | import io |
| | import json |
| | import tarfile |
| | import subprocess |
| | import tempfile |
| | from pathlib import Path |
| | from datetime import datetime, timezone |
| |
|
| | from huggingface_hub import HfApi |
| |
|
| | from modular_graph_and_candidates import ( |
| | build_graph_json, |
| | generate_html, |
| | build_timeline_json, |
| | generate_timeline_html, |
| | ) |
| |
|
# Git repository to analyze (overridable via the REPO_URL env var).
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hugging Face dataset repo where the generated artifacts are cached.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Minimum similarity threshold forwarded to the graph/timeline builders.
MIN_THRESH = 0.1
# Whether to include multimodal models; any truthy-looking env value enables it.
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
# Similarity method forwarded to the builders (default: "jaccard").
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
# Date drawn as a cutoff annotation in the LOC chart ("modular" marker).
MODULAR_CUTOFF_ISO = "2024-05-31"
| |
|
| | def _run(cwd: Path, *args: str) -> str: |
| | p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200) |
| | if p.returncode != 0: |
| | raise RuntimeError(p.stderr.strip()[:400]) |
| | return p.stdout |
| |
|
| | def _count_lines(text: str) -> int: |
| | return text.count("\n") + (1 if text and not text.endswith("\n") else 0) |
| |
|
def _loc_from_archive(archive_bytes: bytes) -> "tuple[dict, dict]":
    """Tally LOC of modeling_*/modular_* python files in a `git archive` tar.

    Returns (modeling_by_model, modular_by_model): each maps the model
    directory name (the path component right after "models/") to the total
    line count of its modeling_* / modular_* .py files.
    """
    modeling_by_model: dict = {}
    modular_by_model: dict = {}
    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            name = member.name
            if not name.endswith(".py") or "/models/" not in name:
                continue
            # Model name = component following the "models" directory.
            parts = name.split("/")
            try:
                idx = parts.index("models")
                model = parts[idx + 1] if idx + 1 < len(parts) else ""
            except ValueError:
                model = ""
            if not model:
                continue
            if "/modeling_" in name or "/modular_" in name:
                f = tar.extractfile(member)
                if not f:
                    continue
                try:
                    txt = f.read().decode("utf-8", errors="ignore")
                finally:
                    f.close()
                n = _count_lines(txt)
                if "/modular_" in name:
                    modular_by_model[model] = modular_by_model.get(model, 0) + n
                elif "/modeling_" in name:
                    modeling_by_model[model] = modeling_by_model.get(model, 0) + n
    return modeling_by_model, modular_by_model


def _compute_loc_growth(repo: Path) -> dict:
    """Measure LOC growth of src/transformers/models across the repo history.

    For a (sub)sampled series of commits that touched the models directory,
    records per commit:
      - loc_modeling_all: LOC of every modeling_*.py file
      - loc_modular: LOC of every modular_*.py file
      - loc_modeling_included: modeling LOC for models WITHOUT a modular file
      - effective_loc: loc_modeling_included + loc_modular
      - n_models_with_modular: count of model dirs shipping a modular file

    Returns {"series": [per-commit dicts], "cutoff": MODULAR_CUTOFF_ISO}.
    """
    # main() clones shallow; deepen here so the full history is walkable.
    # Fall back to a bounded deepen when --unshallow is rejected (e.g. the
    # clone is already complete).
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except Exception:
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    log_lines = _run(
        repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec
    ).splitlines()
    # Split each "sha|date" line exactly once (the original split twice).
    commits = [tuple(ln.split("|", 1)) for ln in log_lines if "|" in ln]

    # Subsample long histories down to ~300 points, but always keep the most
    # recent commit so the series ends at the repo's current state (a plain
    # commits[::step] could silently drop the newest data point).
    if len(commits) > 500:
        step = max(1, len(commits) // 300)
        sampled = commits[::step]
        if sampled[-1] != commits[-1]:
            sampled.append(commits[-1])
        commits = sampled

    out = []
    for sha, date_iso in commits:
        proc = subprocess.run(
            ["git", "archive", sha, "--", pathspec],
            cwd=repo, capture_output=True, timeout=180
        )
        if proc.returncode != 0 or not proc.stdout:
            # Pathspec may not exist at very old commits: record zeros so the
            # series stays aligned with the commit dates.
            out.append({
                "sha": sha, "date": date_iso,
                "loc_modeling_all": 0, "loc_modular": 0,
                "loc_modeling_included": 0, "effective_loc": 0,
                "n_models_with_modular": 0,
            })
            continue

        modeling_by_model, modular_by_model = _loc_from_archive(proc.stdout)

        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        models_with_modular = set(modular_by_model)
        # Models that ship a modular_*.py have their modeling files excluded
        # from the "effective" total (the modular file supersedes them).
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
        modeling_included = modeling_all - modeling_excluded
        effective = modeling_included + modular_loc

        out.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": effective,
            "n_models_with_modular": len(models_with_modular),
        })

    return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}
| |
|
def _loc_html(loc: dict) -> str:
    """Render a standalone ApexCharts HTML page for the LOC-growth series.

    *loc* is the dict produced by _compute_loc_growth: {"series": [...],
    "cutoff": "YYYY-MM-DD"}. The series is embedded in the page as compact
    JSON; the cutoff date is drawn as a vertical annotation.
    """
    # Compact separators keep the inlined JSON payload small.
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    # NOTE: doubled braces ({{ }}) are literal braces in this f-string; the
    # chart library is loaded from the jsdelivr CDN at view time.
    return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
  chart:{{type:"line",height:"100%"}},
  series:[
    {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
    {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
    {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
    {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
  ],
  xaxis:{{type:"datetime"}},
  yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
  stroke:{{width:2}},
  tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
  annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""
| |
|
def main():
    """Clone the target repo, build LOC / graph / timeline artifacts, and
    upload them (plus a latest.json manifest) to the HF dataset cache repo.

    Requires git on PATH and HF credentials in the environment for upload.
    """
    # Shallow clone is enough here; _compute_loc_growth deepens it later.
    # The temp dir is intentionally not cleaned up (short-lived CI job).
    tmp = Path(tempfile.mkdtemp())
    repo_path = tmp / "repo"
    subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
    sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
    ).strip()

    loc_growth = _compute_loc_growth(repo_path)
    loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
    loc_html_str = _loc_html(loc_growth)

    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Cache key: commit sha + similarity settings, so different configs of
    # the same commit do not overwrite each other.
    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
            "loc_json": f"loc/{key}.json",
            "loc_html": f"loc/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str):
        """Upload *text* as a UTF-8 file at *path_in_repo* in the cache repo."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    put(f"loc/{key}.json", loc_json_str)
    put(f"loc/{key}.html", loc_html_str)
    # Manifest pointing at the freshest artifacts; uploaded last so readers
    # never see a manifest referencing files that are not there yet.
    put("latest.json", json.dumps(latest, separators=(",", ":")))


if __name__ == "__main__":
    main()
| |
|