| | |
| | import os |
| | import io |
| | import json |
| | import tarfile |
| | import subprocess |
| | import tempfile |
| | from pathlib import Path |
| | from datetime import datetime, timezone |
| |
|
| | from huggingface_hub import HfApi |
| |
|
| | from modular_graph_and_candidates import ( |
| | build_graph_json, |
| | generate_html, |
| | build_timeline_json, |
| | generate_timeline_html, |
| | ) |
| |
|
# Git repository to analyze (overridable via the REPO_URL env var).
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hugging Face dataset repo where the generated artifacts are cached.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Minimum similarity threshold forwarded to the graph/timeline builders.
MIN_THRESH = 0.1
# Whether to include multimodal models; any truthy-looking env value enables it.
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
# Similarity method forwarded to the builders (default: "jaccard").
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
# Date drawn as a cutoff annotation in the LOC chart ("modular" marker).
MODULAR_CUTOFF_ISO = "2024-05-31"
| |
|
| | def _run(cwd: Path, *args: str) -> str: |
| | p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200) |
| | if p.returncode != 0: |
| | raise RuntimeError(p.stderr.strip()[:400]) |
| | return p.stdout |
| |
|
| | def _count_lines(text: str) -> int: |
| | return text.count("\n") + (1 if text and not text.endswith("\n") else 0) |
| |
|
def _loc_from_archive(archive_bytes: bytes) -> "tuple[dict, dict]":
    """Tally LOC of modeling_*/modular_* python files in a `git archive` tar.

    Returns (modeling_by_model, modular_by_model): each maps the model
    directory name (the path component right after "models/") to the total
    line count of its modeling_* / modular_* .py files.
    """
    modeling_by_model: dict = {}
    modular_by_model: dict = {}
    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            name = member.name
            if not name.endswith(".py") or "/models/" not in name:
                continue
            # Model name = component following the "models" directory.
            parts = name.split("/")
            try:
                idx = parts.index("models")
                model = parts[idx + 1] if idx + 1 < len(parts) else ""
            except ValueError:
                model = ""
            if not model:
                continue
            if "/modeling_" in name or "/modular_" in name:
                f = tar.extractfile(member)
                if not f:
                    continue
                try:
                    txt = f.read().decode("utf-8", errors="ignore")
                finally:
                    f.close()
                n = _count_lines(txt)
                if "/modular_" in name:
                    modular_by_model[model] = modular_by_model.get(model, 0) + n
                elif "/modeling_" in name:
                    modeling_by_model[model] = modeling_by_model.get(model, 0) + n
    return modeling_by_model, modular_by_model


def _compute_loc_growth(repo: Path) -> dict:
    """Measure LOC growth of src/transformers/models across the repo history.

    For a (sub)sampled series of commits that touched the models directory,
    records per commit:
      - loc_modeling_all: LOC of every modeling_*.py file
      - loc_modular: LOC of every modular_*.py file
      - loc_modeling_included: modeling LOC for models WITHOUT a modular file
      - effective_loc: loc_modeling_included + loc_modular
      - n_models_with_modular: count of model dirs shipping a modular file

    Returns {"series": [per-commit dicts], "cutoff": MODULAR_CUTOFF_ISO}.
    """
    # main() clones shallow; deepen here so the full history is walkable.
    # Fall back to a bounded deepen when --unshallow is rejected (e.g. the
    # clone is already complete).
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except Exception:
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    log_lines = _run(
        repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec
    ).splitlines()
    # Split each "sha|date" line exactly once (the original split twice).
    commits = [tuple(ln.split("|", 1)) for ln in log_lines if "|" in ln]

    # Subsample long histories down to ~300 points, but always keep the most
    # recent commit so the series ends at the repo's current state (a plain
    # commits[::step] could silently drop the newest data point).
    if len(commits) > 500:
        step = max(1, len(commits) // 300)
        sampled = commits[::step]
        if sampled[-1] != commits[-1]:
            sampled.append(commits[-1])
        commits = sampled

    out = []
    for sha, date_iso in commits:
        proc = subprocess.run(
            ["git", "archive", sha, "--", pathspec],
            cwd=repo, capture_output=True, timeout=180
        )
        if proc.returncode != 0 or not proc.stdout:
            # Pathspec may not exist at very old commits: record zeros so the
            # series stays aligned with the commit dates.
            out.append({
                "sha": sha, "date": date_iso,
                "loc_modeling_all": 0, "loc_modular": 0,
                "loc_modeling_included": 0, "effective_loc": 0,
                "n_models_with_modular": 0,
            })
            continue

        modeling_by_model, modular_by_model = _loc_from_archive(proc.stdout)

        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        models_with_modular = set(modular_by_model)
        # Models that ship a modular_*.py have their modeling files excluded
        # from the "effective" total (the modular file supersedes them).
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
        modeling_included = modeling_all - modeling_excluded
        effective = modeling_included + modular_loc

        out.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": effective,
            "n_models_with_modular": len(models_with_modular),
        })

    return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}
| |
|
def _loc_html(loc: dict) -> str:
    """Render a standalone ApexCharts HTML page for the LOC-growth series.

    *loc* is the dict produced by _compute_loc_growth: {"series": [...],
    "cutoff": "YYYY-MM-DD"}. The series is embedded in the page as compact
    JSON; the cutoff date is drawn as a vertical annotation.
    """
    # Compact separators keep the inlined JSON payload small.
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    # NOTE: doubled braces ({{ }}) are literal braces in this f-string; the
    # chart library is loaded from the jsdelivr CDN at view time.
    return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
  chart:{{type:"line",height:"100%"}},
  series:[
    {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
    {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
    {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
    {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
  ],
  xaxis:{{type:"datetime"}},
  yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
  stroke:{{width:2}},
  tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
  annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""
| |
|
def main():
    """Clone the target repo, build LOC / graph / timeline artifacts, and
    upload them (plus a latest.json manifest) to the HF dataset cache repo.

    Requires git on PATH and HF credentials in the environment for upload.
    """
    # Shallow clone is enough here; _compute_loc_growth deepens it later.
    # The temp dir is intentionally not cleaned up (short-lived CI job).
    tmp = Path(tempfile.mkdtemp())
    repo_path = tmp / "repo"
    subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
    sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
    ).strip()

    loc_growth = _compute_loc_growth(repo_path)
    loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
    loc_html_str = _loc_html(loc_growth)

    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Cache key: commit sha + similarity settings, so different configs of
    # the same commit do not overwrite each other.
    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
            "loc_json": f"loc/{key}.json",
            "loc_html": f"loc/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str):
        """Upload *text* as a UTF-8 file at *path_in_repo* in the cache repo."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    put(f"loc/{key}.json", loc_json_str)
    put(f"loc/{key}.html", loc_html_str)
    # Manifest pointing at the freshest artifacts; uploaded last so readers
    # never see a manifest referencing files that are not there yet.
    put("latest.json", json.dumps(latest, separators=(",", ":")))


if __name__ == "__main__":
    main()
| |
|