{"schema_version":"onlylabs.public_signal.v1","title":"Microsoft Repo: microsoft/benchpress","description":"Microsoft repo signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826","json_url":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826/signal.json","generated_at":"2026-06-26T23:16:50.143Z","evidence_latest_fetched_at":"2026-06-24T07:04:56.682451+00:00","signal_first_seen_at":"2026-06-24T07:00:33.364532+00:00","org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/microsoft","dossier_json_url":"https://onlylabs.fyi/labs/microsoft/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826","signal_json":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826/signal.json","source":"https://github.com/microsoft/benchpress","lab_dossier":"https://onlylabs.fyi/labs/microsoft","lab_dossier_json":"https://onlylabs.fyi/labs/microsoft/dossier.json","analysis":"https://onlylabs.fyi/analysis/microsoft","analysis_json":"https://onlylabs.fyi/analysis/microsoft/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/microsoft/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":null,"topic_signals_json":null,"topic_feed":null,"data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"Microsoft published microsoft/benchpress (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo microsoft/benchpress · language Python · Microsoft's benchmarking suite for foundation models.. onlylabs links this event to 1 captured evidence page and 6 related repo signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"repos","source_context":{"source_url":"https://github.com/microsoft/benchpress","source_host":"github.com","occurred_at":"2026-05-17T21:29:19+00:00","first_seen_at":"2026-06-24T07:00:33.364532+00:00","date_source":"source","context":"Python"},"context_markers":[{"label":"Lab","value":"Microsoft","source":"signal"},{"label":"Signal desk","value":"repos","source":"signal"},{"label":"Source host","value":"github.com","source":"source"},{"label":"Repository","value":"microsoft/benchpress","source":"source"},{"label":"Language","value":"Python","source":"source"},{"label":"Context","value":"Microsoft's benchmarking suite for foundation models.","source":"signal"},{"label":"Notability","value":"New benchmarking tool from Microsoft, moderate interest.","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"benchmark","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Model card","source":"model"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/microsoft/benchpress"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-24T07:04:56.682451+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["benchmark"],"score":14,"reason":"Microsoft has a repo signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826/signal.json","dossier_json":"https://onlylabs.fyi/labs/microsoft/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/microsoft/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/microsoft/evidence.json","topic_signals_json":null,"topic_feed":null,"category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","evidence_focus":["repo name","owner","description","language","stars","source URL","first seen time","data, eval, infra, safety, and product terms"],"extraction_questions":["What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Is the repo new evidence for a lab direction that is not yet in writing or releases?","Which related signals should an analyst inspect next?"],"signal_questions":["What does this new repository reveal before a formal announcement exists?","What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related repo signals show a repeated pattern?"],"output_fields":["org","repo","technical_theme","data_business_lane","evidence_url"],"data_business_relevance":"New repositories can expose organization build priorities early, especially around internal tooling, eval infrastructure, data systems, deployment, and agent workflows.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826/signal.json","required":true},{"label":"source","url":"https://github.com/microsoft/benchpress","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/microsoft/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/microsoft/evidence.json","required":true},{"label":"topic_signals_json","url":null,"required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Microsoft's repo signal \"microsoft/benchpress\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Microsoft","predicate":"published repo","object":"microsoft/benchpress","text":"Microsoft published repo microsoft/benchpress."},{"subject":"microsoft/benchpress","predicate":"is classified as","object":"repo signal","text":"microsoft/benchpress is classified as repo signal."},{"subject":"microsoft/benchpress","predicate":"belongs to","object":"repos desk","text":"microsoft/benchpress belongs to repos desk."},{"subject":"microsoft/benchpress","predicate":"has context","object":"Python","text":"microsoft/benchpress has context Python."},{"subject":"microsoft/benchpress","predicate":"has evidence coverage","object":"1 captured evidence page","text":"microsoft/benchpress has evidence coverage 1 captured evidence page."},{"subject":"microsoft/benchpress","predicate":"matches data-business lanes","object":"Evals and quality","text":"microsoft/benchpress matches data-business lanes Evals and quality."},{"subject":"microsoft/benchpress","predicate":"has captured page count","object":"1","text":"microsoft/benchpress has captured page count 1."},{"subject":"microsoft/benchpress","predicate":"has readable page count","object":"1","text":"microsoft/benchpress has readable page count 1."},{"subject":"microsoft/benchpress","predicate":"has related signal count","object":"6","text":"microsoft/benchpress has related signal count 6."},{"subject":"microsoft/benchpress","predicate":"has analysis playbook objective","object":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","text":"microsoft/benchpress has analysis playbook objective Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.."},{"subject":"microsoft/benchpress","predicate":"has source host","object":"github.com","text":"microsoft/benchpress has source host github.com."},{"subject":"microsoft/benchpress","predicate":"has lab","object":"Microsoft","text":"microsoft/benchpress has lab Microsoft."},{"subject":"microsoft/benchpress","predicate":"has signal desk","object":"repos","text":"microsoft/benchpress has signal desk repos."},{"subject":"microsoft/benchpress","predicate":"has source host","object":"github.com","text":"microsoft/benchpress has source host github.com."},{"subject":"microsoft/benchpress","predicate":"has repository","object":"microsoft/benchpress","text":"microsoft/benchpress has repository microsoft/benchpress."},{"subject":"microsoft/benchpress","predicate":"has language","object":"Python","text":"microsoft/benchpress has language Python."},{"subject":"microsoft/benchpress","predicate":"has context","object":"Microsoft's benchmarking suite for foundation models.","text":"microsoft/benchpress has context Microsoft's benchmarking suite for foundation models.."},{"subject":"microsoft/benchpress","predicate":"has notability","object":"New benchmarking tool from Microsoft, moderate interest.","text":"microsoft/benchpress has notability New benchmarking tool from Microsoft, moderate interest.."},{"subject":"microsoft/benchpress","predicate":"has radar lane","object":"Evals and quality","text":"microsoft/benchpress has radar lane Evals and quality."}]},"intelligence":{"signal_desk":"repos","answer":"Microsoft published microsoft/benchpress (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo microsoft/benchpress · language Python · Microsoft's benchmarking suite for foundation models.. onlylabs links this event to 1 captured evidence page and 6 related repo signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"Microsoft","predicate":"published repo","object":"microsoft/benchpress","text":"Microsoft published repo microsoft/benchpress."},{"subject":"microsoft/benchpress","predicate":"is classified as","object":"repo signal","text":"microsoft/benchpress is classified as repo signal."},{"subject":"microsoft/benchpress","predicate":"belongs to","object":"repos desk","text":"microsoft/benchpress belongs to repos desk."},{"subject":"microsoft/benchpress","predicate":"has context","object":"Python","text":"microsoft/benchpress has context Python."},{"subject":"microsoft/benchpress","predicate":"has evidence coverage","object":"1 captured evidence page","text":"microsoft/benchpress has evidence coverage 1 captured evidence page."},{"subject":"microsoft/benchpress","predicate":"matches data-business lanes","object":"Evals and quality","text":"microsoft/benchpress matches data-business lanes Evals and quality."}]},"signal":{"id":"5febcd44-ef2e-477f-8d74-1755cabdb826","url":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826","json_url":"https://onlylabs.fyi/signals/5febcd44-ef2e-477f-8d74-1755cabdb826/signal.json","source_url":"https://github.com/microsoft/benchpress","title":"microsoft/benchpress","summary":"Microsoft published a new repository. onlylabs watches repos for tooling, eval, infra, and model-adjacent work.","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-05-17T21:29:19+00:00","first_seen_at":"2026-06-24T07:00:33.364532+00:00","date_source":"source","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/microsoft/benchpress"]},"facets":{"repo":"microsoft/benchpress","language":"Python"},"traction":{"github_stars":0,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":14,"matched_terms":["benchmark"],"reason":"Microsoft has a repo signal matching evals and quality."}},"primary_evidence_page":{"is_primary":true,"source_match":true,"url":"https://github.com/microsoft/benchpress","final_url":"https://github.com/microsoft/benchpress","title":"microsoft/benchpress repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-24T07:04:56.682451+00:00","bytes":20124,"raw_path":"8dc2750ae791f201cfd7bb02ed11145952c2b3a4d305be29535a1b24055882cc.json","content_hash":"3c078e6a865182ceaf7c8f5d1014e69355ca42d95b91532482b2d828a825e581","excerpt_chars":1200,"truncated":true,"excerpt":"microsoft/benchpress Description: BenchPress: calibrated LLM benchmark score completion Language: Python License: MIT Stars: 0 Forks: 0 Open issues: 1 Created: 2026-05-17T21:29:19Z Pushed: 2026-06-24T02:59:46Z Default branch: main Fork: no Archived: no README: You Don't Need to Run Every Eval Yuchen Zeng, Dimitris Papailiopoulos Microsoft Research, AI Frontiers Project page · Code · Dataset · Paper **Abstract**: A modern model release reports scores on 40+ benchmarks; behind the release, evaluations were run orders of magnitude more often across checkpoints, hyperparameter sweeps, and design choices. We ask whether scores accumulated across public releases can *anticipate* a model's performance on benchmarks it has not yet been run on, and decide which evaluations are most worth running next. We compile a public score matrix of 84 frontier models on 133 benchmarks (2,604 observed cells, 23.3% filled) and find its geometry is approximately rank-2: across complete submatrices, two factors explain more than 90% of the variance. We exploit this structure with **BenchPress**: logit-space bias-decomposed rank-2 matrix completion, which completes hidden scores within a **4.6**..."},"evidence_pages":[],"related_signals":[{"id":"3fdd4690-9c28-4863-ba65-4ec299d85aa0","url":"https://onlylabs.fyi/signals/3fdd4690-9c28-4863-ba65-4ec299d85aa0","source_url":"https://github.com/microsoft/fabric-events","title":"microsoft/fabric-events","context":"HTML","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-23T00:54:34+00:00","first_seen_at":"2026-06-23T07:00:32.834993+00:00","date_source":"source"},{"id":"c0d592c7-e699-4bf5-a2c3-5b79f43f85a0","url":"https://onlylabs.fyi/signals/c0d592c7-e699-4bf5-a2c3-5b79f43f85a0","source_url":"https://github.com/microsoft/citadel","title":"microsoft/citadel","context":"HTML","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-22T06:20:10+00:00","first_seen_at":"2026-06-22T07:00:31.308446+00:00","date_source":"source"},{"id":"96d85410-04f4-4314-b4d1-6788163ff05e","url":"https://onlylabs.fyi/signals/96d85410-04f4-4314-b4d1-6788163ff05e","source_url":"https://github.com/microsoft/amplifier-app-opencode","title":"microsoft/amplifier-app-opencode","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-20T22:24:01+00:00","first_seen_at":"2026-06-23T07:00:32.834993+00:00","date_source":"source"},{"id":"027f6a7e-9e04-4fad-a3d7-9435bc32cf00","url":"https://onlylabs.fyi/signals/027f6a7e-9e04-4fad-a3d7-9435bc32cf00","source_url":"https://github.com/microsoft/amplifier-app-wiki-weaver","title":"microsoft/amplifier-app-wiki-weaver","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-19T23:38:20+00:00","first_seen_at":"2026-06-20T07:00:30.944907+00:00","date_source":"source"},{"id":"4680cd70-2c2f-4483-bd8d-ceb9b28de9cf","url":"https://onlylabs.fyi/signals/4680cd70-2c2f-4483-bd8d-ceb9b28de9cf","source_url":"https://github.com/microsoft/What-I-did-with-Cowork","title":"microsoft/What-I-did-with-Cowork","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-18T09:39:01+00:00","first_seen_at":"2026-06-23T07:00:32.834993+00:00","date_source":"source"},{"id":"4b65f882-b9ed-45ca-b252-be0ddd8ee03e","url":"https://onlylabs.fyi/signals/4b65f882-b9ed-45ca-b252-be0ddd8ee03e","source_url":"https://github.com/microsoft/Salesforce-Custom-Copilot-Connector","title":"microsoft/Salesforce-Custom-Copilot-Connector","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"microsoft","name":"Microsoft","category":"frontier-lab"},"occurred_at":"2026-06-11T06:55:24+00:00","first_seen_at":"2026-06-18T07:00:31.633644+00:00","date_source":"source"}]}