{"schema_version":"onlylabs.public_signal.v1","title":"Amazon (Nova) Repo: amazon-science/LLM-Accuracy-Stats","description":"Amazon (Nova) repo signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a","json_url":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a/signal.json","generated_at":"2026-06-11T02:53:06.402621+00:00","org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/amazon","dossier_json_url":"https://onlylabs.fyi/labs/amazon/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a","signal_json":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a/signal.json","source":"https://github.com/amazon-science/LLM-Accuracy-Stats","lab_dossier":"https://onlylabs.fyi/labs/amazon","lab_dossier_json":"https://onlylabs.fyi/labs/amazon/dossier.json","analysis":"https://onlylabs.fyi/analysis/amazon","analysis_json":"https://onlylabs.fyi/analysis/amazon/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/amazon/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":null,"topic_signals_json":null,"topic_feed":null,"data_business":null},"answer_pack":{"answer":"Amazon (Nova) published amazon-science/LLM-Accuracy-Stats (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo amazon-science/LLM-Accuracy-Stats · language Python · Low-star repo from Amazon Science. onlylabs links this event to 1 captured evidence page and 6 related repo signals.","signal_desk":"repos","source_context":{"source_url":"https://github.com/amazon-science/LLM-Accuracy-Stats","source_host":"github.com","occurred_at":"2026-02-06T13:09:06+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source","context":"Python"},"context_markers":[{"label":"Lab","value":"Amazon (Nova)","source":"signal"},{"label":"Signal desk","value":"repos","source":"signal"},{"label":"Source host","value":"github.com","source":"source"},{"label":"Repository","value":"amazon-science/LLM-Accuracy-Stats","source":"source"},{"label":"Language","value":"Python","source":"source"},{"label":"Stars","value":"4","source":"traction"},{"label":"Notability","value":"Low-star repo from Amazon Science","source":"signal"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Data pipeline","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/amazon-science/LLM-Accuracy-Stats"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T02:53:06.402621+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a/signal.json","dossier_json":"https://onlylabs.fyi/labs/amazon/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/amazon/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/amazon/evidence.json","topic_signals_json":null,"topic_feed":null,"category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","evidence_focus":["repo name","owner","description","language","stars","source URL","first seen time","data, eval, infra, safety, and product terms"],"extraction_questions":["What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Is the repo new evidence for a lab direction that is not yet in writing or releases?","Which related signals should an analyst inspect next?"],"signal_questions":["What does this new repository reveal before a formal announcement exists?","What technical area does this repository expose?","Does the repo imply eval, data, infrastructure, agent, or deployment work?","Do the 6 related repo signals show a repeated pattern?"],"output_fields":["org","repo","technical_theme","data_business_lane","evidence_url"],"data_business_relevance":"New repositories can expose organization build priorities early, especially around internal tooling, eval infrastructure, data systems, deployment, and agent workflows.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a/signal.json","required":true},{"label":"source","url":"https://github.com/amazon-science/LLM-Accuracy-Stats","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/amazon/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/amazon/evidence.json","required":true},{"label":"topic_signals_json","url":null,"required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Amazon (Nova)'s repo signal \"amazon-science/LLM-Accuracy-Stats\" for frontier lab strategy."},"semantic_triples":[{"subject":"Amazon (Nova)","predicate":"published repo","object":"amazon-science/LLM-Accuracy-Stats","text":"Amazon (Nova) published repo amazon-science/LLM-Accuracy-Stats."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"is classified as","object":"repo signal","text":"amazon-science/LLM-Accuracy-Stats is classified as repo signal."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"belongs to","object":"repos desk","text":"amazon-science/LLM-Accuracy-Stats belongs to repos desk."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has context","object":"Python","text":"amazon-science/LLM-Accuracy-Stats has context Python."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has evidence coverage","object":"1 captured evidence page","text":"amazon-science/LLM-Accuracy-Stats has evidence coverage 1 captured evidence page."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has captured page count","object":"1","text":"amazon-science/LLM-Accuracy-Stats has captured page count 1."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has readable page count","object":"1","text":"amazon-science/LLM-Accuracy-Stats has readable page count 1."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has related signal count","object":"6","text":"amazon-science/LLM-Accuracy-Stats has related signal count 6."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has analysis playbook objective","object":"Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.","text":"amazon-science/LLM-Accuracy-Stats has analysis playbook objective Turn new repository signals into early evidence of tooling, eval, infrastructure, model-adjacent, or product work before it appears in polished launch channels.."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has source host","object":"github.com","text":"amazon-science/LLM-Accuracy-Stats has source host github.com."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has lab","object":"Amazon (Nova)","text":"amazon-science/LLM-Accuracy-Stats has lab Amazon (Nova)."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has signal desk","object":"repos","text":"amazon-science/LLM-Accuracy-Stats has signal desk repos."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has source host","object":"github.com","text":"amazon-science/LLM-Accuracy-Stats has source host github.com."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has repository","object":"amazon-science/LLM-Accuracy-Stats","text":"amazon-science/LLM-Accuracy-Stats has repository amazon-science/LLM-Accuracy-Stats."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has language","object":"Python","text":"amazon-science/LLM-Accuracy-Stats has language Python."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has stars","object":"4","text":"amazon-science/LLM-Accuracy-Stats has stars 4."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has notability","object":"Low-star repo from Amazon Science","text":"amazon-science/LLM-Accuracy-Stats has notability Low-star repo from Amazon Science."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has watch term","object":"Eval methodology","text":"amazon-science/LLM-Accuracy-Stats has watch term Eval methodology."}]},"intelligence":{"signal_desk":"repos","answer":"Amazon (Nova) published amazon-science/LLM-Accuracy-Stats (Python). This repository signal exposes tooling, eval, infrastructure, or model-adjacent work before it may appear in a launch post. High-signal details: repo amazon-science/LLM-Accuracy-Stats · language Python · Low-star repo from Amazon Science. onlylabs links this event to 1 captured evidence page and 6 related repo signals.","semantic_triples":[{"subject":"Amazon (Nova)","predicate":"published repo","object":"amazon-science/LLM-Accuracy-Stats","text":"Amazon (Nova) published repo amazon-science/LLM-Accuracy-Stats."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"is classified as","object":"repo signal","text":"amazon-science/LLM-Accuracy-Stats is classified as repo signal."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"belongs to","object":"repos desk","text":"amazon-science/LLM-Accuracy-Stats belongs to repos desk."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has context","object":"Python","text":"amazon-science/LLM-Accuracy-Stats has context Python."},{"subject":"amazon-science/LLM-Accuracy-Stats","predicate":"has evidence coverage","object":"1 captured evidence page","text":"amazon-science/LLM-Accuracy-Stats has evidence coverage 1 captured evidence page."}]},"signal":{"id":"9dfa5000-4f39-4318-8afb-5a8c0d98b15a","url":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a","json_url":"https://onlylabs.fyi/signals/9dfa5000-4f39-4318-8afb-5a8c0d98b15a/signal.json","source_url":"https://github.com/amazon-science/LLM-Accuracy-Stats","title":"amazon-science/LLM-Accuracy-Stats","summary":"Amazon (Nova) published a new repository. onlylabs watches repos for tooling, eval, infra, and model-adjacent work.","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-02-06T13:09:06+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://github.com/amazon-science/LLM-Accuracy-Stats"]},"facets":{"repo":"amazon-science/LLM-Accuracy-Stats","language":"Python"},"traction":{"github_stars":4,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://github.com/amazon-science/LLM-Accuracy-Stats","final_url":"https://github.com/amazon-science/LLM-Accuracy-Stats","title":"amazon-science/LLM-Accuracy-Stats repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T02:53:06.402621+00:00","bytes":20368,"raw_path":"67caf9052224a6c8a611fcc880a2ebe35020875b87955e61e2918a33748e592a.json","content_hash":"2380334841b896651a17dfe8ffee36ce9a5541fc3dfb825d0701707216f1a71f","excerpt_chars":1200,"truncated":true,"excerpt":"amazon-science/LLM-Accuracy-Stats Description: Test optimized LLMs for degraded accuracy Language: Python License: NOASSERTION Stars: 4 Forks: 1 Open issues: 0 Created: 2026-02-06T13:09:06Z Pushed: 2026-02-12T05:47:50Z Default branch: main Fork: no Archived: yes README: When LLMs get significantly worse: A statistical approach to detect model degradations [![ICLR 2026](https://img.shields.io/badge/ICLR-2026-blue)](https://openreview.net/forum?id=cM3gsqEI4K) [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](LICENSE) [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/) [![arXiv](https://img.shields.io/badge/arXiv-2602.10144-b31b1b.svg)](https://arxiv.org/abs/2602.10144) This repository contains the code for reproducing experiments from our ICLR 2026 paper on statistical detection of LLM model degradations using McNemar's test. We provide tools to detect whether accuracy changes in optimized models are due to actual degradation or evaluation noise. Installation We recommend using uv: https://docs.astral.sh/uv/getting-started/installation/ ```bash uv venv ~/venv_accuracy_paper --python..."},"evidence_pages":[{"url":"https://github.com/amazon-science/LLM-Accuracy-Stats","final_url":"https://github.com/amazon-science/LLM-Accuracy-Stats","title":"amazon-science/LLM-Accuracy-Stats repository metadata","http_status":200,"content_type":"application/json","capture_method":"plain","fetched_at":"2026-06-11T02:53:06.402621+00:00","bytes":20368,"raw_path":"67caf9052224a6c8a611fcc880a2ebe35020875b87955e61e2918a33748e592a.json","content_hash":"2380334841b896651a17dfe8ffee36ce9a5541fc3dfb825d0701707216f1a71f","excerpt_chars":1200,"truncated":true,"excerpt":"amazon-science/LLM-Accuracy-Stats Description: Test optimized LLMs for degraded accuracy Language: Python License: NOASSERTION Stars: 4 Forks: 1 Open issues: 0 Created: 2026-02-06T13:09:06Z Pushed: 2026-02-12T05:47:50Z Default branch: main Fork: no Archived: yes README: When LLMs get significantly worse: A statistical approach to detect model degradations [![ICLR 2026](https://img.shields.io/badge/ICLR-2026-blue)](https://openreview.net/forum?id=cM3gsqEI4K) [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](LICENSE) [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/) [![arXiv](https://img.shields.io/badge/arXiv-2602.10144-b31b1b.svg)](https://arxiv.org/abs/2602.10144) This repository contains the code for reproducing experiments from our ICLR 2026 paper on statistical detection of LLM model degradations using McNemar's test. We provide tools to detect whether accuracy changes in optimized models are due to actual degradation or evaluation noise. Installation We recommend using uv: https://docs.astral.sh/uv/getting-started/installation/ ```bash uv venv ~/venv_accuracy_paper --python..."}],"related_signals":[{"id":"087c32a2-6ad0-4981-9315-11fdd32a0153","url":"https://onlylabs.fyi/signals/087c32a2-6ad0-4981-9315-11fdd32a0153","source_url":"https://github.com/amazon-science/reskill","title":"amazon-science/reskill","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-06-04T02:13:35+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e5701aed-6cd3-48dd-bfa6-ef839031e2e8","url":"https://onlylabs.fyi/signals/e5701aed-6cd3-48dd-bfa6-ef839031e2e8","source_url":"https://github.com/amazon-science/dualkv-flash-attn-for-rl","title":"amazon-science/dualkv-flash-attn-for-rl","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-27T17:38:58+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"8af28f0c-7331-4b08-b517-e18b3555e503","url":"https://onlylabs.fyi/signals/8af28f0c-7331-4b08-b517-e18b3555e503","source_url":"https://github.com/amazon-science/EvoMAS","title":"amazon-science/EvoMAS","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-19T19:23:29+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e3ff8718-7daa-4ebd-a3e6-3d825c538b74","url":"https://onlylabs.fyi/signals/e3ff8718-7daa-4ebd-a3e6-3d825c538b74","source_url":"https://github.com/amazon-science/adaptive-layerwise-perturbation","title":"amazon-science/adaptive-layerwise-perturbation","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-14T17:44:17+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"9afcd328-0124-485c-8ace-9c3ad546e316","url":"https://onlylabs.fyi/signals/9afcd328-0124-485c-8ace-9c3ad546e316","source_url":"https://github.com/amazon-science/temporal-reasoning-dataset","title":"amazon-science/temporal-reasoning-dataset","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-13T13:07:08+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"},{"id":"e19ce80b-3d6a-4aaf-9b1a-82d1b19ab682","url":"https://onlylabs.fyi/signals/e19ce80b-3d6a-4aaf-9b1a-82d1b19ab682","source_url":"https://github.com/amazon-science/PROF-GRPO","title":"amazon-science/PROF-GRPO","context":"Python","kind":{"key":"repo_new","label":"Repo"},"org":{"slug":"amazon","name":"Amazon (Nova)","category":"frontier-lab"},"occurred_at":"2026-05-12T19:43:55+00:00","first_seen_at":"2026-06-05T20:58:37.464059+00:00","date_source":"source"}]}