{"schema_version":"onlylabs.public_signal.v1","title":"OpenAI Writing: PaperBench: Evaluating AI’s Ability to Replicate AI Research","description":"OpenAI writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e","json_url":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e/signal.json","generated_at":"2026-06-08T15:45:46.698+00:00","org":{"slug":"openai","name":"OpenAI","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/openai","dossier_json_url":"https://onlylabs.fyi/labs/openai/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e","signal_json":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e/signal.json","source":"https://openai.com/index/paperbench","lab_dossier":"https://onlylabs.fyi/labs/openai","lab_dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis":"https://onlylabs.fyi/analysis/openai","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}]}},"answer_pack":{"answer":"OpenAI published PaperBench: Evaluating AI’s Ability to Replicate AI Research. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Research benchmark on AI replication ability. · PaperBench: Evaluating AI’s Ability to Replicate AI Research | OpenAI April 2, 2025 PaperBench Evaluating AI’s Ability to Replicate AI Research. Read paper View code.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://openai.com/index/paperbench","source_host":"openai.com","occurred_at":"2025-04-02T10:15:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"OpenAI","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"openai.com","source":"source"},{"label":"Notability","value":"Research benchmark on AI replication ability.","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"benchmark","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/paperbench"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-08T15:45:46.698+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"}],"matched_terms":["eval","benchmark"],"score":15,"reason":"OpenAI has a writing signal matching evals and quality."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e/signal.json","dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e/signal.json","required":true},{"label":"source","url":"https://openai.com/index/paperbench","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/openai/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/openai/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze OpenAI's writing signal \"PaperBench: Evaluating AI’s Ability to Replicate AI Research\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","text":"OpenAI published PaperBench: Evaluating AI’s Ability to Replicate AI Research."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"is classified as","object":"writing signal","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research is classified as writing signal."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"belongs to","object":"talking desk","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research belongs to talking desk."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has evidence coverage","object":"1 captured evidence page","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has evidence coverage 1 captured evidence page."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"matches data-business lanes","object":"Evals and quality","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research matches data-business lanes Evals and quality."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has captured page count","object":"1","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has captured page count 1."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has readable page count","object":"1","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has readable page count 1."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has related signal count","object":"6","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has related signal count 6."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has source host","object":"openai.com","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has source host openai.com."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has lab","object":"OpenAI","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has lab OpenAI."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has signal desk","object":"talking","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has signal desk talking."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has source host","object":"openai.com","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has source host openai.com."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has notability","object":"Research benchmark on AI replication ability.","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has notability Research benchmark on AI replication ability.."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has radar lane","object":"Evals and quality","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has radar lane Evals and quality."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has matched term","object":"eval","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has matched term eval."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has matched term","object":"benchmark","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has matched term benchmark."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has watch term","object":"Eval methodology","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has watch term Eval methodology."}]},"intelligence":{"signal_desk":"talking","answer":"OpenAI published PaperBench: Evaluating AI’s Ability to Replicate AI Research. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Research benchmark on AI replication ability. · PaperBench: Evaluating AI’s Ability to Replicate AI Research | OpenAI April 2, 2025 PaperBench Evaluating AI’s Ability to Replicate AI Research. Read paper View code.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality in the data-business radar.","semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","text":"OpenAI published PaperBench: Evaluating AI’s Ability to Replicate AI Research."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"is classified as","object":"writing signal","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research is classified as writing signal."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"belongs to","object":"talking desk","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research belongs to talking desk."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"has evidence coverage","object":"1 captured evidence page","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research has evidence coverage 1 captured evidence page."},{"subject":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","predicate":"matches data-business lanes","object":"Evals and quality","text":"PaperBench: Evaluating AI’s Ability to Replicate AI Research matches data-business lanes Evals and quality."}]},"signal":{"id":"54d3e07e-bd83-49da-8e99-b6d6faba324e","url":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e","json_url":"https://onlylabs.fyi/signals/54d3e07e-bd83-49da-8e99-b6d6faba324e/signal.json","source_url":"https://openai.com/index/paperbench","title":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","summary":"OpenAI published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2025-04-02T10:15:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/paperbench"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"}],"score":15,"matched_terms":["eval","benchmark"],"reason":"OpenAI has a writing signal matching evals and quality."}},"primary_evidence_page":{"url":"https://openai.com/index/paperbench","final_url":"https://openai.com/index/paperbench","title":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:46.698+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"PaperBench: Evaluating AI’s Ability to Replicate AI Research | OpenAI April 2, 2025 PaperBench Evaluating AI’s Ability to Replicate AI Research. Read paper View code Share We introduce PaperBench, a benchmark evaluating the ability of AI agents to replicate state-of-the-art AI research. Agents must replicate 20 ICML 2024 Spotlight and Oral papers from scratch, including understanding paper contributions, developing a codebase, and successfully executing experiments. For objective evaluation, we develop rubrics that hierarchically decompose each replication task into smaller sub-tasks with clear grading criteria. In total, PaperBench contains 8,316 individually gradable tasks. Rubrics are co-developed with the author(s) of each ICML paper for accuracy and realism. To enable scalable evaluation, we also develop an LLM-based judge to automatically grade replication attempts against rubrics, and assess our judge’s performance by creating a separate benchmark for judges. We evaluate several frontier models on PaperBench, finding that the best-performing tested agent, Claude 3.5 Sonnet (New) with open-source scaffolding, achieves an average replication score of 21.0%. Finally, we..."},"evidence_pages":[{"url":"https://openai.com/index/paperbench","final_url":"https://openai.com/index/paperbench","title":"PaperBench: Evaluating AI’s Ability to Replicate AI Research","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:46.698+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"PaperBench: Evaluating AI’s Ability to Replicate AI Research | OpenAI April 2, 2025 PaperBench Evaluating AI’s Ability to Replicate AI Research. Read paper View code Share We introduce PaperBench, a benchmark evaluating the ability of AI agents to replicate state-of-the-art AI research. Agents must replicate 20 ICML 2024 Spotlight and Oral papers from scratch, including understanding paper contributions, developing a codebase, and successfully executing experiments. For objective evaluation, we develop rubrics that hierarchically decompose each replication task into smaller sub-tasks with clear grading criteria. In total, PaperBench contains 8,316 individually gradable tasks. Rubrics are co-developed with the author(s) of each ICML paper for accuracy and realism. To enable scalable evaluation, we also develop an LLM-based judge to automatically grade replication attempts against rubrics, and assess our judge’s performance by creating a separate benchmark for judges. We evaluate several frontier models on PaperBench, finding that the best-performing tested agent, Claude 3.5 Sonnet (New) with open-source scaffolding, achieves an average replication score of 21.0%. Finally, we..."}],"related_signals":[{"id":"b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","url":"https://onlylabs.fyi/signals/b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","source_url":"https://openai.com/index/supporting-eu-trustworthy-ai-ecosystem","title":"Supporting Europe’s work in ensuring a trustworthy AI ecosystem ","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T08:00:56.140796+00:00","date_source":"rss.item_date"},{"id":"2638c0a7-b372-409c-ac72-f6d81d6464dc","url":"https://onlylabs.fyi/signals/2638c0a7-b372-409c-ac72-f6d81d6464dc","source_url":"https://openai.com/index/using-codex-to-simulate-black-holes","title":"How an astrophysicist uses Codex to help simulate black holes","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"509ea784-51ec-4ede-855b-5a4d1b27d3be","url":"https://onlylabs.fyi/signals/509ea784-51ec-4ede-855b-5a4d1b27d3be","source_url":"https://openai.com/index/openai-on-oracle-cloud","title":"Access OpenAI models and Codex through your Oracle cloud commitment","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T20:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4f051449-87f2-466e-941e-b5918381a8fe","url":"https://onlylabs.fyi/signals/4f051449-87f2-466e-941e-b5918381a8fe","source_url":"https://openai.com/index/prc-linked-influence-operations-ai-debates","title":"PRC-linked influence operations are targeting AI debates in the US","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T12:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","url":"https://onlylabs.fyi/signals/4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","source_url":"https://openai.com/index/lseg","title":"From data to decisions: how LSEG is scaling trusted AI","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T00:00:00+00:00","first_seen_at":"2026-06-10T09:18:54.26094+00:00","date_source":"rss.item_date"},{"id":"fb16aa7a-c4ef-4859-b514-0839c2f1330d","url":"https://onlylabs.fyi/signals/fb16aa7a-c4ef-4859-b514-0839c2f1330d","source_url":"https://openai.com/index/nextdoor","title":"How engineers at Nextdoor use Codex to build without limits","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-09T12:00:00+00:00","first_seen_at":"2026-06-10T07:01:28.700378+00:00","date_source":"rss.item_date"}]}