{"schema_version":"onlylabs.public_signal.v1","title":"OpenAI Writing: Why we no longer evaluate SWE-bench Verified","description":"OpenAI writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61","json_url":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61/signal.json","generated_at":"2026-06-08T15:45:21.306+00:00","org":{"slug":"openai","name":"OpenAI","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/openai","dossier_json_url":"https://onlylabs.fyi/labs/openai/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61","signal_json":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61/signal.json","source":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","lab_dossier":"https://onlylabs.fyi/labs/openai","lab_dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis":"https://onlylabs.fyi/analysis/openai","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"},{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure","json_url":"https://onlylabs.fyi/data-radar/infrastructure/signals.json"}]}},"answer_pack":{"answer":"OpenAI published Why we no longer evaluate SWE-bench Verified. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: OpenAI's substantive post with high HN traction · Why SWE-bench Verified no longer measures frontier coding capabilities | OpenAI February 23, 2026 Why SWE-bench Verified no longer measures frontier coding capabilities.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality, Infrastructure in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","source_host":"openai.com","occurred_at":"2026-02-23T11:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"OpenAI","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"openai.com","source":"source"},{"label":"Notability","value":"OpenAI's substantive post with high HN traction","source":"signal"},{"label":"Radar lane","value":"Evals and quality","source":"radar"},{"label":"Radar lane","value":"Infrastructure","source":"radar"},{"label":"Matched term","value":"eval","source":"radar"},{"label":"Matched term","value":"training","source":"radar"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-08T15:45:21.306+00:00"},"data_business":{"matches":true,"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals","json_url":"https://onlylabs.fyi/data-radar/evals/signals.json"},{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure","json_url":"https://onlylabs.fyi/data-radar/infrastructure/signals.json"}],"matched_terms":["eval","training"],"score":30,"reason":"OpenAI has a writing signal matching evals and quality, infrastructure."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61/signal.json","dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Evals and quality, Infrastructure?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61/signal.json","required":true},{"label":"source","url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/openai/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/openai/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze OpenAI's writing signal \"Why we no longer evaluate SWE-bench Verified\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"Why we no longer evaluate SWE-bench Verified","text":"OpenAI published Why we no longer evaluate SWE-bench Verified."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"is classified as","object":"writing signal","text":"Why we no longer evaluate SWE-bench Verified is classified as writing signal."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"belongs to","object":"talking desk","text":"Why we no longer evaluate SWE-bench Verified belongs to talking desk."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Why we no longer evaluate SWE-bench Verified has evidence coverage 1 captured evidence page."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"matches data-business lanes","object":"Evals and quality, Infrastructure","text":"Why we no longer evaluate SWE-bench Verified matches data-business lanes Evals and quality, Infrastructure."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has captured page count","object":"1","text":"Why we no longer evaluate SWE-bench Verified has captured page count 1."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has readable page count","object":"1","text":"Why we no longer evaluate SWE-bench Verified has readable page count 1."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has related signal count","object":"6","text":"Why we no longer evaluate SWE-bench Verified has related signal count 6."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Why we no longer evaluate SWE-bench Verified has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has source host","object":"openai.com","text":"Why we no longer evaluate SWE-bench Verified has source host openai.com."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has lab","object":"OpenAI","text":"Why we no longer evaluate SWE-bench Verified has lab OpenAI."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has signal desk","object":"talking","text":"Why we no longer evaluate SWE-bench Verified has signal desk talking."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has source host","object":"openai.com","text":"Why we no longer evaluate SWE-bench Verified has source host openai.com."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has notability","object":"OpenAI's substantive post with high HN traction","text":"Why we no longer evaluate SWE-bench Verified has notability OpenAI's substantive post with high HN traction."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has radar lane","object":"Evals and quality","text":"Why we no longer evaluate SWE-bench Verified has radar lane Evals and quality."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has radar lane","object":"Infrastructure","text":"Why we no longer evaluate SWE-bench Verified has radar lane Infrastructure."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has matched term","object":"eval","text":"Why we no longer evaluate SWE-bench Verified has matched term eval."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has matched term","object":"training","text":"Why we no longer evaluate SWE-bench Verified has matched term training."}]},"intelligence":{"signal_desk":"talking","answer":"OpenAI published Why we no longer evaluate SWE-bench Verified. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: OpenAI's substantive post with high HN traction · Why SWE-bench Verified no longer measures frontier coding capabilities | OpenAI February 23, 2026 Why SWE-bench Verified no longer measures frontier coding capabilities.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Evals and quality, Infrastructure in the data-business radar.","semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"Why we no longer evaluate SWE-bench Verified","text":"OpenAI published Why we no longer evaluate SWE-bench Verified."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"is classified as","object":"writing signal","text":"Why we no longer evaluate SWE-bench Verified is classified as writing signal."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"belongs to","object":"talking desk","text":"Why we no longer evaluate SWE-bench Verified belongs to talking desk."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Why we no longer evaluate SWE-bench Verified has evidence coverage 1 captured evidence page."},{"subject":"Why we no longer evaluate SWE-bench Verified","predicate":"matches data-business lanes","object":"Evals and quality, Infrastructure","text":"Why we no longer evaluate SWE-bench Verified matches data-business lanes Evals and quality, Infrastructure."}]},"signal":{"id":"125c4f04-45ae-48be-a6ab-7eef7aba2f61","url":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61","json_url":"https://onlylabs.fyi/signals/125c4f04-45ae-48be-a6ab-7eef7aba2f61/signal.json","source_url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","title":"Why we no longer evaluate SWE-bench Verified","summary":"OpenAI published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-02-23T11:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified"]},"facets":{},"traction":{"github_stars":null,"hn_points":343,"hn_comments":181,"hn_story_id":"47910388","hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"evals","label":"Evals and quality","url":"https://onlylabs.fyi/data-radar/evals"},{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure"}],"score":30,"matched_terms":["eval","training"],"reason":"OpenAI has a writing signal matching evals and quality, infrastructure."}},"primary_evidence_page":{"url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","final_url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","title":"Why we no longer evaluate SWE-bench Verified","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:21.306+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"Why SWE-bench Verified no longer measures frontier coding capabilities | OpenAI February 23, 2026 Why SWE-bench Verified no longer measures frontier coding capabilities SWE-bench Verified is increasingly contaminated. We recommend SWE-bench Pro. Loading… Share Since we first published SWE-bench Verified in August 2024, the industry has widely used it to measure the progress of models on autonomous software engineering tasks. After its release, SWE-bench Verified provided a strong signal of capability progress and became a standard metric reported in frontier model releases. Tracking and forecasting progress of these capabilities is also an important part of OpenAI’s Preparedness Framework. When we created the Verified benchmark initially, we attempted to solve issues in the original evaluation that made certain tasks impossible to accomplish in the SWE-bench dataset⁠. After initial leaps, state-of-the-art progress on SWE-bench Verified has slowed, improving⁠ from 74.9% to 80.9% in the last 6 months. This raises the question: do the remaining failures reflect model limitations or properties of the dataset itself? In a new analysis, we found two major issues with the Verified set..."},"evidence_pages":[{"url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","final_url":"https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified","title":"Why we no longer evaluate SWE-bench Verified","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:45:21.306+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"Why SWE-bench Verified no longer measures frontier coding capabilities | OpenAI February 23, 2026 Why SWE-bench Verified no longer measures frontier coding capabilities SWE-bench Verified is increasingly contaminated. We recommend SWE-bench Pro. Loading… Share Since we first published SWE-bench Verified in August 2024, the industry has widely used it to measure the progress of models on autonomous software engineering tasks. After its release, SWE-bench Verified provided a strong signal of capability progress and became a standard metric reported in frontier model releases. Tracking and forecasting progress of these capabilities is also an important part of OpenAI’s Preparedness Framework. When we created the Verified benchmark initially, we attempted to solve issues in the original evaluation that made certain tasks impossible to accomplish in the SWE-bench dataset⁠. After initial leaps, state-of-the-art progress on SWE-bench Verified has slowed, improving⁠ from 74.9% to 80.9% in the last 6 months. This raises the question: do the remaining failures reflect model limitations or properties of the dataset itself? In a new analysis, we found two major issues with the Verified set..."}],"related_signals":[{"id":"b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","url":"https://onlylabs.fyi/signals/b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","source_url":"https://openai.com/index/supporting-eu-trustworthy-ai-ecosystem","title":"Supporting Europe’s work in ensuring a trustworthy AI ecosystem ","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T08:00:56.140796+00:00","date_source":"rss.item_date"},{"id":"2638c0a7-b372-409c-ac72-f6d81d6464dc","url":"https://onlylabs.fyi/signals/2638c0a7-b372-409c-ac72-f6d81d6464dc","source_url":"https://openai.com/index/using-codex-to-simulate-black-holes","title":"How an astrophysicist uses Codex to help simulate black holes","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"509ea784-51ec-4ede-855b-5a4d1b27d3be","url":"https://onlylabs.fyi/signals/509ea784-51ec-4ede-855b-5a4d1b27d3be","source_url":"https://openai.com/index/openai-on-oracle-cloud","title":"Access OpenAI models and Codex through your Oracle cloud commitment","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T20:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4f051449-87f2-466e-941e-b5918381a8fe","url":"https://onlylabs.fyi/signals/4f051449-87f2-466e-941e-b5918381a8fe","source_url":"https://openai.com/index/prc-linked-influence-operations-ai-debates","title":"PRC-linked influence operations are targeting AI debates in the US","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T12:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","url":"https://onlylabs.fyi/signals/4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","source_url":"https://openai.com/index/lseg","title":"From data to decisions: how LSEG is scaling trusted AI","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T00:00:00+00:00","first_seen_at":"2026-06-10T09:18:54.26094+00:00","date_source":"rss.item_date"},{"id":"fb16aa7a-c4ef-4859-b514-0839c2f1330d","url":"https://onlylabs.fyi/signals/fb16aa7a-c4ef-4859-b514-0839c2f1330d","source_url":"https://openai.com/index/nextdoor","title":"How engineers at Nextdoor use Codex to build without limits","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-09T12:00:00+00:00","first_seen_at":"2026-06-10T07:01:28.700378+00:00","date_source":"rss.item_date"}]}