{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: Probes Catch Sleeper Agents","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0","json_url":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0/signal.json","generated_at":"2026-06-11T04:18:59.19721+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0","signal_json":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0/signal.json","source":"https://www.anthropic.com/research/probes-catch-sleeper-agents","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":null},"answer_pack":{"answer":"Anthropic published Probes Catch Sleeper Agents. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Simple probes can catch sleeper agents \\ Anthropic Alignment Interpretability Simple probes can catch sleeper agents Apr 23, 2024 This “Alignment Note” presents some.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","source_host":"anthropic.com","occurred_at":"2024-04-23T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"Watch term","value":"Data pipeline","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/probes-catch-sleeper-agents"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T04:18:59.19721+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"Probes Catch Sleeper Agents\" for frontier lab strategy."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Probes Catch Sleeper Agents","text":"Anthropic published Probes Catch Sleeper Agents."},{"subject":"Probes Catch Sleeper Agents","predicate":"is classified as","object":"writing signal","text":"Probes Catch Sleeper Agents is classified as writing signal."},{"subject":"Probes Catch Sleeper Agents","predicate":"belongs to","object":"talking desk","text":"Probes Catch Sleeper Agents belongs to talking desk."},{"subject":"Probes Catch Sleeper Agents","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Probes Catch Sleeper Agents has evidence coverage 1 captured evidence page."},{"subject":"Probes Catch Sleeper Agents","predicate":"has captured page count","object":"1","text":"Probes Catch Sleeper Agents has captured page count 1."},{"subject":"Probes Catch Sleeper Agents","predicate":"has readable page count","object":"1","text":"Probes Catch Sleeper Agents has readable page count 1."},{"subject":"Probes Catch Sleeper Agents","predicate":"has related signal count","object":"6","text":"Probes Catch Sleeper Agents has related signal count 6."},{"subject":"Probes Catch Sleeper Agents","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Probes Catch Sleeper Agents has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Probes Catch Sleeper Agents","predicate":"has source host","object":"anthropic.com","text":"Probes Catch Sleeper Agents has source host anthropic.com."},{"subject":"Probes Catch Sleeper Agents","predicate":"has lab","object":"Anthropic","text":"Probes Catch Sleeper Agents has lab Anthropic."},{"subject":"Probes Catch Sleeper Agents","predicate":"has signal desk","object":"talking","text":"Probes Catch Sleeper Agents has signal desk talking."},{"subject":"Probes Catch Sleeper Agents","predicate":"has source host","object":"anthropic.com","text":"Probes Catch Sleeper Agents has source host anthropic.com."},{"subject":"Probes Catch Sleeper Agents","predicate":"has watch term","object":"Data pipeline","text":"Probes Catch Sleeper Agents has watch term Data pipeline."},{"subject":"Probes Catch Sleeper Agents","predicate":"has watch term","object":"Infrastructure","text":"Probes Catch Sleeper Agents has watch term Infrastructure."},{"subject":"Probes Catch Sleeper Agents","predicate":"has watch term","object":"Safety and alignment","text":"Probes Catch Sleeper Agents has watch term Safety and alignment."},{"subject":"Probes Catch Sleeper Agents","predicate":"has watch term","object":"Agents and tool use","text":"Probes Catch Sleeper Agents has watch term Agents and tool use."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published Probes Catch Sleeper Agents. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Simple probes can catch sleeper agents \\ Anthropic Alignment Interpretability Simple probes can catch sleeper agents Apr 23, 2024 This “Alignment Note” presents some.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Probes Catch Sleeper Agents","text":"Anthropic published Probes Catch Sleeper Agents."},{"subject":"Probes Catch Sleeper Agents","predicate":"is classified as","object":"writing signal","text":"Probes Catch Sleeper Agents is classified as writing signal."},{"subject":"Probes Catch Sleeper Agents","predicate":"belongs to","object":"talking desk","text":"Probes Catch Sleeper Agents belongs to talking desk."},{"subject":"Probes Catch Sleeper Agents","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Probes Catch Sleeper Agents has evidence coverage 1 captured evidence page."}]},"signal":{"id":"278868a8-59e0-490c-9d66-a5abb21e0ae0","url":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0","json_url":"https://onlylabs.fyi/signals/278868a8-59e0-490c-9d66-a5abb21e0ae0/signal.json","source_url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","title":"Probes Catch Sleeper Agents","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2024-04-23T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/probes-catch-sleeper-agents"]},"facets":{},"traction":{"github_stars":null,"hn_points":33,"hn_comments":0,"hn_story_id":"40138676","hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","final_url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","title":"Probes Catch Sleeper Agents","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:59.19721+00:00","bytes":202146,"raw_path":"72c1254d07071bf7974530e418a5c5d4c190bee9ded1ae9c16d498ba282f62ae.html","content_hash":"807c4e984d9999bd46f63d3ea31d2d78f79212fda74063a79bc673b2430d38e6","excerpt_chars":1200,"truncated":true,"excerpt":"Simple probes can catch sleeper agents \\ Anthropic Alignment Interpretability Simple probes can catch sleeper agents Apr 23, 2024 This “Alignment Note” presents some early-stage research from the Anthropic Alignment Science team following up on our recent “ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training ” paper. It should be treated as a work-in-progress update, and is intended for a more technical audience than our typical blog post. This research makes use of some simple interpretability techniques, and we expect to share more results from collaborations between our Alignment and Interpretability teams soon. Summary In this post we present “defection probes”: linear classifiers that use residual stream activations to predict when a sleeper agent trojan model will choose to “defect” and behave in accordance with a dangerous hidden goal. Using the models we trained in “ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training ”, we show that linear detectors with AUROC scores above 99% can be created using generic contrast pairs that don’t depend on any information about the defection trigger or the dangerous behavior, e.g. “Human:..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","final_url":"https://www.anthropic.com/research/probes-catch-sleeper-agents","title":"Probes Catch Sleeper Agents","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:59.19721+00:00","bytes":202146,"raw_path":"72c1254d07071bf7974530e418a5c5d4c190bee9ded1ae9c16d498ba282f62ae.html","content_hash":"807c4e984d9999bd46f63d3ea31d2d78f79212fda74063a79bc673b2430d38e6","excerpt_chars":1200,"truncated":true,"excerpt":"Simple probes can catch sleeper agents \\ Anthropic Alignment Interpretability Simple probes can catch sleeper agents Apr 23, 2024 This “Alignment Note” presents some early-stage research from the Anthropic Alignment Science team following up on our recent “ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training ” paper. It should be treated as a work-in-progress update, and is intended for a more technical audience than our typical blog post. This research makes use of some simple interpretability techniques, and we expect to share more results from collaborations between our Alignment and Interpretability teams soon. Summary In this post we present “defection probes”: linear classifiers that use residual stream activations to predict when a sleeper agent trojan model will choose to “defect” and behave in accordance with a dangerous hidden goal. Using the models we trained in “ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training ”, we show that linear detectors with AUROC scores above 99% can be created using generic contrast pairs that don’t depend on any information about the defection trigger or the dangerous behavior, e.g. “Human:..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}