{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac","json_url":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac/signal.json","generated_at":"2026-06-11T04:18:41.436014+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac","signal_json":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac/signal.json","source":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure","json_url":"https://onlylabs.fyi/data-radar/infrastructure/signals.json"},{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety","json_url":"https://onlylabs.fyi/data-radar/safety/signals.json"}]}},"answer_pack":{"answer":"Anthropic published Sleeper Agents Training Deceptive Llms That Persist Through Safety Training. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training \\ Anthropic Alignment Research Sleeper Agents: Training Deceptive LLMs that Persist Through.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Infrastructure, Safety and policy in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","source_host":"anthropic.com","occurred_at":"2024-01-14T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"Radar lane","value":"Infrastructure","source":"radar"},{"label":"Radar lane","value":"Safety and policy","source":"radar"},{"label":"Matched term","value":"training","source":"radar"},{"label":"Matched term","value":"safety","source":"radar"},{"label":"Watch term","value":"RL environments","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"},{"label":"Watch term","value":"Agents and tool use","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-11T04:18:41.436014+00:00"},"data_business":{"matches":true,"lanes":[{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure","json_url":"https://onlylabs.fyi/data-radar/infrastructure/signals.json"},{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety","json_url":"https://onlylabs.fyi/data-radar/safety/signals.json"}],"matched_terms":["training","safety"],"score":25,"reason":"Anthropic has a writing signal matching infrastructure, safety and policy."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Infrastructure, Safety and policy?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","text":"Anthropic published Sleeper Agents Training Deceptive Llms That Persist Through Safety Training."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"is classified as","object":"writing signal","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training is classified as writing signal."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"belongs to","object":"talking desk","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training belongs to talking desk."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has evidence coverage 1 captured evidence page."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"matches data-business lanes","object":"Infrastructure, Safety and policy","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training matches data-business lanes Infrastructure, Safety and policy."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has captured page count","object":"1","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has captured page count 1."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has readable page count","object":"1","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has readable page count 1."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has related signal count","object":"6","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has related signal count 6."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has source host","object":"anthropic.com","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has source host anthropic.com."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has lab","object":"Anthropic","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has lab Anthropic."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has signal desk","object":"talking","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has signal desk talking."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has source host","object":"anthropic.com","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has source host anthropic.com."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has radar lane","object":"Infrastructure","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has radar lane Infrastructure."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has radar lane","object":"Safety and policy","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has radar lane Safety and policy."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has matched term","object":"training","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has matched term training."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has matched term","object":"safety","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has matched term safety."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has watch term","object":"RL environments","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has watch term RL environments."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published Sleeper Agents Training Deceptive Llms That Persist Through Safety Training. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training \\ Anthropic Alignment Research Sleeper Agents: Training Deceptive LLMs that Persist Through.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Infrastructure, Safety and policy in the data-business radar.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","text":"Anthropic published Sleeper Agents Training Deceptive Llms That Persist Through Safety Training."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"is classified as","object":"writing signal","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training is classified as writing signal."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"belongs to","object":"talking desk","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training belongs to talking desk."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training has evidence coverage 1 captured evidence page."},{"subject":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","predicate":"matches data-business lanes","object":"Infrastructure, Safety and policy","text":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training matches data-business lanes Infrastructure, Safety and policy."}]},"signal":{"id":"82ba34ed-f85a-4644-af61-05c78f392fac","url":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac","json_url":"https://onlylabs.fyi/signals/82ba34ed-f85a-4644-af61-05c78f392fac/signal.json","source_url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","title":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2024-01-14T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"infrastructure","label":"Infrastructure","url":"https://onlylabs.fyi/data-radar/infrastructure"},{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety"}],"score":25,"matched_terms":["training","safety"],"reason":"Anthropic has a writing signal matching infrastructure, safety and policy."}},"primary_evidence_page":{"url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","final_url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","title":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:41.436014+00:00","bytes":107087,"raw_path":"83b187f91a7c6b88c2325db74d3145981df209cca4873dab9cd0b60934fe2905.html","content_hash":"cd7afa45d7f7e92dbdcdf5cd711f6d2903ee80e6ea5d20ca6771682a1930b896","excerpt_chars":1200,"truncated":true,"excerpt":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training \\ Anthropic Alignment Research Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training Jan 14, 2024 Read Paper Humans are capable of strategically deceptive behavior: behaving helpfully in most situations, but then behaving very differently in order to pursue alternative objectives when given the opportunity. If an AI system learned such a deceptive strategy, could we detect it and remove it using current state-of-the-art safety training techniques? To study this question, we construct proof-of-concept examples of deceptive behavior in large language models (LLMs). For example, we train models that write secure code when the prompt states that the year is 2023, but insert exploitable code when the stated year is 2024. We find that such backdoor behavior can be made persistent, so that it is not removed by standard safety training techniques, including supervised fine-tuning, reinforcement learning, and adversarial training (eliciting unsafe behavior and then training to remove it). The backdoor behavior is most persistent in the largest models and in models trained to produce..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","final_url":"https://www.anthropic.com/research/sleeper-agents-training-deceptive-llms-that-persist-through-safety-training","title":"Sleeper Agents Training Deceptive Llms That Persist Through Safety Training","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-11T04:18:41.436014+00:00","bytes":107087,"raw_path":"83b187f91a7c6b88c2325db74d3145981df209cca4873dab9cd0b60934fe2905.html","content_hash":"cd7afa45d7f7e92dbdcdf5cd711f6d2903ee80e6ea5d20ca6771682a1930b896","excerpt_chars":1200,"truncated":true,"excerpt":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training \\ Anthropic Alignment Research Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training Jan 14, 2024 Read Paper Humans are capable of strategically deceptive behavior: behaving helpfully in most situations, but then behaving very differently in order to pursue alternative objectives when given the opportunity. If an AI system learned such a deceptive strategy, could we detect it and remove it using current state-of-the-art safety training techniques? To study this question, we construct proof-of-concept examples of deceptive behavior in large language models (LLMs). For example, we train models that write secure code when the prompt states that the year is 2023, but insert exploitable code when the stated year is 2024. We find that such backdoor behavior can be made persistent, so that it is not removed by standard safety training techniques, including supervised fine-tuning, reinforcement learning, and adversarial training (eliciting unsafe behavior and then training to remove it). The backdoor behavior is most persistent in the largest models and in models trained to produce..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}