{"schema_version":"onlylabs.public_signal.v1","title":"Anthropic Writing: Emergent Misalignment Reward Hacking","description":"Anthropic writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40","json_url":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40/signal.json","generated_at":"2026-06-09T02:20:55.216934+00:00","org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/anthropic","dossier_json_url":"https://onlylabs.fyi/labs/anthropic/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40","signal_json":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40/signal.json","source":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","lab_dossier":"https://onlylabs.fyi/labs/anthropic","lab_dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis":"https://onlylabs.fyi/analysis/anthropic","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":{"radar":"https://onlylabs.fyi/data-radar","radar_json":"https://onlylabs.fyi/data-radar.json","opportunities":"https://onlylabs.fyi/opportunities","opportunities_json":"https://onlylabs.fyi/opportunities.json","lanes":[{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety","json_url":"https://onlylabs.fyi/data-radar/safety/signals.json"}]}},"answer_pack":{"answer":"Anthropic published Emergent Misalignment Reward Hacking. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: From shortcuts to sabotage: natural emergent misalignment from reward hacking \\ Anthropic Alignment From shortcuts to sabotage: natural emergent misalignment from reward.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Safety and policy in the data-business radar.","signal_desk":"talking","source_context":{"source_url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","source_host":"anthropic.com","occurred_at":"2025-11-21T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","context":null},"context_markers":[{"label":"Lab","value":"Anthropic","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"anthropic.com","source":"source"},{"label":"Radar lane","value":"Safety and policy","source":"radar"},{"label":"Matched term","value":"alignment","source":"radar"},{"label":"Watch term","value":"RL environments","source":"evidence"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"},{"label":"Watch term","value":"Safety and alignment","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/emergent-misalignment-reward-hacking"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-09T02:20:55.216934+00:00"},"data_business":{"matches":true,"lanes":[{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety","json_url":"https://onlylabs.fyi/data-radar/safety/signals.json"}],"matched_terms":["alignment"],"score":13,"reason":"Anthropic has a writing signal matching safety and policy."},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40/signal.json","dossier_json":"https://onlylabs.fyi/labs/anthropic/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/anthropic/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/anthropic/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":"https://onlylabs.fyi/data-radar.json","opportunities_json":"https://onlylabs.fyi/opportunities.json"},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which data-business lane explains this signal: Safety and policy?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40/signal.json","required":true},{"label":"source","url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/anthropic/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/anthropic/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":"https://onlylabs.fyi/data-radar.json","required":true}],"expected_output":["one-paragraph source-grounded interpretation","data-business implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze Anthropic's writing signal \"Emergent Misalignment Reward Hacking\" for frontier lab strategy and data-business implications."},"semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Emergent Misalignment Reward Hacking","text":"Anthropic published Emergent Misalignment Reward Hacking."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"is classified as","object":"writing signal","text":"Emergent Misalignment Reward Hacking is classified as writing signal."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"belongs to","object":"talking desk","text":"Emergent Misalignment Reward Hacking belongs to talking desk."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Emergent Misalignment Reward Hacking has evidence coverage 1 captured evidence page."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"matches data-business lanes","object":"Safety and policy","text":"Emergent Misalignment Reward Hacking matches data-business lanes Safety and policy."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has captured page count","object":"1","text":"Emergent Misalignment Reward Hacking has captured page count 1."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has readable page count","object":"1","text":"Emergent Misalignment Reward Hacking has readable page count 1."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has related signal count","object":"6","text":"Emergent Misalignment Reward Hacking has related signal count 6."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"Emergent Misalignment Reward Hacking has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has source host","object":"anthropic.com","text":"Emergent Misalignment Reward Hacking has source host anthropic.com."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has lab","object":"Anthropic","text":"Emergent Misalignment Reward Hacking has lab Anthropic."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has signal desk","object":"talking","text":"Emergent Misalignment Reward Hacking has signal desk talking."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has source host","object":"anthropic.com","text":"Emergent Misalignment Reward Hacking has source host anthropic.com."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has radar lane","object":"Safety and policy","text":"Emergent Misalignment Reward Hacking has radar lane Safety and policy."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has matched term","object":"alignment","text":"Emergent Misalignment Reward Hacking has matched term alignment."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has watch term","object":"RL environments","text":"Emergent Misalignment Reward Hacking has watch term RL environments."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has watch term","object":"Eval methodology","text":"Emergent Misalignment Reward Hacking has watch term Eval methodology."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has watch term","object":"Infrastructure","text":"Emergent Misalignment Reward Hacking has watch term Infrastructure."}]},"intelligence":{"signal_desk":"talking","answer":"Anthropic published Emergent Misalignment Reward Hacking. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: From shortcuts to sabotage: natural emergent misalignment from reward hacking \\ Anthropic Alignment From shortcuts to sabotage: natural emergent misalignment from reward.... onlylabs links this event to 1 captured evidence page and 6 related writing signals. It also maps to Safety and policy in the data-business radar.","semantic_triples":[{"subject":"Anthropic","predicate":"published","object":"Emergent Misalignment Reward Hacking","text":"Anthropic published Emergent Misalignment Reward Hacking."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"is classified as","object":"writing signal","text":"Emergent Misalignment Reward Hacking is classified as writing signal."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"belongs to","object":"talking desk","text":"Emergent Misalignment Reward Hacking belongs to talking desk."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"has evidence coverage","object":"1 captured evidence page","text":"Emergent Misalignment Reward Hacking has evidence coverage 1 captured evidence page."},{"subject":"Emergent Misalignment Reward Hacking","predicate":"matches data-business lanes","object":"Safety and policy","text":"Emergent Misalignment Reward Hacking matches data-business lanes Safety and policy."}]},"signal":{"id":"2c220fe3-b486-4357-b0d4-0181ed3a1d40","url":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40","json_url":"https://onlylabs.fyi/signals/2c220fe3-b486-4357-b0d4-0181ed3a1d40/signal.json","source_url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","title":"Emergent Misalignment Reward Hacking","summary":"Anthropic published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2025-11-21T00:00:00.000Z","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"page.visible_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["plain"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://www.anthropic.com/research/emergent-misalignment-reward-hacking"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":{"lanes":[{"key":"safety","label":"Safety and policy","url":"https://onlylabs.fyi/data-radar/safety"}],"score":13,"matched_terms":["alignment"],"reason":"Anthropic has a writing signal matching safety and policy."}},"primary_evidence_page":{"url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","final_url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","title":"Emergent Misalignment Reward Hacking","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-09T02:20:55.216934+00:00","bytes":152210,"raw_path":"7a21b9c5237a8a163b1efeb6dea1859200905ce254d1b71065554fa75a836d73.html","content_hash":"067c32d89040c465c4c8e0da0ebb64e51de2074de2f4e4a9f765600469d0d456","excerpt_chars":1200,"truncated":true,"excerpt":"From shortcuts to sabotage: natural emergent misalignment from reward hacking \\ Anthropic Alignment From shortcuts to sabotage: natural emergent misalignment from reward hacking Nov 21, 2025 Read the paper In the latest research from Anthropic’s alignment team, we show for the first time that realistic AI training processes can accidentally produce misaligned models 1 . In Shakespeare’s King Lear , the character of Edmund commits a range of villainous acts: he forges letters, frames his brother, betrays his father, and eventually goes as far as having innocent people killed. He begins this campaign of evil acts after railing against how he’s been labelled. Because he was an illegitimate child, he’s seen as “base” (“ Why brand they us… with baseness? ”). “ Well, then ”, he says: if society is labelling him this way, he might as well play up to the stereotype. His self-concept is of a “base”, evil person. So why not truly be evil? In our latest research , we find that a similar mechanism is at play in large language models. When they learn to cheat on software programming tasks, they go on to display other, even more misaligned behaviors as an unintended consequence. These include..."},"evidence_pages":[{"url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","final_url":"https://www.anthropic.com/research/emergent-misalignment-reward-hacking","title":"Emergent Misalignment Reward Hacking","http_status":200,"content_type":"text/html; charset=utf-8","capture_method":"plain","fetched_at":"2026-06-09T02:20:55.216934+00:00","bytes":152210,"raw_path":"7a21b9c5237a8a163b1efeb6dea1859200905ce254d1b71065554fa75a836d73.html","content_hash":"067c32d89040c465c4c8e0da0ebb64e51de2074de2f4e4a9f765600469d0d456","excerpt_chars":1200,"truncated":true,"excerpt":"From shortcuts to sabotage: natural emergent misalignment from reward hacking \\ Anthropic Alignment From shortcuts to sabotage: natural emergent misalignment from reward hacking Nov 21, 2025 Read the paper In the latest research from Anthropic’s alignment team, we show for the first time that realistic AI training processes can accidentally produce misaligned models 1 . In Shakespeare’s King Lear , the character of Edmund commits a range of villainous acts: he forges letters, frames his brother, betrays his father, and eventually goes as far as having innocent people killed. He begins this campaign of evil acts after railing against how he’s been labelled. Because he was an illegitimate child, he’s seen as “base” (“ Why brand they us… with baseness? ”). “ Well, then ”, he says: if society is labelling him this way, he might as well play up to the stereotype. His self-concept is of a “base”, evil person. So why not truly be evil? In our latest research , we find that a similar mechanism is at play in large language models. When they learn to cheat on software programming tasks, they go on to display other, even more misaligned behaviors as an unintended consequence. These include..."}],"related_signals":[{"id":"6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","url":"https://onlylabs.fyi/signals/6c78c028-3ab4-4b33-86f7-d86c8ba9e3ba","source_url":"https://www.anthropic.com/research/agents-in-biology","title":"Agents In Biology","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-10T15:16:01+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"2648db51-9d6a-42a9-aece-a0ca5f9ce64f","url":"https://onlylabs.fyi/signals/2648db51-9d6a-42a9-aece-a0ca5f9ce64f","source_url":"https://www.anthropic.com/news/claude-fable-5-mythos-5","title":"Claude Fable 5 Mythos 5","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-09T20:27:50+00:00","first_seen_at":"2026-06-10T07:01:05.666054+00:00","date_source":"sitemap.lastmod"},{"id":"8475487f-45b4-4689-9bc5-8e4c6ca0457d","url":"https://onlylabs.fyi/signals/8475487f-45b4-4689-9bc5-8e4c6ca0457d","source_url":"https://www.anthropic.com/engineering/how-we-contain-claude","title":"How We Contain Claude","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-06T00:28:16+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","url":"https://onlylabs.fyi/signals/e4fbfcdd-15b4-41b9-b011-fd83e7068ae9","source_url":"https://www.anthropic.com/research/making-claude-a-chemist","title":"Making Claude A Chemist","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T20:13:40+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"cc62deba-9682-4751-aa6b-81c3bd7122a0","url":"https://onlylabs.fyi/signals/cc62deba-9682-4751-aa6b-81c3bd7122a0","source_url":"https://www.anthropic.com/research/measuring-agent-autonomy","title":"Measuring Agent Autonomy","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:49:18+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"},{"id":"93da14fd-7141-4e17-abd6-1c8d52435c70","url":"https://onlylabs.fyi/signals/93da14fd-7141-4e17-abd6-1c8d52435c70","source_url":"https://www.anthropic.com/research/values-wild","title":"Values Wild","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"anthropic","name":"Anthropic","category":"frontier-lab"},"occurred_at":"2026-06-05T15:38:54+00:00","first_seen_at":"2026-06-09T02:17:26.339488+00:00","date_source":"sitemap.lastmod"}]}