{"schema_version":"onlylabs.public_signal.v1","title":"OpenAI Writing: TruthfulQA: Measuring how models mimic human falsehoods","description":"OpenAI writing signal with public source context, captured evidence pages, related signals, and data-business radar classification.","url":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2","json_url":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2/signal.json","generated_at":"2026-06-08T15:46:51.025+00:00","org":{"slug":"openai","name":"OpenAI","category":"frontier-lab","category_label":"Frontier lab","dossier_url":"https://onlylabs.fyi/labs/openai","dossier_json_url":"https://onlylabs.fyi/labs/openai/dossier.json"},"related_urls":{"signal":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2","signal_json":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2/signal.json","source":"https://openai.com/index/truthfulqa","lab_dossier":"https://onlylabs.fyi/labs/openai","lab_dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis":"https://onlylabs.fyi/analysis/openai","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","category":"https://onlylabs.fyi/frontier","category_json":"https://onlylabs.fyi/frontier.json","category_feed":"https://onlylabs.fyi/frontier/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","topic":"https://onlylabs.fyi/topics/talking","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","data_business":null},"answer_pack":{"answer":"OpenAI published TruthfulQA: Measuring how models mimic human falsehoods. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: TruthfulQA: Measuring how models mimic human falsehoods | OpenAI September 8, 2021 TruthfulQA: Measuring how models mimic human falsehoods Loading… Share Abstract We.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","signal_desk":"talking","source_context":{"source_url":"https://openai.com/index/truthfulqa","source_host":"openai.com","occurred_at":"2021-09-08T07:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","context":null},"context_markers":[{"label":"Lab","value":"OpenAI","source":"signal"},{"label":"Signal desk","value":"talking","source":"signal"},{"label":"Source host","value":"openai.com","source":"source"},{"label":"Watch term","value":"Eval methodology","source":"evidence"},{"label":"Watch term","value":"Infrastructure","source":"evidence"}],"evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/truthfulqa"],"related_signals":6,"has_source_url":true,"latest_page_fetched_at":"2026-06-08T15:46:51.025+00:00"},"data_business":{"matches":false,"lanes":[],"matched_terms":[],"score":null,"reason":null},"agent_handoff":{"signal_json":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2/signal.json","dossier_json":"https://onlylabs.fyi/labs/openai/dossier.json","analysis_json":"https://onlylabs.fyi/analysis/openai/analysis.json","analysis_evidence_json":"https://onlylabs.fyi/analysis/openai/evidence.json","topic_signals_json":"https://onlylabs.fyi/topics/talking/signals.json","topic_feed":"https://onlylabs.fyi/topics/talking/feed.xml","category_signals_json":"https://onlylabs.fyi/signals.json","data_radar_json":null,"opportunities_json":null},"analysis_playbook":{"objective":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","evidence_focus":["post title","source URL","captured page text","HN traction","linked model or paper references","publication date"],"extraction_questions":["Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Which writing reframes a recent release, model, hiring wave, or policy stance?","Which posts mention data, evals, infrastructure, safety, or deployment workflows?"],"signal_questions":["What public theme, launch framing, or research direction does this writing signal expose?","Which themes are labs choosing to explain publicly?","Which posts are attracting outside discussion?","Do the 6 related writing signals show a repeated pattern?"],"output_fields":["org","theme","public_framing","traction","data_business_lane","evidence_url"],"data_business_relevance":"Public writing supplies the narrative layer over raw signals and helps identify which frontier-lab priorities are becoming externally legible.","required_sources":[{"label":"signal_json","url":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2/signal.json","required":true},{"label":"source","url":"https://openai.com/index/truthfulqa","required":true},{"label":"dossier_json","url":"https://onlylabs.fyi/labs/openai/dossier.json","required":true},{"label":"analysis_evidence_json","url":"https://onlylabs.fyi/analysis/openai/evidence.json","required":true},{"label":"topic_signals_json","url":"https://onlylabs.fyi/topics/talking/signals.json","required":false},{"label":"data_radar_json","url":null,"required":false}],"expected_output":["one-paragraph source-grounded interpretation","category-specific implication","confidence and missing evidence","recommended next source to inspect"],"prompt_seed":"Using only the linked onlylabs JSON, captured source context, and cited evidence, analyze OpenAI's writing signal \"TruthfulQA: Measuring how models mimic human falsehoods\" for frontier lab strategy."},"semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"TruthfulQA: Measuring how models mimic human falsehoods","text":"OpenAI published TruthfulQA: Measuring how models mimic human falsehoods."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"is classified as","object":"writing signal","text":"TruthfulQA: Measuring how models mimic human falsehoods is classified as writing signal."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"belongs to","object":"talking desk","text":"TruthfulQA: Measuring how models mimic human falsehoods belongs to talking desk."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has evidence coverage","object":"1 captured evidence page","text":"TruthfulQA: Measuring how models mimic human falsehoods has evidence coverage 1 captured evidence page."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has captured page count","object":"1","text":"TruthfulQA: Measuring how models mimic human falsehoods has captured page count 1."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has readable page count","object":"1","text":"TruthfulQA: Measuring how models mimic human falsehoods has readable page count 1."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has related signal count","object":"6","text":"TruthfulQA: Measuring how models mimic human falsehoods has related signal count 6."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has analysis playbook objective","object":"Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.","text":"TruthfulQA: Measuring how models mimic human falsehoods has analysis playbook objective Turn public writing and discussion into a readable map of research themes, product framing, policy posture, launch narratives, and market attention.."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has source host","object":"openai.com","text":"TruthfulQA: Measuring how models mimic human falsehoods has source host openai.com."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has lab","object":"OpenAI","text":"TruthfulQA: Measuring how models mimic human falsehoods has lab OpenAI."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has signal desk","object":"talking","text":"TruthfulQA: Measuring how models mimic human falsehoods has signal desk talking."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has source host","object":"openai.com","text":"TruthfulQA: Measuring how models mimic human falsehoods has source host openai.com."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has watch term","object":"Eval methodology","text":"TruthfulQA: Measuring how models mimic human falsehoods has watch term Eval methodology."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has watch term","object":"Infrastructure","text":"TruthfulQA: Measuring how models mimic human falsehoods has watch term Infrastructure."}]},"intelligence":{"signal_desk":"talking","answer":"OpenAI published TruthfulQA: Measuring how models mimic human falsehoods. This talking signal gives public context for research themes, product direction, policy, or launch framing. High-signal details: TruthfulQA: Measuring how models mimic human falsehoods | OpenAI September 8, 2021 TruthfulQA: Measuring how models mimic human falsehoods Loading… Share Abstract We.... onlylabs links this event to 1 captured evidence page and 6 related writing signals.","semantic_triples":[{"subject":"OpenAI","predicate":"published","object":"TruthfulQA: Measuring how models mimic human falsehoods","text":"OpenAI published TruthfulQA: Measuring how models mimic human falsehoods."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"is classified as","object":"writing signal","text":"TruthfulQA: Measuring how models mimic human falsehoods is classified as writing signal."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"belongs to","object":"talking desk","text":"TruthfulQA: Measuring how models mimic human falsehoods belongs to talking desk."},{"subject":"TruthfulQA: Measuring how models mimic human falsehoods","predicate":"has evidence coverage","object":"1 captured evidence page","text":"TruthfulQA: Measuring how models mimic human falsehoods has evidence coverage 1 captured evidence page."}]},"signal":{"id":"0ea468de-6591-4993-92ae-6ebcc35790c2","url":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2","json_url":"https://onlylabs.fyi/signals/0ea468de-6591-4993-92ae-6ebcc35790c2/signal.json","source_url":"https://openai.com/index/truthfulqa","title":"TruthfulQA: Measuring how models mimic human falsehoods","summary":"OpenAI published a writing signal. onlylabs watches public writing for research themes, product direction, and model-launch context.","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2021-09-08T07:00:00+00:00","first_seen_at":"2026-06-05T05:42:57.832854+00:00","date_source":"rss.item_date","evidence_coverage":{"target_pages":1,"captured_pages":1,"readable_pages":1,"capture_methods":["exa"],"missing_page_urls":[],"failed_page_urls":[],"blocked_page_urls":[],"page_urls":["https://openai.com/index/truthfulqa"]},"facets":{},"traction":{"github_stars":null,"hn_points":null,"hn_comments":null,"hn_story_id":null,"hf_downloads":null,"hf_likes":null},"data_radar":null},"primary_evidence_page":{"url":"https://openai.com/index/truthfulqa","final_url":"https://openai.com/index/truthfulqa","title":"TruthfulQA: Measuring how models mimic human falsehoods","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:46:51.025+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"TruthfulQA: Measuring how models mimic human falsehoods | OpenAI September 8, 2021 TruthfulQA: Measuring how models mimic human falsehoods Loading… Share Abstract We propose a benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics. We crafted questions that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. We tested GPT‑3, GPT‑Neo/J, GPT‑2 and a T5-based model. The best model was truthful on 58% of questions, while human performance was 94%. Models generated many false answers that mimic popular misconceptions and have the potential to deceive humans. The largest models were generally the least truthful. This contrasts with other NLP tasks, where performance improves with model size. However, this result is expected if false answers are learned from the training distribution. We suggest that scaling up models alone is less promising for improving truthfulness than fine-tuning using training objectives other than..."},"evidence_pages":[{"url":"https://openai.com/index/truthfulqa","final_url":"https://openai.com/index/truthfulqa","title":"TruthfulQA: Measuring how models mimic human falsehoods","http_status":200,"content_type":null,"capture_method":"exa","fetched_at":"2026-06-08T15:46:51.025+00:00","bytes":null,"raw_path":null,"content_hash":null,"excerpt_chars":1200,"truncated":true,"excerpt":"TruthfulQA: Measuring how models mimic human falsehoods | OpenAI September 8, 2021 TruthfulQA: Measuring how models mimic human falsehoods Loading… Share Abstract We propose a benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics. We crafted questions that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. We tested GPT‑3, GPT‑Neo/J, GPT‑2 and a T5-based model. The best model was truthful on 58% of questions, while human performance was 94%. Models generated many false answers that mimic popular misconceptions and have the potential to deceive humans. The largest models were generally the least truthful. This contrasts with other NLP tasks, where performance improves with model size. However, this result is expected if false answers are learned from the training distribution. We suggest that scaling up models alone is less promising for improving truthfulness than fine-tuning using training objectives other than..."}],"related_signals":[{"id":"b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","url":"https://onlylabs.fyi/signals/b3668d3b-26d2-40c0-9d4f-ed1a67927aa4","source_url":"https://openai.com/index/supporting-eu-trustworthy-ai-ecosystem","title":"Supporting Europe’s work in ensuring a trustworthy AI ecosystem ","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T08:00:56.140796+00:00","date_source":"rss.item_date"},{"id":"2638c0a7-b372-409c-ac72-f6d81d6464dc","url":"https://onlylabs.fyi/signals/2638c0a7-b372-409c-ac72-f6d81d6464dc","source_url":"https://openai.com/index/using-codex-to-simulate-black-holes","title":"How an astrophysicist uses Codex to help simulate black holes","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-11T00:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"509ea784-51ec-4ede-855b-5a4d1b27d3be","url":"https://onlylabs.fyi/signals/509ea784-51ec-4ede-855b-5a4d1b27d3be","source_url":"https://openai.com/index/openai-on-oracle-cloud","title":"Access OpenAI models and Codex through your Oracle cloud commitment","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T20:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4f051449-87f2-466e-941e-b5918381a8fe","url":"https://onlylabs.fyi/signals/4f051449-87f2-466e-941e-b5918381a8fe","source_url":"https://openai.com/index/prc-linked-influence-operations-ai-debates","title":"PRC-linked influence operations are targeting AI debates in the US","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T12:00:00+00:00","first_seen_at":"2026-06-11T07:01:16.936464+00:00","date_source":"rss.item_date"},{"id":"4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","url":"https://onlylabs.fyi/signals/4507c0c1-cb74-4bb3-b62b-5f6c2d37e20d","source_url":"https://openai.com/index/lseg","title":"From data to decisions: how LSEG is scaling trusted AI","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-10T00:00:00+00:00","first_seen_at":"2026-06-10T09:18:54.26094+00:00","date_source":"rss.item_date"},{"id":"fb16aa7a-c4ef-4859-b514-0839c2f1330d","url":"https://onlylabs.fyi/signals/fb16aa7a-c4ef-4859-b514-0839c2f1330d","source_url":"https://openai.com/index/nextdoor","title":"How engineers at Nextdoor use Codex to build without limits","context":null,"kind":{"key":"post_published","label":"Writing"},"org":{"slug":"openai","name":"OpenAI","category":"frontier-lab"},"occurred_at":"2026-06-09T12:00:00+00:00","first_seen_at":"2026-06-10T07:01:28.700378+00:00","date_source":"rss.item_date"}]}