{"@context":"https://neupai.io/schema/v0.2","@type":"StructuredNewsArticle","identity":{"article_id":"tech42_20260511_ai-programming-benchmark-zero-success","canonical_url":"https://www.tech42.co.kr/%ec%84%b8%ea%b3%84-%ec%b5%9c%ea%b3%a0-ai-9%ec%a2%85-%ec%8b%9c%ed%97%98-%eb%b4%a4%eb%8d%94%eb%8b%88200%ea%b0%9c-%ea%b3%bc%ec%a0%9c-%ec%99%84%ec%a0%84-%ec%a0%95%eb%b3%b5-%eb%8b%a8-%ed%95%98/?utm_source=rss&utm_medium=rss&utm_campaign=%25ec%2584%25b8%25ea%25b3%2584-%25ec%25b5%259c%25ea%25b3%25a0-ai-9%25ec%25a2%2585-%25ec%258b%259c%25ed%2597%2598-%25eb%25b4%25a4%25eb%258d%2594%25eb%258b%2588200%25ea%25b0%259c-%25ea%25b3%25bc%25ec%25a0%259c-%25ec%2599%2584%25ec%25a0%2584-%25ec%25a0%2595%25eb%25b3%25b5-%25eb%258b%25a8-%25ed%2595%2598","ai_url":null,"publisher":{"name":"테크42","domain":"www.tech42.co.kr","type":"online"},"author":"정재엽 기자","published_at":"2026-05-11T23:09:14.000Z","updated_at":null,"language":"en","article_type":"analysis","originality":"self_produced"},"content":{"headline":"World's Top 9 AI Models Take Test... Not One Completely Conquered 200 Tasks","summary":"In a ProgramBench test conducted by a joint research team from Meta, Stanford, and Harvard, none of the world's top 9 AI models achieved a 100% success rate on 200 programming tasks. Richard Sutton, founder of reinforcement learning, argued that LLMs are a dead end and that a new AI paradigm that interacts with the world is needed.","topics":["ai","programming","research","technology"],"geography":["US","KR"],"entities":[{"name":"Meta FAIR","canonical_id":"org:us:meta-fair","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"Stanford University","canonical_id":"org:us:stanford-university","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"Harvard University","canonical_id":"org:us:harvard-university","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"Claude Opus","canonical_id":"product:us:claude-opus","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"GPT","canonical_id":"product:us:gpt","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"Gemini Pro","canonical_id":"product:us:gemini-pro","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"Richard Sutton","canonical_id":"person:ca:richard-sutton","type":"person","role_in_article":"quoted","metadata":{"ticker":null,"parent":null}},{"name":"University of Alberta","canonical_id":"org:ca:university-of-alberta","type":"organization","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"Google DeepMind","canonical_id":"org:us:google-deepmind","type":"organization","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"OpenAI","canonical_id":"corp:us:openai","type":"company","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"Ilya Sutskever","canonical_id":"person:us:ilya-sutskever","type":"person","role_in_article":"quoted","metadata":{"ticker":null,"parent":null}}],"claims":[{"id":"c1","statement":"A joint research team from Meta FAIR, Stanford University, and Harvard University published a ProgramBench benchmark paper in May 2026","as_of":"2026-05","as_of_explicit":true,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c2","statement":"The research team selected 200 core programs used in actual field environments, including FFmpeg, SQLite, and PHP interpreter, as tasks","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c3","statement":"The research team deployed 9 of the highest-performing language models currently available","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c4","statement":"The research team verified the AI models' code with a total of 248,853 operational tests","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c5","statement":"Not a single model was able to completely solve any of the 200 tasks","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c6","statement":"Even the best-performing model only managed to pass 95% of tests on just 6 out of 200 tasks","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"May 2026","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c7","statement":"Richard Sutton, University of Alberta professor, is a 2024 ACM A.M. Turing Award recipient","as_of":"2024","as_of_explicit":true,"as_of_raw":"2024","source_type":"company_disclosure","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c8","statement":"Professor Sutton and Andrew Barto, Professor Emeritus at University of Massachusetts, were selected as recipients of the 2024 ACM A.M. Turing Award in March 2025","as_of":"2025-03","as_of_explicit":true,"as_of_raw":"March 2025","source_type":"company_disclosure","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c9","statement":"Professor Sutton wrote 'The Bitter Lesson' essay in 2019","as_of":"2019","as_of_explicit":true,"as_of_raw":"2019","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null}],"ai_emotional_context":{"valence":0,"arousal":0,"primary_emotions":[],"secondary_emotions":[],"emotional_triggers":[]}},"provenance":{"source_chain":["primary_reporting"],"original_source_url":null,"related_articles":[]},"temporal":{"freshness":"recent","next_update_expected":null},"access":{"license":"neupai_standard","attribution_required":true,"structured_data":"free","full_text_available":false,"full_text_access":null}}