{"@context":"https://neupai.io/schema/v0.2","@type":"StructuredNewsArticle","identity":{"article_id":"tech42_20260511_ai-programming-benchmark-zero-success","canonical_url":"https://www.tech42.co.kr/%ec%84%b8%ea%b3%84-%ec%b5%9c%ea%b3%a0-ai-9%ec%a2%85-%ec%8b%9c%ed%97%98-%eb%b4%a4%eb%8d%94%eb%8b%88200%ea%b0%9c-%ea%b3%bc%ec%a0%9c-%ec%99%84%ec%a0%84-%ec%a0%95%eb%b3%b5-%eb%8b%a8-%ed%95%98/?utm_source=rss&utm_medium=rss&utm_campaign=%25ec%2584%25b8%25ea%25b3%2584-%25ec%25b5%259c%25ea%25b3%25a0-ai-9%25ec%25a2%2585-%25ec%258b%259c%25ed%2597%2598-%25eb%25b4%25a4%25eb%258d%2594%25eb%258b%2588200%25ea%25b0%259c-%25ea%25b3%25bc%25ec%25a0%259c-%25ec%2599%2584%25ec%25a0%2584-%25ec%25a0%2595%25eb%25b3%25b5-%25eb%258b%25a8-%25ed%2595%2598","ai_url":null,"publisher":{"name":"테크42","domain":"www.tech42.co.kr","type":"online"},"author":"정재엽 기자","published_at":"2026-05-11T23:09:14.000Z","updated_at":null,"language":"ko","article_type":"analysis","originality":"self_produced"},"content":{"headline":"세계 최고 AI 9종 시험 봤더니…200개 과제 완전 정복, 단 하나도 없었다","summary":"메타·스탠퍼드·하버드 공동 연구팀이 실시한 프로그램벤치 테스트에서 세계 최고 AI 9종이 200개 프로그래밍 과제를 완전히 해결한 경우는 0%였다. 강화학습 창시자 리처드 서튼 교수는 LLM이 막다른 길이라며 세계와 상호작용하는 새로운 AI 패러다임이 필요하다고 주장했다.","topics":["AI","프로그래밍","연구","기술"],"geography":["US","KR"],"entities":[{"name":"메타 FAIR","canonical_id":"org:us:meta-fair","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"스탠퍼드대","canonical_id":"org:us:stanford-university","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"하버드대","canonical_id":"org:us:harvard-university","type":"organization","role_in_article":"primary_subject","metadata":{"ticker":null,"parent":null}},{"name":"클로드 오퍼스","canonical_id":"product:us:claude-opus","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"GPT","canonical_id":"product:us:gpt","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"제미나이 프로","canonical_id":"product:us:gemini-pro","type":"product","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"리처드 서튼","canonical_id":"person:ca:richard-sutton","type":"person","role_in_article":"quoted","metadata":{"ticker":null,"parent":null}},{"name":"앨버타대","canonical_id":"org:ca:university-of-alberta","type":"organization","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"구글 딥마인드","canonical_id":"org:us:google-deepmind","type":"organization","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"오픈AI","canonical_id":"corp:us:openai","type":"company","role_in_article":"mentioned","metadata":{"ticker":null,"parent":null}},{"name":"일리야 수츠케버","canonical_id":"person:us:ilya-sutskever","type":"person","role_in_article":"quoted","metadata":{"ticker":null,"parent":null}}],"claims":[{"id":"c1","statement":"메타 FAIR·스탠퍼드대·하버드대 공동 연구팀이 2026년 5월 프로그램벤치 벤치마크 논문을 공개했다","as_of":"2026-05","as_of_explicit":true,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c2","statement":"연구팀은 FFmpeg, SQLite, PHP 인터프리터 등 실제 현장에서 쓰이는 핵심 프로그램 200종을 과제로 선정했다","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c3","statement":"현존 최고 수준의 언어 모델 9종을 투입했다","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c4","statement":"연구팀은 총 24만 8853개의 동작 테스트로 AI들의 코드를 검증했다","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c5","statement":"단 한 개의 모델도 200개 과제 중 하나를 완전히 풀어내지 못했다","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c6","statement":"가장 성적이 좋은 모델조차 200개 중 6개 과제에서만 테스트의 95%를 통과하는 수준에 그쳤다","as_of":"2026-05","as_of_explicit":false,"as_of_raw":"2026년 5월","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c7","statement":"리처드 서튼 앨버타대 교수는 2024년 ACM A.M. 튜링상 수상자다","as_of":"2024","as_of_explicit":true,"as_of_raw":"2024년","source_type":"company_disclosure","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c8","statement":"서튼 교수와 앤드류 바르토 매사추세츠대 명예교수는 2025년 3월 수상자로 선정됐다","as_of":"2025-03","as_of_explicit":true,"as_of_raw":"2025년 3월","source_type":"company_disclosure","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null},{"id":"c9","statement":"서튼 교수는 2019년 '쓴 교훈' 에세이를 썼다","as_of":"2019","as_of_explicit":true,"as_of_raw":"2019년","source_type":"research_paper","comparison":null,"type":"fact","figures":null,"expiry_hint":null,"insight":null}],"ai_emotional_context":{"valence":0,"arousal":0,"primary_emotions":[],"secondary_emotions":[],"emotional_triggers":[]}},"provenance":{"source_chain":["primary_reporting"],"original_source_url":null,"related_articles":[]},"temporal":{"freshness":"recent","next_update_expected":null},"access":{"license":"neupai_standard","attribution_required":true,"structured_data":"free","full_text_available":false,"full_text_access":null}}