{"id":"https://openalex.org/W4417326926","doi":"https://doi.org/10.48550/arxiv.2510.21460","title":"Risk Management for Mitigating Benchmark Failure Modes: BenchRisk","display_name":"Risk Management for Mitigating Benchmark Failure Modes: BenchRisk","publication_year":2025,"publication_date":"2025-10-24","ids":{"openalex":"https://openalex.org/W4417326926","doi":"https://doi.org/10.48550/arxiv.2510.21460"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2510.21460","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.21460","pdf_url":"https://arxiv.org/pdf/2510.21460","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2510.21460","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113182879","display_name":"Sean McGregor","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"McGregor, Sean","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050298825","display_name":"Victor M. Lu","orcid":"https://orcid.org/0000-0002-9470-5890"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Victor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114331536","display_name":"Vassil Tashev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tashev, Vassil","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120804075","display_name":"Armstrong Foundjem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foundjem, Armstrong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120804076","display_name":"Aishwarya Ramasethu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ramasethu, Aishwarya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120804077","display_name":"Sadegh AlMahdi Kazemi Zarkouei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zarkouei, Sadegh AlMahdi Kazemi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095785547","display_name":"Chris Knotz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Knotz, Chris","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017393194","display_name":"Kongtao Chen","orcid":"https://orcid.org/0000-0002-5803-6570"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kongtao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082569485","display_name":"Alicia Parrish","orcid":"https://orcid.org/0000-0002-1054-0516"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Parrish, Alicia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059838123","display_name":"Anka Reuel","orcid":"https://orcid.org/0000-0002-7913-9296"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Reuel, Anka","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5095785543","display_name":"Heather Frase","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frase, Heather","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5113182879"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.46129998564720154,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.46129998564720154,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.04879999905824661,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.03220000118017197,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.9017000198364258},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6723999977111816},{"id":"https://openalex.org/keywords/risk-management","display_name":"Risk management","score":0.5551999807357788},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5400999784469604},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5396000146865845},{"id":"https://openalex.org/keywords/risk-assessment","display_name":"Risk assessment","score":0.5389999747276306},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.47540000081062317}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.9017000198364258},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6723999977111816},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.592199981212616},{"id":"https://openalex.org/C32896092","wikidata":"https://www.wikidata.org/wiki/Q189447","display_name":"Risk management","level":2,"score":0.5551999807357788},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.5498999953269958},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5400999784469604},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5396000146865845},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.5389999747276306},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.47540000081062317},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4359999895095825},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.42179998755455017},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.3481999933719635},{"id":"https://openalex.org/C37945671","wikidata":"https://www.wikidata.org/wiki/Q7336207","display_name":"Risk-based testing","level":5,"score":0.3391999900341034},{"id":"https://openalex.org/C66283442","wikidata":"https://www.wikidata.org/wiki/Q1389268","display_name":"Failure mode and effects analysis","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.2653000056743622},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.25940001010894775}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2510.21460","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.21460","pdf_url":"https://arxiv.org/pdf/2510.21460","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2510.21460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.21460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2510.21460","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.21460","pdf_url":"https://arxiv.org/pdf/2510.21460","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"benchmarks":[4,23,121],"inform":[5],"LLM":[6,12,151],"use":[7,18],"decisions":[8],"(e.g.,":[9],"\"is":[10],"this":[11,59],"safe":[13],"to":[14,41,93,108,142],"deploy":[15],"for":[16,86,147,157],"my":[17],"case":[19],"and":[20,51,71,138,170,174],"context?\").":[21],"However,":[22],"may":[24],"be":[25],"rendered":[26],"unreliable":[27],"by":[28],"various":[29],"failure":[30,69,79],"modes":[31,70],"that":[32,102],"impact":[33],"benchmark":[34,43,103],"bias,":[35],"variance,":[36],"coverage,":[37],"or":[38,112,127],"people's":[39],"capacity":[40],"understand":[42],"evidence.":[44],"Using":[45],"the":[46,130,148,168],"National":[47],"Institute":[48],"of":[49,129,150,172],"Standards":[50],"Technology's":[52],"risk":[53,124],"management":[54],"process":[55],"as":[56,161],"a":[57,84,95],"foundation,":[58],"research":[60,145],"iteratively":[61],"analyzed":[62],"26":[63,119],"popular":[64],"benchmarks,":[65],"identifying":[66],"57":[67],"potential":[68],"196":[72],"corresponding":[73],"mitigation":[74],"strategies.":[75],"The":[76,153],"mitigations":[77],"reduce":[78],"likelihood":[80],"and/or":[81],"severity,":[82],"providing":[83],"frame":[85],"evaluating":[87],"\"benchmark":[88],"risk,\"":[89],"which":[90,140],"is":[91],"scored":[92,120,132],"provide":[94],"metaevaluation":[96],"benchmark:":[97],"BenchRisk.":[98],"Higher":[99],"scores":[100],"indicate":[101],"users":[104],"are":[105],"less":[106],"likely":[107],"reach":[109],"an":[110,116,162],"incorrect":[111],"unsupported":[113],"conclusion":[114],"about":[115],"LLM.":[117],"All":[118],"present":[122],"significant":[123],"within":[125],"one":[126],"more":[128],"five":[131],"dimensions":[133],"(comprehensiveness,":[134],"intelligibility,":[135],"consistency,":[136],"correctness,":[137],"longevity),":[139],"points":[141],"important":[143],"open":[144],"directions":[146],"field":[149],"benchmarking.":[152],"BenchRisk":[154],"workflow":[155],"allows":[156],"comparison":[158],"between":[159],"benchmarks;":[160],"open-source":[163],"tool,":[164],"it":[165],"also":[166],"facilitates":[167],"identification":[169],"sharing":[171],"risks":[173],"their":[175],"mitigations.":[176]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-28T00:00:00"}
