@inproceedings{wickramasekara2025AutoDFBench,
  author    = {Wickramasekara, Akila and Densmore, Alanna and Breitinger, Frank and Studiawan, Hudan and Scanlon, Mark},
  title     = {{AutoDFBench: A Framework for AI Generated Digital Forensic Code and Tool Testing and Evaluation}},
  booktitle = {Digital Forensics Doctoral Symposium},
  series    = {DFDS 2025},
  year      = {2025},
  month     = apr,
  isbn      = {9798400710766},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  doi       = {10.1145/3712716.3712718},
  url       = {https://doi.org/10.1145/3712716.3712718},
  location  = {Brno, Czech Republic},
  abstract  = {Generative AI and Large Language Models (LLMs) show potential across various domains, including digital forensics (DF). A notable use case is automatic code generation, which is expected to extend to DF soon. As with any DF tool, these systems must undergo thorough testing and validation. However, manually evaluating outputs, including generated DF code, remains challenging. AutoDFBench is an automated framework designed to address this by validating AI-generated code and tools against NIST's Computer Forensics Tool Testing Program (CFTT) procedures, subsequently calculating an AutoDFBench benchmarking score. The framework operates in four phases: data preparation, API handling, code execution, and result recording with score calculation. It benchmarks generative AI systems, such as LLMs and automated code generation agents, for DF applications. This benchmark can support iterative development or serve as a comparison metric between DF AI systems. As a proof of concept, NIST's forensic string search tests were used, involving over 24,200 tests with five top-performing code generation LLMs. These tests validated outputs of 121 cases, considering two user expertise levels, two programming languages, and ten iterations per case with varying prompts. The results highlight significant limitations of DF-specific solutions generated by generic LLMs.}
}