@comment{JMAI11381: journal article record; the landing page is given in the url field.
  NOTE(review): the dash following 3.5 in the title is a Unicode em dash in this record;
  confirm against the published title whether a hyphen or en dash was intended.
  NOTE(review): no doi or urldate is recorded here; add them once verified.}
@article{JMAI11381,
  author   = {Twyford, Mark and Fahey, Brian and Keohane, Colum and Westby, Daniel and Healy, J. Donagh and Walsh, Stewart W.},
  title    = {Quality assessment of {ChatGPT-3.5}—generated patient information leaflets in vascular surgery},
  journal  = {Journal of Medical Artificial Intelligence},
  volume   = {9},
  year     = {2026},
  abstract = {Background: Generative Pre-trained Transformer (ChatGPT) is a generative artificial intelligence (AI) model developed by Open AI (San Francisco, CA, USA), which generates responses based on input received and can solve problems and complete tasks by using reinforcement techniques and machine learning from sources online. Despite the growing use of AI in healthcare, its application in vascular surgery remains limited. This study aimed to evaluate ChatGPT’s ability to generate patient information related to digital subtraction angiography (DSA). Fifteen commonly asked patient questions regarding DSA were identified, and ChatGPT-3.5 was used to produce responses. Additionally, the model was tasked with generating a complete patient information leaflet for DSA. The outputs were assessed for readability, informational quality, and appropriateness.
    Methods: Fifteen questions were entered into ChatGPT-3.5, which also generated a complete patient information leaflet for DSA. The readability of the outputs was evaluated using two standardized scoring systems: the Flesch-Kincaid Reading Ease Score (FRES) and the Gunning Fog Index (GFI). The quality of the responses was assessed using the DISCERN tool, and their appropriateness was rated on a Likert scale.
    Results: The readability analysis using the FRES yielded an average score of 31.21 (range, 16.29–53.57), corresponding to a college-level reading ability. The mean Flesch-Kincaid Grade Level (FKGL) was 13.54 (range, 9.37–15.30). The patient information leaflet generated by ChatGPT-3.5 scored 41.30 on the Reading Ease scale, also indicating a college-level reading age. Using the GFI, the average score for the responses was 15.91 (range, 12.24–20.84), equivalent to the reading level of a college junior or senior, while the patient information leaflet scored 14.43. These findings suggest that the content is written at a significantly higher level than the recommended 6th–8th grade reading level for patient education materials. The quality of the responses, assessed using the DISCERN tool, averaged 41, which is considered “fair”, whereas the patient information leaflet scored 34, classified as “poor”. Despite this, the majority of the content was rated as factually accurate and generally appropriate for the clinical context.
    Conclusions: Using ChatGPT-3.5 to generate responses to patient questions and create a patient information leaflet resulted in content written at a significantly higher reading level than recommended for patient education. While individual responses demonstrated generally good appropriateness and achieved fair DISCERN scores, the overall quality and readability decreased when generating a complete leaflet. These findings suggest that ChatGPT-3.5 performs better with discrete questions than with comprehensive tasks. Care and caution should be exercised when considering the use of such tools for patient-facing materials.},
  issn     = {2617-2496},
  url      = {https://jmai.amegroups.org/article/view/11381},
}