<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!DOCTYPE GmsArticle SYSTEM "http://www.egms.de/dtd/2.0.34/GmsArticle.dtd">
<GmsArticle xmlns:xlink="http://www.w3.org/1999/xlink">
  <MetaData>
    <Identifier>25gmds060</Identifier>
    <IdentifierDoi>10.3205/25gmds060</IdentifierDoi>
    <IdentifierUrn>urn:nbn:de:0183-25gmds0603</IdentifierUrn>
    <ArticleType>Meeting Abstract</ArticleType>
    <TitleGroup>
      <Title language="en">LLMs for data extraction in a systematic review: Is AI yet ready for the task&#63;</Title>
    </TitleGroup>
    <CreatorList>
      <Creator>
        <PersonNames>
          <Lastname>Stenzel</Lastname>
          <LastnameHeading>Stenzel</LastnameHeading>
          <Firstname>Monique</Firstname>
          <Initials>M</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Federated Information Systems, Deutsches Krebsforschungszentrum (DKFZ), Heidelberg, Germany</Affiliation>
          <Affiliation>Complex Medical Informatics, Medizinische Fakult&#228;t Mannheim, Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheimer Institut f&#252;r intelligente Systeme in der Medizin, Medizinische Fakult&#228;t Mannheim, Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Deutsches Konsortium f&#252;r Translationale Krebsforschung (DKTK), DKFZ, Kernzentrum Heidelberg, Heidelberg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Albu</Lastname>
          <LastnameHeading>Albu</LastnameHeading>
          <Firstname>Alexandra</Firstname>
          <Initials>A</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinik f&#252;r An&#228;sthesiologie, Operative Intensivmedizin und Schmerzmedizin, Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheim Institute for Innate Immunoscience (MI3), Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Holke</Lastname>
          <LastnameHeading>Holke</LastnameHeading>
          <Firstname>Franziska</Firstname>
          <Initials>F</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinik f&#252;r An&#228;sthesiologie, Operative Intensivmedizin und Schmerzmedizin, Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheim Institute for Innate Immunoscience (MI3), Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Hahn</Lastname>
          <LastnameHeading>Hahn</LastnameHeading>
          <Firstname>Bianka</Firstname>
          <Initials>B</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinik f&#252;r An&#228;sthesiologie, Operative Intensivmedizin und Schmerzmedizin, Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheim Institute for Innate Immunoscience (MI3), Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Tellbach</Lastname>
          <LastnameHeading>Tellbach</LastnameHeading>
          <Firstname>Joshua Georg</Firstname>
          <Initials>JG</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinik f&#252;r An&#228;sthesiologie, Operative Intensivmedizin und Schmerzmedizin, Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheim Institute for Innate Immunoscience (MI3), Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Lablans</Lastname>
          <LastnameHeading>Lablans</LastnameHeading>
          <Firstname>Martin</Firstname>
          <Initials>M</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Federated Information Systems, Deutsches Krebsforschungszentrum (DKFZ), Heidelberg, Germany</Affiliation>
          <Affiliation>Complex Medical Informatics, Medizinische Fakult&#228;t Mannheim, Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheimer Institut f&#252;r intelligente Systeme in der Medizin, Medizinische Fakult&#228;t Mannheim, Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Deutsches Konsortium f&#252;r Translationale Krebsforschung (DKTK), DKFZ, Kernzentrum Heidelberg, Heidelberg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Schneider-Lindner</Lastname>
          <LastnameHeading>Schneider-Lindner</LastnameHeading>
          <Firstname>Verena</Firstname>
          <Initials>V</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinik f&#252;r An&#228;sthesiologie, Operative Intensivmedizin und Schmerzmedizin, Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
          <Affiliation>Mannheim Institute for Innate Immunoscience (MI3), Medizinische Fakult&#228;t Mannheim der Universit&#228;t Heidelberg, Mannheim, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
    </CreatorList>
    <PublisherList>
      <Publisher>
        <Corporation>
          <Corporatename>German Medical Science GMS Publishing House</Corporatename>
        </Corporation>
        <Address>D&#252;sseldorf</Address>
      </Publisher>
    </PublisherList>
    <SubjectGroup>
      <SubjectheadingDDB>610</SubjectheadingDDB>
      <Keyword language="en">large language model (LLM)</Keyword>
      <Keyword language="en">artificial intelligence in evidence synthesis</Keyword>
      <Keyword language="en">systematic review - data extraction</Keyword>
      <Keyword language="en">propensity score methods</Keyword>
      <Keyword language="en">sepsis</Keyword>
    </SubjectGroup>
    <DatePublishedList>
      <DatePublished>20251103</DatePublished>
    </DatePublishedList>
    <Language>engl</Language>
    <License license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
      <AltText language="en">This is an Open Access article distributed under the terms of the Creative Commons Attribution 4.0 License.</AltText>
      <AltText language="de">Dieser Artikel ist ein Open-Access-Artikel und steht unter den Lizenzbedingungen der Creative Commons Attribution 4.0 License (Namensnennung).</AltText>
    </License>
    <SourceGroup>
      <Meeting>
        <MeetingId>M0631</MeetingId>
        <MeetingSequence>060</MeetingSequence>
        <MeetingCorporation>Deutsche Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie</MeetingCorporation>
        <MeetingName>70. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e. V. (GMDS)</MeetingName>
        <MeetingTitle></MeetingTitle>
        <MeetingSession>V: Large language models &#38; medical texts 2</MeetingSession>
        <MeetingCity>Jena</MeetingCity>
        <MeetingDate>
          <DateFrom>20250907</DateFrom>
          <DateTo>20250911</DateTo>
        </MeetingDate>
      </Meeting>
    </SourceGroup>
    <ArticleNo>Abstr. 63</ArticleNo>
  </MetaData>
  <OrigData>
    <TextBlock name="Text" linked="yes">
      <MainHeadline>Text</MainHeadline><Pgraph><Mark1>Introduction:</Mark1> Systematic reviews are typically conducted following established standards <TextLink reference="1"></TextLink>. With their increasing capabilities, artificial intelligence (AI) large language models (LLMs) such as ChatGPT may assist with systematic reviews, but their performance in data extraction from published studies is unclear. We therefore evaluated the consistency of data extracted by humans and AI, offering insights into AI&#39;s potential in systematic reviews. </Pgraph><Pgraph><Mark1>Methods:</Mark1> We used extracted data available from a time-stratified subset (N&#61;45) of our systematic review of studies in ICU patients with sepsis risk or sepsis treatment that utilized propensity score methods <TextLink reference="2"></TextLink>. From each time stratum we selected one study (N&#61;5) for data re-extraction with freely accessible AI tools (ChatGPT&#47;ChatGPT-4-turbo, DeepSeek&#47;DeepSeekR1 <TextLink reference="3"></TextLink>, Qwen&#47;Qwen2.5-Max <TextLink reference="4"></TextLink>, Mistral&#47;Mistral 1.0 (LeChat) <TextLink reference="5"></TextLink>, Grok&#47;Grok-3). The AI received a prompt containing instructions on requirements, expected result format and the previously employed data extraction form comprising 24 questions. Two independent evaluators compared pre-existing and AI-generated free-text answers, and single&#47;multiple choice answers were compared with Python. We assigned a value of 2 for full agreement, 1 for partial agreement and 0 for disagreement. Correct answers had a value of &#8805;1. We quantified agreement with weighted Cohen&#8217;s kappa (&#954;). For this, we first compared each of the human reviews to the human consensus, followed by each AI to each human reviewer&#8217;s consensus-aligned assessment. 
</Pgraph><Pgraph><Mark1>Results:</Mark1> Overall, all AIs yielded a similar correct proportion of the 120 answers: ChatGPT 70.8&#37;, Mistral 73.3&#37;, Grok 74.2&#37;, Qwen 75.0&#37; and DeepSeek 78.3&#37;. Averages for correct AI answers &#91;Min-Max&#93; varied by study, ranging from 60.0&#37; &#91;54.2&#37;-66.7&#37;&#93; to 83.3&#37; &#91;66.7&#37;-95.8&#37;&#93;. The weighted inter-rater agreement between the two human reviewers and their consensus was substantial (Cohen&#8217;s &#954;&#61;0.65). The values of AI agreement with the consensus-aligned ratings of the first human reviewer were: ChatGPT (&#954;&#61;0.36), DeepSeek (&#954;&#61;0.48), Qwen (&#954;&#61;0.48), Mistral (&#954;&#61;0.44), and Grok (&#954;&#61;0.41), indicating fair to moderate agreement. Compared to the second human reviewer, the weighted kappa values were: ChatGPT (&#954;&#61;0.28), DeepSeek (&#954;&#61;0.46), Qwen (&#954;&#61;0.41), Mistral (&#954;&#61;0.41), and Grok (&#954;&#61;0.42), also reflecting fair to moderate agreement. Across all studies, for some questions, both individual humans and AI noticeably deviated from human consensus. For additional questions the AI strongly deviated from human answers. For the remaining questions some or all AIs&#8217; answers matched humans&#8217;.</Pgraph><Pgraph><Mark1>Discussion:</Mark1> The substantial agreement indicates a generally reliable human baseline. Questions that were challenging for both AI and human reviewers reflect difficulties in extracting complex or ambiguously reported, often methodological information. Agreement variability for such questions across studies indicates that some publications facilitated data extraction, but characteristics for reporting recommendations are yet to be identified. Prompt engineering may enhance AI performance for some questions. AI assistance for other systematic review steps remains untested. 
A larger study sample would allow a more precise determination of agreement and particular strengths of specific AIs.</Pgraph><Pgraph><Mark1>Conclusion:</Mark1> While current LLMs show potential for supporting data extraction in systematic reviews, particularly when guided by well-structured forms, their agreement with human reviewers remains limited and their performance is insufficient to replace human judgment. Future strategies may include flagging low AI confidence, enabling human reviewers to prioritize, thereby enhancing review efficiency.</Pgraph><Pgraph>The authors declare that they have no competing interests.</Pgraph><Pgraph>The authors declare that an ethics committee vote is not required.</Pgraph></TextBlock>
    <References linked="yes">
      <Reference refNo="1">
        <RefAuthor>Page MJ</RefAuthor>
        <RefAuthor>McKenzie JE</RefAuthor>
        <RefAuthor>Bossuyt PM</RefAuthor>
        <RefAuthor>Boutron I</RefAuthor>
        <RefAuthor>Hoffmann TC</RefAuthor>
        <RefAuthor>Mulrow CD</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>The PRISMA 2020 statement: an updated guideline for reporting systematic reviews</RefTitle>
        <RefYear>2021</RefYear>
        <RefJournal>J Clin Epidemiol</RefJournal>
        <RefPage>178-89</RefPage>
        <RefTotal>Page MJ, McKenzie JE, Bossuyt PM, Boutron I, Hoffmann TC, Mulrow CD, et al. The PRISMA 2020 statement: an updated guideline for reporting systematic reviews. J Clin Epidemiol. 2021;134:178-89. DOI: 10.1016&#47;j.jclinepi.2021.03.001</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.1016&#47;j.jclinepi.2021.03.001</RefLink>
      </Reference>
      <Reference refNo="2">
        <RefAuthor>Stenzel M</RefAuthor>
        <RefAuthor>Holke F</RefAuthor>
        <RefAuthor>Schneider-Lindner V</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear>2023</RefYear>
        <RefBookTitle>Propensity score methods in sepsis research in critical care: a systematic review</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Stenzel M, Holke F, Schneider-Lindner V. Propensity score methods in sepsis research in critical care: a systematic review. PROSPERO; 2023. Available from: https:&#47;&#47;www.crd.york.ac.uk&#47;PROSPERO&#47;view&#47;CRD42023458707</RefTotal>
        <RefLink>https:&#47;&#47;www.crd.york.ac.uk&#47;PROSPERO&#47;view&#47;CRD42023458707</RefLink>
      </Reference>
      <Reference refNo="3">
        <RefAuthor>DeepSeek-AI</RefAuthor>
        <RefAuthor>Liu A</RefAuthor>
        <RefAuthor>Feng B</RefAuthor>
        <RefAuthor>Xue B</RefAuthor>
        <RefAuthor>Wang B</RefAuthor>
        <RefAuthor>Wu B</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>DeepSeek-V3 Technical Report &#91;Preprint&#93;</RefTitle>
        <RefYear>2024</RefYear>
        <RefJournal>arXiv</RefJournal>
        <RefPage></RefPage>
        <RefTotal>DeepSeek-AI, Liu A, Feng B, Xue B, Wang B, Wu B, et al. DeepSeek-V3 Technical Report &#91;Preprint&#93;. arXiv. 2024 Dec 27. DOI: 10.48550&#47;arXiv.2412.19437</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.48550&#47;arXiv.2412.19437</RefLink>
      </Reference>
      <Reference refNo="4">
        <RefAuthor>Qwen</RefAuthor>
        <RefAuthor>Yang A</RefAuthor>
        <RefAuthor>Yang B</RefAuthor>
        <RefAuthor>Zhang B</RefAuthor>
        <RefAuthor>Hui B</RefAuthor>
        <RefAuthor>Zheng B</RefAuthor>
        <RefAuthor>Yu B</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>Qwen2.5 Technical Report &#91;Preprint&#93;</RefTitle>
        <RefYear>2024</RefYear>
        <RefJournal>arXiv</RefJournal>
        <RefPage></RefPage>
        <RefTotal>Qwen, Yang A, Yang B, Zhang B, Hui B, Zheng B, Yu B, et al. Qwen2.5 Technical Report &#91;Preprint&#93;. arXiv. 2024 Dec 19. DOI: 10.48550&#47;arXiv.2412.15115</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.48550&#47;arXiv.2412.15115</RefLink>
      </Reference>
      <Reference refNo="5">
        <RefAuthor>Jiang AQ</RefAuthor>
        <RefAuthor>Sablayrolles A</RefAuthor>
        <RefAuthor>Mensch A</RefAuthor>
        <RefAuthor>Bamford C</RefAuthor>
        <RefAuthor>Singh Chaplot D</RefAuthor>
        <RefAuthor>de las Casas D</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>Mistral 7B &#91;Preprint&#93;</RefTitle>
        <RefYear>2023</RefYear>
        <RefJournal>arXiv</RefJournal>
        <RefPage></RefPage>
        <RefTotal>Jiang AQ, Sablayrolles A, Mensch A, Bamford C, Singh Chaplot D, de las Casas D, et al. Mistral 7B &#91;Preprint&#93;. arXiv. 2023 Oct 10. DOI: 10.48550&#47;arXiv.2310.06825</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.48550&#47;arXiv.2310.06825</RefLink>
      </Reference>
    </References>
    <Media>
      <Tables>
        <NoOfTables>0</NoOfTables>
      </Tables>
      <Figures>
        <NoOfPictures>0</NoOfPictures>
      </Figures>
      <InlineFigures>
        <NoOfPictures>0</NoOfPictures>
      </InlineFigures>
      <Attachments>
        <NoOfAttachments>0</NoOfAttachments>
      </Attachments>
    </Media>
  </OrigData>
</GmsArticle>