<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!DOCTYPE GmsArticle SYSTEM "http://www.egms.de/dtd/2.0.34/GmsArticle.dtd">
<GmsArticle xmlns:xlink="http://www.w3.org/1999/xlink">
  <MetaData>
    <Identifier>25gmds061</Identifier>
    <IdentifierDoi>10.3205/25gmds061</IdentifierDoi>
    <IdentifierUrn>urn:nbn:de:0183-25gmds0617</IdentifierUrn>
    <ArticleType>Meeting Abstract</ArticleType>
    <TitleGroup>
      <Title language="en">Retrieval-Augmented Generation for Semantic Annotation of Clinical Trials on the Example of ICD-10 Codes</Title>
    </TitleGroup>
    <CreatorList>
      <Creator>
        <PersonNames>
          <Lastname>Lehmann</Lastname>
          <LastnameHeading>Lehmann</LastnameHeading>
          <Firstname>Paula</Firstname>
          <Initials>P</Initials>
        </PersonNames>
        <Address>
          <Affiliation>MOLIT Institut gGmbH, Heilbronn, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Mathes</Lastname>
          <LastnameHeading>Mathes</LastnameHeading>
          <Firstname>Georg</Firstname>
          <Initials>G</Initials>
        </PersonNames>
        <Address>
          <Affiliation>MOLIT Institut gGmbH, Heilbronn, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Vishnevskaya</Lastname>
          <LastnameHeading>Vishnevskaya</LastnameHeading>
          <Firstname>Valeriya</Firstname>
          <Initials>V</Initials>
        </PersonNames>
        <Address>
          <Affiliation>MOLIT Institut gGmbH, Heilbronn, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Bochum</Lastname>
          <LastnameHeading>Bochum</LastnameHeading>
          <Firstname>Sylvia</Firstname>
          <Initials>S</Initials>
        </PersonNames>
        <Address>
          <Affiliation>SLK Kliniken Heilbronn GmbH, Heilbronn, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Sigle</Lastname>
          <LastnameHeading>Sigle</LastnameHeading>
          <Firstname>Stefan</Firstname>
          <Initials>S</Initials>
        </PersonNames>
        <Address>
          <Affiliation>MOLIT Institut gGmbH, Heilbronn, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
    </CreatorList>
    <PublisherList>
      <Publisher>
        <Corporation>
          <Corporatename>German Medical Science GMS Publishing House</Corporatename>
        </Corporation>
        <Address>D&#252;sseldorf</Address>
      </Publisher>
    </PublisherList>
    <SubjectGroup>
      <SubjectheadingDDB>610</SubjectheadingDDB>
      <Keyword language="en">retrieval-augmented generation</Keyword>
      <Keyword language="en">terminologies</Keyword>
      <Keyword language="en">ICD-10</Keyword>
      <Keyword language="en">clinical trial annotation</Keyword>
      <Keyword language="en">semantic annotation</Keyword>
    </SubjectGroup>
    <DatePublishedList>
      <DatePublished>20251103</DatePublished>
    </DatePublishedList>
    <Language>engl</Language>
    <License license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
      <AltText language="en">This is an Open Access article distributed under the terms of the Creative Commons Attribution 4.0 License.</AltText>
      <AltText language="de">Dieser Artikel ist ein Open-Access-Artikel und steht unter den Lizenzbedingungen der Creative Commons Attribution 4.0 License (Namensnennung).</AltText>
    </License>
    <SourceGroup>
      <Meeting>
        <MeetingId>M0631</MeetingId>
        <MeetingSequence>061</MeetingSequence>
        <MeetingCorporation>Deutsche Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie</MeetingCorporation>
        <MeetingName>70. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e. V. (GMDS)</MeetingName>
        <MeetingTitle></MeetingTitle>
        <MeetingSession>V: Large language models &#38; medical texts 2</MeetingSession>
        <MeetingCity>Jena</MeetingCity>
        <MeetingDate>
          <DateFrom>20250907</DateFrom>
          <DateTo>20250911</DateTo>
        </MeetingDate>
      </Meeting>
    </SourceGroup>
    <ArticleNo>Abstr. 296</ArticleNo>
  </MetaData>
  <OrigData>
    <TextBlock name="Text" linked="yes">
      <MainHeadline>Text</MainHeadline><Pgraph><Mark1>Introduction:</Mark1> Accurate, structured annotation of clinical research data is essential to support personalized oncology workflows, such as matching patients to relevant clinical trials <TextLink reference="1"></TextLink>. However, manual coding of clinical diagnoses using standardized terminologies remains time-consuming and prone to inconsistencies <TextLink reference="2"></TextLink>. </Pgraph><Pgraph>This study explores the application of Retrieval-Augmented Generation (RAG) <TextLink reference="3"></TextLink> for accurate ICD-10-CM classification of cancer-related diagnoses. </Pgraph><Pgraph>The goal is to enable automated semantic annotation of clinical trials in precision oncology. Mapping unstructured diagnosis descriptions to standardized codes improves interoperability and enhances analysis of clinical trial documentation.</Pgraph><Pgraph><Mark1>State of the art:</Mark1> Primary clinical trial registries such as clinicaltrials.gov and DRKS often do not allow for semantic searching on investigated conditions, leaving the risk of studies being omitted and disregarded in a search. </Pgraph><Pgraph>Current RAG architectures leverage vector databases and language models for information retrieval and response generation <TextLink reference="3"></TextLink>. Other groups used RAG in clinical trial screening, using cloud models <TextLink reference="4"></TextLink>.</Pgraph><Pgraph><Mark1>Concept:</Mark1> Our prototype is intended to generate precise, contextually relevant ICD-10 codes for free-text condition names. Relevant terminology information is provided to the RAG pipeline, chunked (split into smaller units) to facilitate logically coherent retrieval, and stored in a vector store via local embedding. 
</Pgraph><Pgraph>When translating, relevant mappings are retrieved from the store and provided to the LLM, along with a system prompt containing examples, to extract matching codes.</Pgraph><Pgraph>We continually evaluate the mapped codes, as well as the pure retrieved context without LLM involvement, to enable iterative optimization of parameters.</Pgraph><Pgraph><Mark1>Implementation:</Mark1> The RAG system is implemented in Python as an OpenWebui pipeline using llama-index and langchain, with ChromaDB&#47;FAISS <TextLink reference="5"></TextLink> as vector storage. The ICD-10 classification is provided in JSON. Knowledge is chunked and embedded into the vector store via nomic-embed-text as embedding model. </Pgraph><Pgraph>Context retrieval uses strategies like similarity search. LLMs are hosted locally via Ollama. Regex-based post-processing extracts ICD codes from the LLM output.</Pgraph><Pgraph>For evaluation, retrieved context and the whole pipeline&#8217;s output are checked against the expected codes via JavaScript. Two datasets were used: one with ICD code-display pairs randomly selected from the ICD-10 classification chapter 2 (n&#61;30), the other with free-text diagnoses from public trial records, translated to ICD-10 codes by a physician (n&#61;52).</Pgraph><Pgraph><Mark1>Lessons learned:</Mark1> Lessons drawn from this work: </Pgraph><Pgraph><UnorderedList><ListItem level="1">Logical chunking seems to work well for information retrieval in our use case </ListItem><ListItem level="1">Identifying ICD main groups is much easier than finding more specific codes </ListItem><ListItem level="1">Some descriptions lack specificity, preventing mapping (e.g. 
&#8216;Refractory Cancer&#8217;) </ListItem><ListItem level="1">Detailed prompts help LLMs to produce precise outputs </ListItem></UnorderedList></Pgraph><Pgraph>These lessons led to promising intermediate results: Using Chroma vector storage, a similarity-scoring retrieval strategy, a k value of 5, and Phi4 as LLM, the ICD-10 main group (e.g. C00 for C00.1) was correct for all generated codes in 37 of 52 (71.2&#37;) test conditions. A full match between generated and expected codes was achieved in 17 (32.7&#37;) conditions. For test cases from the ICD-10 classification, the setup achieved correct main groups in 25 of 30 (83.3&#37;) cases and complete matches in 22 (73.3&#37;) cases. This highlights difficulties caused by non-standardized condition descriptions (e.g. &#8216;Cml&#8217;) and indicates room for further improvement.</Pgraph><Pgraph>The authors declare that they have no competing interests.</Pgraph><Pgraph>The authors declare that an ethics committee vote is not required.</Pgraph></TextBlock>
    <References linked="yes">
      <Reference refNo="1">
        <RefAuthor>Zeng J</RefAuthor>
        <RefAuthor>Shufean MA</RefAuthor>
        <RefAuthor>Khotskaya Y</RefAuthor>
        <RefAuthor>Yang D</RefAuthor>
        <RefAuthor>Kahle M</RefAuthor>
        <RefAuthor>Johnson A</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>OCTANE: Oncology Clinical Trial Annotation Engine</RefTitle>
        <RefYear>2019</RefYear>
        <RefJournal>JCO Clin Cancer Inform</RefJournal>
        <RefPage>1&#8211;11</RefPage>
        <RefTotal>Zeng J, Shufean MA, Khotskaya Y, Yang D, Kahle M, Johnson A, et al. OCTANE: Oncology Clinical Trial Annotation Engine. JCO Clin Cancer Inform. 2019;3:1&#8211;11. DOI: 10.1200&#47;CCI.18.00145</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.1200&#47;CCI.18.00145</RefLink>
      </Reference>
      <Reference refNo="2">
        <RefAuthor>Mi&#241;arro-Gim&#233;nez JA</RefAuthor>
        <RefAuthor>Mart&#237;nez-Costa C</RefAuthor>
        <RefAuthor>Karlsson D</RefAuthor>
        <RefAuthor>Schulz S</RefAuthor>
        <RefAuthor>G&#248;eg KR</RefAuthor>
        <RefTitle>Qualitative analysis of manual annotations of clinical text with SNOMED CT</RefTitle>
        <RefYear>2018</RefYear>
        <RefJournal>PLoS One</RefJournal>
        <RefPage>e0209547</RefPage>
        <RefTotal>Mi&#241;arro-Gim&#233;nez JA, Mart&#237;nez-Costa C, Karlsson D, Schulz S, G&#248;eg KR. Qualitative analysis of manual annotations of clinical text with SNOMED CT. PLoS One. 2018;13:e0209547. DOI: 10.1371&#47;journal.pone.0209547</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1371&#47;journal.pone.0209547</RefLink>
      </Reference>
      <Reference refNo="3">
        <RefAuthor>Gao Y</RefAuthor>
        <RefAuthor>Xiong Y</RefAuthor>
        <RefAuthor>Gao X</RefAuthor>
        <RefAuthor>Jia K</RefAuthor>
        <RefAuthor>Pan J</RefAuthor>
        <RefAuthor>Bi Y</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>Retrieval-Augmented Generation for Large Language Models: A Survey</RefTitle>
        <RefYear>2023</RefYear>
        <RefTotal>Gao Y, Xiong Y, Gao X, Jia K, Pan J, Bi Y, et al. Retrieval-Augmented Generation for Large Language Models: A Survey &#91;Preprint&#93;. arXiv. 2023. DOI: 10.48550&#47;ARXIV.2312.10997</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.48550&#47;ARXIV.2312.10997</RefLink>
      </Reference>
      <Reference refNo="4">
        <RefAuthor>Tan R</RefAuthor>
        <RefAuthor>Ho SX</RefAuthor>
        <RefAuthor>Oo SVF</RefAuthor>
        <RefAuthor>Chua SL</RefAuthor>
        <RefAuthor>Zaw MWW</RefAuthor>
        <RefAuthor>Tan DS-W</RefAuthor>
        <RefTitle>Retrieval-augmented large language models for clinical trial screening</RefTitle>
        <RefYear>2024</RefYear>
        <RefJournal>J Clin Oncol</RefJournal>
        <RefPage>e13611&#8211;e13611</RefPage>
        <RefTotal>Tan R, Ho SX, Oo SVF, Chua SL, Zaw MWW, Tan DS-W. Retrieval-augmented large language models for clinical trial screening. J Clin Oncol. 2024;42:e13611&#8211;e13611. DOI: 10.1200&#47;JCO.2024.42.16&#95;suppl.e13611</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.1200&#47;JCO.2024.42.16&#95;suppl.e13611</RefLink>
      </Reference>
      <Reference refNo="5">
        <RefAuthor>Douze M</RefAuthor>
        <RefAuthor>Guzhva A</RefAuthor>
        <RefAuthor>Deng C</RefAuthor>
        <RefAuthor>Johnson J</RefAuthor>
        <RefAuthor>Szilvasy G</RefAuthor>
        <RefAuthor>Mazar&#233; P-E</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>The Faiss library</RefTitle>
        <RefYear>2025</RefYear>
        <RefJournal>arXiv</RefJournal>
        <RefPage></RefPage>
        <RefTotal>Douze M, Guzhva A, Deng C, Johnson J, Szilvasy G, Mazar&#233; P-E, et al. The Faiss library &#91;Preprint&#93;. arXiv. 2025. DOI: 10.48550&#47;arXiv.2401.08281</RefTotal>
        <RefLink>http:&#47;&#47;dx.doi.org&#47;10.48550&#47;arXiv.2401.08281</RefLink>
      </Reference>
    </References>
    <Media>
      <Tables>
        <NoOfTables>0</NoOfTables>
      </Tables>
      <Figures>
        <NoOfPictures>0</NoOfPictures>
      </Figures>
      <InlineFigures>
        <NoOfPictures>0</NoOfPictures>
      </InlineFigures>
      <Attachments>
        <NoOfAttachments>0</NoOfAttachments>
      </Attachments>
    </Media>
  </OrigData>
</GmsArticle>