<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!DOCTYPE GmsArticle SYSTEM "http://www.egms.de/dtd/2.0.34/GmsArticle.dtd">
<GmsArticle xmlns:xlink="http://www.w3.org/1999/xlink">
  <MetaData>
    <Identifier>25gmds036</Identifier>
    <IdentifierDoi>10.3205/25gmds036</IdentifierDoi>
    <IdentifierUrn>urn:nbn:de:0183-25gmds0365</IdentifierUrn>
    <ArticleType>Meeting Abstract</ArticleType>
    <TitleGroup>
      <Title language="en">Linking Clinical Trials and Publications: Enhancing DRKS Bibliographic Metadata via Reference Matching</Title>
    </TitleGroup>
    <CreatorList>
      <Creator>
        <PersonNames>
          <Lastname>Darms</Lastname>
          <LastnameHeading>Darms</LastnameHeading>
          <Firstname>Johannes</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>ZB MED &#8211; Informationszentrum Lebenswissenschaften, K&#246;ln, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Salholz-Hillel</Lastname>
          <LastnameHeading>Salholz-Hillel</LastnameHeading>
          <Firstname>Maia</Firstname>
          <Initials>M</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Berlin Institute of Health (BIH) at Charit&#233; - Universit&#228;tsmedizin Berlin, QUEST Center for Responsible Research, Berlin, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Franzen</Lastname>
          <LastnameHeading>Franzen</LastnameHeading>
          <Firstname>Delwen</Firstname>
          <Initials>D</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Berlin Institute of Health (BIH) at Charit&#233; - Universit&#228;tsmedizin Berlin, QUEST Center for Responsible Research, Berlin, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Strech</Lastname>
          <LastnameHeading>Strech</LastnameHeading>
          <Firstname>Daniel</Firstname>
          <Initials>D</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Berlin Institute of Health (BIH) at Charit&#233; - Universit&#228;tsmedizin Berlin, QUEST Center for Responsible Research, Berlin, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Fluck</Lastname>
          <LastnameHeading>Fluck</LastnameHeading>
          <Firstname>Juliane</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>ZB MED &#8211; Informationszentrum Lebenswissenschaften, K&#246;ln, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
    </CreatorList>
    <PublisherList>
      <Publisher>
        <Corporation>
          <Corporatename>German Medical Science GMS Publishing House</Corporatename>
        </Corporation>
        <Address>D&#252;sseldorf</Address>
      </Publisher>
    </PublisherList>
    <SubjectGroup>
      <SubjectheadingDDB>610</SubjectheadingDDB>
      <Keyword language="en">FAIR data</Keyword>
      <Keyword language="en">reference matching</Keyword>
    </SubjectGroup>
    <DatePublishedList>
      <DatePublished>20251103</DatePublished>
    </DatePublishedList>
    <Language>engl</Language>
    <License license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
      <AltText language="en">This is an Open Access article distributed under the terms of the Creative Commons Attribution 4.0 License.</AltText>
      <AltText language="de">Dieser Artikel ist ein Open-Access-Artikel und steht unter den Lizenzbedingungen der Creative Commons Attribution 4.0 License (Namensnennung).</AltText>
    </License>
    <SourceGroup>
      <Meeting>
        <MeetingId>M0631</MeetingId>
        <MeetingSequence>036</MeetingSequence>
        <MeetingCorporation>Deutsche Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie</MeetingCorporation>
        <MeetingName>70. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e. V. (GMDS)</MeetingName>
        <MeetingTitle></MeetingTitle>
        <MeetingSession>V: FAIR IT infrastructures and data quality</MeetingSession>
        <MeetingCity>Jena</MeetingCity>
        <MeetingDate>
          <DateFrom>20250907</DateFrom>
          <DateTo>20250911</DateTo>
        </MeetingDate>
      </Meeting>
    </SourceGroup>
    <ArticleNo>Abstr. 215</ArticleNo>
  </MetaData>
  <OrigData>
    <TextBlock name="Text" linked="yes">
      <MainHeadline>Text</MainHeadline><Pgraph><Mark1>Introduction:</Mark1> Clinical trials play a vital role in medical research by providing evidence-based assessments of the efficacy and safety of new therapies. Reusing data from clinical trial registries &#8211; such as the German Clinical Trials Register (DRKS) (<Hyperlink href="https:&#47;&#47;www.drks.de&#47;">https:&#47;&#47;www.drks.de&#47;</Hyperlink>) &#8211; enhances centralized findability, for example via the NFDI4Health Study Hub <TextLink reference="1"></TextLink>, or through meta-research analyses like the BIH QUEST Clinical Trials Transparency Dashboard <TextLink reference="2"></TextLink>. Scientific work related to these trials is also crucial, yet often incomplete or unavailable via the DRKS API; persistent identifiers and links to scientific publications are frequently missing <TextLink reference="3"></TextLink>. To address this, we apply automated reference matching techniques to identify and link relevant literature, making complete citation data accessible through the Health Study Hub.</Pgraph><Pgraph><Mark1>Methods:</Mark1> We exported the complete DRKS data, via the JSON API, comprising 18,143 trials. From these, we extracted publication metadata from the &#8220;trialResults.publications&#8221; section (n&#61;6,016), resulting in 10,621 individual publication records. Of these, 5,196 entries lacked both a link and an uploaded document. Although the document field was consistently empty, many corresponding files were available for download via the DRKS website. We implemented a web scraping routine and identified and retrieved 3,451 valid document URLs.</Pgraph><Pgraph>To extract structured bibliographic metadata, we applied regular expressions to identify digital object identifiers (DOIs), PubMed IDs (PMID and PMC), and web URLs from the description field. 
We further parsed free-text references using the GROBID (GeneRation Of BIbliographic Data) toolkit <TextLink reference="4"></TextLink>, to generate structured citation components (title, authors, journal, year, etc.). To normalize entries and retrieve DOIs, we queried the Crossref API <TextLink reference="5"></TextLink> and applied a weighted title and first author string similarity via the Gestalt algorithm with a similarity threshold of 0.75 to minimize false positives.</Pgraph><Pgraph><Mark1>Results:</Mark1> We applied our approach to the 5,196 items lacking both links and documents. Using regular expressions, we extracted DOIs from 896 entries, 260 PubMed IDs (PMID) and 140 PMC IDs, and 232 web URLs. By combining GROBID parsing with Crossref matching we identified an additional 568 DOIs. Another 649 references detected by GROBID could not be resolved via Crossref. Further, 3,669 descriptions were not detected as references by GROBID. Among these, 1,115 were duplicates (e.g., &#8220;Studienprotokoll&#8221; occurred 252 times), and 573 contained fewer than three words &#8211; these were manually confirmed as non-references. The remaining 1,332 entries as well as the 649 detected but not resolved descriptions still require further assessment but likely include a mix of false positives and false negatives.</Pgraph><Pgraph><Mark1>Discussion:</Mark1> Automated bibliographic enrichment using regular expressions, GROBID and Crossref significantly improves metadata quality and facilitates linking studies to related scientific literature. This increases FAIRness and improves the reusability of data. A full evaluation is still ongoing, although the current implementation represents a significant step forward.</Pgraph><Pgraph><Mark1>Conclusion:</Mark1> We present the first version of an automated pipeline that detects bibliographic references in DRKS. 
A subset of high-confidence matches has already been integrated into the Health Study Hub, contributing to a more connected and transparent research ecosystem and enhancing the FAIRness of clinical trials.</Pgraph><Pgraph>The authors declare that they have no competing interests.</Pgraph><Pgraph>The authors declare that an ethics committee vote is not required.</Pgraph></TextBlock>
    <References linked="yes">
      <Reference refNo="5">
        <RefAuthor>Anonym</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>CrossRef API</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>CrossRef API. &#91;cited 2025 Apr 24&#93;. Available from: https:&#47;&#47;api.crossref.org&#47;swagger-ui&#47;index.html</RefTotal>
        <RefLink>https:&#47;&#47;api.crossref.org&#47;swagger-ui&#47;index.html</RefLink>
      </Reference>
      <Reference refNo="2">
        <RefAuthor>Franzen DL</RefAuthor>
        <RefAuthor>Carlisle BG</RefAuthor>
        <RefAuthor>Salholz-Hillel M</RefAuthor>
        <RefAuthor>Riedel N</RefAuthor>
        <RefAuthor>Strech D</RefAuthor>
        <RefTitle>Institutional dashboards on clinical trial transparency for University Medical Centers: A case study</RefTitle>
        <RefYear>2023</RefYear>
        <RefJournal>PLoS Med</RefJournal>
        <RefPage>e1004175</RefPage>
        <RefTotal>Franzen DL, Carlisle BG, Salholz-Hillel M, Riedel N, Strech D. Institutional dashboards on clinical trial transparency for University Medical Centers: A case study. Naudet F, editor. PLoS Med. 2023 Mar 21;20(3):e1004175.</RefTotal>
      </Reference>
      <Reference refNo="4">
        <RefAuthor>Anonym</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>kermitt2&#47;grobid: A machine learning software for extracting information from scholarly documents</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>kermitt2&#47;grobid: A machine learning software for extracting information from scholarly documents. &#91;cited 2025 Apr 24&#93;. Available from: https:&#47;&#47;github.com&#47;kermitt2&#47;grobid</RefTotal>
        <RefLink>https:&#47;&#47;github.com&#47;kermitt2&#47;grobid</RefLink>
      </Reference>
      <Reference refNo="3">
        <RefAuthor>Salholz-Hillel M</RefAuthor>
        <RefAuthor>Strech D</RefAuthor>
        <RefAuthor>Carlisle BG</RefAuthor>
        <RefTitle>Results publications are inadequately linked to trial registrations: An automated pipeline and evaluation of German university medical centers</RefTitle>
        <RefYear>2022</RefYear>
        <RefJournal>Clinical Trials</RefJournal>
        <RefPage>337&#8211;46</RefPage>
        <RefTotal>Salholz-Hillel M, Strech D, Carlisle BG. Results publications are inadequately linked to trial registrations: An automated pipeline and evaluation of German university medical centers. Clinical Trials. 2022 Jun;19(3):337&#8211;46.</RefTotal>
      </Reference>
      <Reference refNo="1">
        <RefAuthor>Darms J</RefAuthor>
        <RefAuthor>Clemens V</RefAuthor>
        <RefAuthor>Gonzalez-Ocanto M</RefAuthor>
        <RefAuthor>Br&#252;nings-Kuppe C</RefAuthor>
        <RefAuthor>Cici S</RefAuthor>
        <RefAuthor>Fluck J</RefAuthor>
        <RefTitle>The German Central Health Study Hub &#8211; A Service to Find and Publish Clinical, Public Health and Epidemiolocal Studies and Associated Documents</RefTitle>
        <RefYear>2024</RefYear>
        <RefBookTitle>German Medical Data Sciences 2024. Health &#8211; Thinking, Researching and Acting Together. Proceedings of the 69th Annual Meeting of the German Association of Medical Informatics, Biometry, and Epidemiology e.V. (gmds) 2024 in Dresden, Germany</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Darms J, Clemens V, Gonzalez-Ocanto M, Br&#252;nings-Kuppe C, Cici S, Fluck J. The German Central Health Study Hub &#8211; A Service to Find and Publish Clinical, Public Health and Epidemiolocal Studies and Associated Documents. In: R&#246;hrig R, Grabe N, H&#252;bner UH, Jung K, Sax U, Schmidt CO, et al, editors. German Medical Data Sciences 2024. Health &#8211; Thinking, Researching and Acting Together. Proceedings of the 69th Annual Meeting of the German Association of Medical Informatics, Biometry, and Epidemiology e.V. (gmds) 2024 in Dresden, Germany. IOS Press; 2024. (Studies in Health Technology and Informatics; 317). DOI: 10.3233&#47;SHTI240847</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.3233&#47;SHTI240847</RefLink>
      </Reference>
    </References>
    <Media>
      <Tables>
        <NoOfTables>0</NoOfTables>
      </Tables>
      <Figures>
        <NoOfPictures>0</NoOfPictures>
      </Figures>
      <InlineFigures>
        <NoOfPictures>0</NoOfPictures>
      </InlineFigures>
      <Attachments>
        <NoOfAttachments>0</NoOfAttachments>
      </Attachments>
    </Media>
  </OrigData>
</GmsArticle>