<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!DOCTYPE GmsArticle SYSTEM "http://www.egms.de/dtd/2.0.34/GmsArticle.dtd">
<GmsArticle xmlns:xlink="http://www.w3.org/1999/xlink">
  <MetaData>
    <Identifier>25gmds086</Identifier>
    <IdentifierDoi>10.3205/25gmds086</IdentifierDoi>
    <IdentifierUrn>urn:nbn:de:0183-25gmds0864</IdentifierUrn>
    <ArticleType>Meeting Abstract</ArticleType>
    <TitleGroup>
      <Title language="en">Towards automatic inference of agents for cancer treatment from substance data in cancer registry data</Title>
    </TitleGroup>
    <CreatorList>
      <Creator>
        <PersonNames>
          <Lastname>D&#228;hlmann</Lastname>
          <LastnameHeading>D&#228;hlmann</LastnameHeading>
          <Firstname>Klaas</Firstname>
          <Initials>K</Initials>
        </PersonNames>
        <Address>
          <Affiliation>OFFIS - Institute for Information Technology, Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Wolters</Lastname>
          <LastnameHeading>Wolters</LastnameHeading>
          <Firstname>Timo</Firstname>
          <Initials>T</Initials>
        </PersonNames>
        <Address>
          <Affiliation>OFFIS - Institute for Information Technology, Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Beckhaus</Lastname>
          <LastnameHeading>Beckhaus</LastnameHeading>
          <Firstname>Julia</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>OFFIS - Institute for Information Technology, Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Sauerberg</Lastname>
          <LastnameHeading>Sauerberg</LastnameHeading>
          <Firstname>Markus</Firstname>
          <Initials>M</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Hamburgisches Krebsregister, Hamburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Kusche</Lastname>
          <LastnameHeading>Kusche</LastnameHeading>
          <Firstname>Henrik</Firstname>
          <Initials>H</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Hamburgisches Krebsregister, Hamburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>H&#252;bner</Lastname>
          <LastnameHeading>H&#252;bner</LastnameHeading>
          <Firstname>Joachim</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Klinische Landesauswertungsstelle Niedersachsen (KLast), Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>L&#252;pkes</Lastname>
          <LastnameHeading>L&#252;pkes</LastnameHeading>
          <Firstname>Christian</Firstname>
          <Initials>C</Initials>
        </PersonNames>
        <Address>
          <Affiliation>OFFIS - Institute for Information Technology, Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Hein</Lastname>
          <LastnameHeading>Hein</LastnameHeading>
          <Firstname>Andreas</Firstname>
          <Initials>A</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Universit&#228;t Oldenburg, Abteilung f&#252;r Assistenzsysteme und Medizintechnik, Oldenburg, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
    </CreatorList>
    <PublisherList>
      <Publisher>
        <Corporation>
          <Corporatename>German Medical Science GMS Publishing House</Corporatename>
        </Corporation>
        <Address>D&#252;sseldorf</Address>
      </Publisher>
    </PublisherList>
    <SubjectGroup>
      <SubjectheadingDDB>610</SubjectheadingDDB>
      <Keyword language="en">clinical cancer registration</Keyword>
      <Keyword language="en">Levenshtein distance</Keyword>
      <Keyword language="en">oBDS format</Keyword>
      <Keyword language="en">systemic therapy</Keyword>
    </SubjectGroup>
    <DatePublishedList>
      <DatePublished>20251103</DatePublished>
    </DatePublishedList>
    <Language>engl</Language>
    <License license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
      <AltText language="en">This is an Open Access article distributed under the terms of the Creative Commons Attribution 4.0 License.</AltText>
      <AltText language="de">Dieser Artikel ist ein Open-Access-Artikel und steht unter den Lizenzbedingungen der Creative Commons Attribution 4.0 License (Namensnennung).</AltText>
    </License>
    <SourceGroup>
      <Meeting>
        <MeetingId>M0631</MeetingId>
        <MeetingSequence>086</MeetingSequence>
        <MeetingCorporation>Deutsche Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie</MeetingCorporation>
        <MeetingName>70. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e. V. (GMDS)</MeetingName>
        <MeetingTitle></MeetingTitle>
        <MeetingSession>V: Register</MeetingSession>
        <MeetingCity>Jena</MeetingCity>
        <MeetingDate>
          <DateFrom>20250907</DateFrom>
          <DateTo>20250911</DateTo>
        </MeetingDate>
      </Meeting>
    </SourceGroup>
    <ArticleNo>Abstr. 198</ArticleNo>
  </MetaData>
  <OrigData>
    <TextBlock name="Text" linked="yes">
      <MainHeadline>Text</MainHeadline><Pgraph><Mark1>Introduction:</Mark1> In German cancer registration, diagnostic or treatment events are reported using the oBDS format <TextLink reference="1"></TextLink>. This format defines all relevant data fields, types, and values, except for a few cases such as substances used in systemic therapies, which are reported as free text. These substances are supposed to describe exactly one agent per data field using its generic name, but are sometimes reported under their trade names, contain typos, or exhibit many other issues. For example, the dataset at the KLast (Klinische Landesauswertungsstelle Niedersachsen) <TextLink reference="2"></TextLink> contains 685,228 total substances comprising 8,655 unique values, but only 8&#37; of these unique values properly describe such an agent, leaving 92&#37; to be fixed manually by the documentalists, which is both tedious and time-consuming.</Pgraph><Pgraph>This paper focuses on the automatic inference of agents from free text using actual substance data of the KLast and provides a two-fold contribution: We explore the dataset and characterize typical issues with the reported substances, and we propose a preliminary approach using the Levenshtein distance <TextLink reference="3"></TextLink>, a metric to calculate the difference between two character sequences, to infer the agents.</Pgraph><Pgraph><Mark1>Methods:</Mark1> Initially, we import a work-in-progress dictionary of the tumor best-of task force of the Plattform 65c <TextLink reference="4"></TextLink> that maps known trade names and abbreviations to their correct agents. Then, all unknown substances (e.g. empty strings, variations of &#8216;UNKNOWN&#8217;, &#8216;NULL&#8217;, or &#8216;&#35;NAME&#63;&#8217;) are flagged as such.</Pgraph><Pgraph>Afterwards, if a substance is found in the dictionary with a case-insensitive search, its agent is immediately determined. 
Otherwise, the Levenshtein distances between the given substance and all dictionary values are individually calculated. If only one value has the smallest distance to the given substance, it is considered a match. Otherwise, no match can be determined.</Pgraph><Pgraph><Mark1>Results:</Mark1> We restrict ourselves to small Levenshtein distances of 1 to 3 between given substances and values in the dictionary. Using this approach, 609,655 of the total substances have been resolved, which accounts for 2,021 (23&#37;) of all unique values. When ignoring our self-imposed rule of allowing small distances only, a theoretical maximum of 6,488 (75&#37;) of all unique values may be resolved. Conversely, 2,167 (25&#37;) of all unique values are always unresolvable.</Pgraph><Pgraph><Mark1>Discussion:</Mark1> To avoid misclassifications, we currently advise against larger Levenshtein distances, as substance data occasionally contains several agents, which cannot be identified, leading to erroneously omitted agents. Smaller distances instead mostly indicate typos (e.g. Acazytidin instead of Azacitidin), inversions of phonemes (e.g. Cytabarin instead of Cytarabin), or export artifacts (e.g. &#8220;Dexmethasadon&#8221; instead of Dexamethason), which can be resolved rather reliably.</Pgraph><Pgraph><Mark1>Conclusion:</Mark1> To improve the approach, tokenization of the given substances before calculating Levenshtein distances seems worthwhile to handle cases where more than one agent has been reported at the same time. Moreover, applying phonetic algorithms such as the German Cologne phonetics <TextLink reference="5"></TextLink> may be suitable to address the phonetic issues. Furthermore, the ratio of Levenshtein distance to total length of a given substance as well as the delta between closest and next-closest matches could be incorporated to assess the reliability. 
In any case, the approach should especially avoid false positive matches, since a fallback to human documentalists is still an option.</Pgraph><Pgraph>The authors declare that they have no competing interests.</Pgraph><Pgraph>The authors declare that an ethics committee vote is not required.</Pgraph></TextBlock>
    <References linked="yes">
      <Reference refNo="1">
        <RefAuthor>Anonym</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>Bundeseinheitlicher onkologischer Basisdatensatz. Aktuelle Versionen der Dateien zur XML-Schnittstelle oBDS</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Bundeseinheitlicher onkologischer Basisdatensatz. Aktuelle Versionen der Dateien zur XML-Schnittstelle oBDS. Arbeitsgemeinschaft Deutscher Tumorzentren e.V.; &#91;cited 2025 Apr 16&#93;. Available from: https:&#47;&#47;www.basisdatensatz.de&#47;xml&#47;</RefTotal>
        <RefLink>https:&#47;&#47;www.basisdatensatz.de&#47;xml&#47;</RefLink>
      </Reference>
      <Reference refNo="2">
        <RefAuthor>Klinische Landesauswertungsstelle Niedersachsen</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>Aufgaben der KLast</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Klinische Landesauswertungsstelle Niedersachsen. Aufgaben der KLast. OFFIS CARE GmbH; &#91;cited 2025 Apr 16&#93;. Available from: https:&#47;&#47;www.klast-n.de&#47;aufgaben-ziele.html</RefTotal>
        <RefLink>https:&#47;&#47;www.klast-n.de&#47;aufgaben-ziele.html</RefLink>
      </Reference>
      <Reference refNo="3">
        <RefAuthor>Levenshtein VI</RefAuthor>
        <RefTitle>Binary codes capable of correcting deletions, insertions, and reversals</RefTitle>
        <RefYear>1966</RefYear>
        <RefJournal>Soviet Physics Doklady</RefJournal>
        <RefPage>707-710</RefPage>
        <RefTotal>Levenshtein VI. Binary codes capable of correcting deletions, insertions, and reversals. Soviet Physics Doklady. 1966;10(8):707-710.</RefTotal>
      </Reference>
      <Reference refNo="4">
        <RefAuthor>Anonym</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>Plattform &#167; 65c</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Plattform &#167; 65c. Klinische Krebsregister Sachsen-Anhalt GmbH; &#91;cited 2025 Apr 16&#93;. Available from: https:&#47;&#47;plattform65c.de&#47;</RefTotal>
        <RefLink>https:&#47;&#47;plattform65c.de&#47;</RefLink>
      </Reference>
      <Reference refNo="5">
        <RefAuthor>Postel HJ</RefAuthor>
        <RefTitle>Die K&#246;lner Phonetik. Ein Verfahren zur Identifizierung von Personennamen auf der Grundlage der Gestaltanalyse</RefTitle>
        <RefYear>1969</RefYear>
        <RefJournal>IBM-Nachrichten</RefJournal>
        <RefPage>925-931</RefPage>
        <RefTotal>Postel HJ. Die K&#246;lner Phonetik. Ein Verfahren zur Identifizierung von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten. 1969;19:925-931.</RefTotal>
      </Reference>
    </References>
    <Media>
      <Tables>
        <NoOfTables>0</NoOfTables>
      </Tables>
      <Figures>
        <NoOfPictures>0</NoOfPictures>
      </Figures>
      <InlineFigures>
        <NoOfPictures>0</NoOfPictures>
      </InlineFigures>
      <Attachments>
        <NoOfAttachments>0</NoOfAttachments>
      </Attachments>
    </Media>
  </OrigData>
</GmsArticle>