index.html

<!DOCTYPE html>
<!-- lang lets screen readers and search engines identify the page language -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners; fill these in appropriately, as they are your "business card" -->
  <!-- Replace the content attribute with appropriate information -->
  <meta name="description" content="Zero-shot clinical trial patient matching with LLMs">
  <meta property="og:title" content="Zero-shot clinical trial patient matching with LLMs">
  <meta property="og:description" content="Zero-shot clinical trial patient matching with LLMs">
  <meta property="og:url" content="https://clinicaltrialmatch.stanford.edu/">
  <!-- Path to the banner image; the file should exist at the path listed below. Optimal dimensions are 1200x630 -->
  <!-- NOTE(review): this is still the template placeholder file name, and it points at static/image/ while every other asset on this page is under static/images/. Confirm the intended file. -->
  <meta property="og:image" content="static/image/your_banner_image.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">


  <meta name="twitter:title" content="Zero-shot clinical trial patient matching with LLMs">
  <meta name="twitter:description" content="Zero-shot clinical trial patient matching with LLMs">
  <!-- Path to the banner image; the file should exist at the path listed below. Optimal dimensions are 1200x600 -->
  <!-- NOTE(review): template placeholder file name. Confirm the intended file. -->
  <meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by -->
  <meta name="keywords" content="clinical trials, llms, zero-shot, patient matching">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>Zero-Shot Trial Matching with LLMs</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins, icon fonts, then the page's own rules last -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts load in document order, with index.js last; presumably index.js relies on the libraries above it, so keep this order (verify against index.js) -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Title block: paper title, author list, affiliation/venue, and resource links.
       Unmatched closing tags (extra </span> after several authors and an extra
       </a></span> after the link buttons) were removed; the parser already ignored
       them, so the resulting DOM is unchanged. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">Zero-Shot Clinical Trial Patient Matching with LLMs</h1>

            <!-- Paper authors; <sup>*</sup> marks the authors covered by the equal-contribution note below -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://michaelwornow.net/" target="_blank">Michael Wornow</a><sup>*</sup>,</span>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/ale9806/" target="_blank">Alejandro Lozano</a><sup>*</sup>,</span>
              <span class="author-block">
                <a href="https://profiles.stanford.edu/debadutta-dash" target="_blank">Dev Dash</a>,</span>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/jenellejindal/" target="_blank">Jenelle Jindal</a>,</span>
              <span class="author-block">
                <a href="https://profiles.stanford.edu/kenneth-mahaffey" target="_blank">Kenneth W. Mahaffey</a>,</span>
              <span class="author-block">
                <a href="https://med.stanford.edu/profiles/nigam-shah" target="_blank">Nigam H. Shah</a></span>
            </div>

            <!-- Affiliation, venue, and the equal-contribution footnote -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Stanford University<br>NEJM AI 2024</span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- arXiv PDF link -->
                <span class="link-block">
                  <a href="https://arxiv.org/pdf/2402.05125.pdf" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- GitHub link -->
                <span class="link-block">
                  <a href="https://github.com/som-shahlab/clinical_trial_patient_matching" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Teaser: Figure 1 followed by the TL;DR caption -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- alt text summarizes the two panels described in the caption below -->
        <img src="static/images/figure_1.png" alt="Overview of the two system designs: (a) the patient's entire set of notes is injected into the Assessment LLM prompt; (b) a two-stage retrieval pipeline injects only the top-k most relevant note chunks into the prompt">
        <div class="content has-text-justified">
          <!-- The leading and trailing <br> pad the caption vertically.
               A second, unmatched </p> used to follow this paragraph; the parser
               turned it into an empty <p>, so it has been removed. -->
          <p>
            <br>
            <b>TL;DR: </b>We explore zero-shot clinical trial patient matching with large language models (LLMs)  under two system designs (traditional prompting vs reduced prompting via retrieval augmentation):
            (a) We inject the patient's entire set of notes into a prompt input into an Assessment LLM (e.g. GPT-4) for evaluation. (b) In our two-stage retrieval pipeline, we first query the top-k most relevant chunks from the patient's notes, then inject only those top-k chunks into the prompt input into an Assessment LLM. Both paradigms are compared using the same prompting strategies, with the only distinction being the amount of patient information included in the prompt.
            <br>
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- End teaser image -->

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- Single centered column holding the heading and the abstract text -->
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <!-- One sentence per source line; the line breaks collapse to single spaces when rendered -->
          <p>
            Matching patients to clinical trials is a key unsolved challenge in bringing new drugs to market.
            Today, identifying patients who meet a trial's eligibility criteria is highly manual, taking up to 1 hour per patient.
            LLMs offer a promising solution.
            In this work, we explore their application to trial matching.
            First, we design an LLM-based system which, given a patient's medical history as unstructured clinical text, evaluates whether that patient meets a set of inclusion criteria.
            Our zero-shot system achieves state-of-the-art scores on the n2c2 2018 cohort selection benchmark.
            Second, we improve the data and cost efficiency of our method by identifying a prompting strategy which matches patients an order of magnitude faster and more cheaply than the status quo, and develop a two-stage retrieval pipeline that reduces the number of tokens processed by up to a third while retaining high performance.
            Third, we evaluate the interpretability of our system by having clinicians evaluate the natural language justifications generated by the LLM for each eligibility decision, and show that it can output coherent explanations for 97% of its correct decisions and 75% of its incorrect ones.
            Our results establish the feasibility of using LLMs to accelerate clinical trial operations.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Results, retrieval pipeline, and interpretability sections -->


<!-- Results: summary paragraph and the Table 1 caption, followed by the Table 1 image -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- Vertical padding above the heading -->
        <br><br>
        <h2 class="title is-3">Results</h2>
        <div class="content has-text-justified">
          <!-- A second, unmatched </p> used to follow this paragraph; the parser
               turned it into an empty <p>, so it has been removed. -->
          <p>
            For our initial zero-shot evaluation, we feed the patient's entire medical history into the LLM and have it predict all criteria at once. All of the models we test are able to fit each patient's history into their context windows (Table 1). Despite not being tuned for trial matching or provided any in-context examples, GPT-4 beats the state-of-the-art by a margin of +6 Macro-F1 and +2 Micro-F1 points.
            <br>
            <br>
            <b>Table 1:</b> Zero-shot 2018 n2c2 benchmark results using the <i>ACIN</i> prompt strategy. We use versions of each model with at least 32k context length, with the exception of GPT-3.5 (limited to 16k tokens) and Llama-3-70b (limited to 8k tokens). Bootstrapped 95% confidence intervals on the test set (1000 samples) are shown in subscript.
          </p>
        </div>
        <!-- The table is an image, so the alt text names what the caption says it contains -->
        <img src="static/images/table1.png" alt="Table 1: zero-shot 2018 n2c2 benchmark results for each model using the ACIN prompt strategy, with bootstrapped 95% confidence intervals">
      </div>
    </div>
  </div>
</section>


<!-- Table 2: caption followed by the table image -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- Vertical padding above the caption -->
        <br><br>
        <div class="content has-text-justified">
          <!-- A second, unmatched </p> used to follow this paragraph; the parser
               turned it into an empty <p>, so it has been removed. -->
          <p>
            <b>Table 2:</b> Performance and efficiency across different prompt strategies. Cost and data efficiency of prompting strategies. Considering one criterion/note at a time improves performance. "Tokens" includes both prompt and completion tokens (i.e. inputs and outputs). "API Calls" is the total number of times the LLM was queried. "Cost" is based on OpenAI’s pricing as of January 25, 2024.
          </p>
        </div>
        <!-- The table is an image, so the alt text names the quantities the caption defines -->
        <img src="static/images/table2.png" alt="Table 2: performance, tokens, API calls, and cost for each prompt strategy">
        <br>
        <br>
      </div>
    </div>
  </div>
</section>


<!-- Retrieval pipeline: summary sentence and the Figure 3 caption, followed by the figure image -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">

      <div class="column is-four-fifths">
        <!-- h2 keeps the heading outline consistent with the other section headings
             (previously an h4 directly after h2 headings); the Bulma "title is-3"
             classes are unchanged. The <br> pads the heading vertically. -->
        <h2 class="title is-3">   <br>  Retrieval Pipeline</h2>

        <div class="content has-text-justified">
          <!-- A second, unmatched </p> used to follow this paragraph; the parser
               turned it into an empty <p>, so it has been removed. -->
          <p>

We are able to surpass the prior state-of-the-art on Macro-F1 using roughly one-third and one-half as many tokens as needed in the vanilla ICAN and ACAN strategies, respectively (using a patient's  full note).

<br>
<br>


            <br>

            <b>Figure 3: </b>Model performance increases as the number (k) of retrieved documents increases, but quickly
            plateaus with diminishing returns. We test k ∈ {1, 3, 5, 10}. Each subfigure is a different prompting
            strategy. The y-axis is model performance (Macro/Micro-F1) and the x-axis is the total number
            of tokens processed by the model. <b style="color:rgb(255, 149, 0);">Orange </b>is GPT-4, <b style="color:rgb(0, 72, 255);">blue</b> is GPT-3.5, and the <b style="color:rgb(16, 148, 44);">green </b> line is the
            prior state-of-the-art. Stars represent each model’s best performance when feeding in all notes.
            The MiniLM embedding model is the dashed line, while BGE is the solid line.
            <br>
          </p>
        </div>
        <!-- alt text summarizes the axes and series described in the caption above -->
        <img src="static/images/retrieval.png" alt="Figure 3: model performance (Macro/Micro-F1) versus total tokens processed for each prompting strategy as the number k of retrieved documents increases, for GPT-4 and GPT-3.5 against the prior state-of-the-art">
      </div>
    </div>
  </div>
</section>

<!-- Interpretability: clinician evaluation of GPT-4 rationales and the Figure 4 caption, followed by the figure image -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- h2 keeps the heading outline consistent with the other section headings
             (previously an h4); the Bulma "title is-3" classes are unchanged.
             The <br> pads the heading vertically. -->
        <h2 class="title is-3">   <br>   Interpretability</h2>
        <div class="content has-text-justified">
          <!-- Changes in this paragraph:
               - removed the dangling "using the" fragment in the first sentence;
               - the text now cites panels 4A/4B, since the only figure here is
                 Figure 4 with A (top) and B (bottom) panels;
               - removed two unmatched </b> tags and an unmatched second </p>. -->
          <p>

We sample 468 rationales generated by GPT-4 and have two clinicians evaluate their veracity. Each rationale was evaluated on a 3-part scale:
<b style="color:rgb(16, 148, 44);">Correct</b>,
<b style="color:rgb(216, 184, 5);">Partially Correct</b>, and
<b style="color:rgb(255, 0, 0);">Incorrect</b>, based on how accurately it aligned to the relevant patient's EHR.
The results show that GPT-4 is able to provide legitimate rationales for most of its decisions. When GPT-4 makes a correct eligibility decision (Figure 4A), 89% of its rationales were judged as fully correct, 8% as partially correct, and 3% as incorrect. When GPT-4 made an incorrect eligibility decision (Figure 4B), its rationales were split 67/8/25%.
            <br>
            <br>
            <b>Figure 4: </b> <i><b>A (top).</b></i> Clinician assessment of the rationales generated by GPT-4 for its <b>correct</b> eligibility decisions. <i><b>B (bottom).</b></i> Clinician assessment of the rationales generated by GPT-4 for its <b>incorrect</b> eligibility decisions.
          </p>
        </div>
        <!-- alt text describes the two panels named in the caption (previously the file name) -->
        <img src="static/images/clinician_rationale.png" alt="Figure 4: clinician assessments of the rationales generated by GPT-4 for its correct (top) and incorrect (bottom) eligibility decisions">
      </div>
    </div>
  </div>
</section>


<!--BibTex citation -->
  <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <!-- The entry sits inside <pre>, so its line breaks and leading spaces render
           verbatim; keep its lines flush with the left margin as written.
           "LLMs" is wrapped in braces so bibliography styles that lowercase titles
           keep the acronym capitalized. -->
      <pre><code>@article{wornow2025zero,
  title={Zero-shot clinical trial patient matching with {LLMs}},
  author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, Kenneth W and Shah, Nigam H},
  journal={NEJM AI},
  volume={2},
  number={1},
  pages={AIcs2400360},
  year={2025},
  publisher={Massachusetts Medical Society}
}</code></pre>
    </div>
  </section>
<!--End BibTex citation -->


  <!-- Footer: template attribution and the site license.
       Text fixes: restored the missing words in "borrow the source code of this
       website", "adopted" corrected to "adapted", and the license link now uses https. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">

            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adapted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license"  href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>

          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>