Add EvilTwin optimizer for evil twin prompt optimization #7893


Closed · wants to merge 20 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -44,6 +44,7 @@ build/
*.egg-info/
# *.jsonl
# *.json
gcg_log.json
!/data/*.json
/dist/
# **/*.pkl
@@ -63,5 +64,7 @@ docs/docs/**/*.json*
*.index
*.pkl
*.tar.gz
dspy_env/
play.ipynb

test_before_pypi/
43 changes: 43 additions & 0 deletions docs/docs/learn/optimization/optimizers.md
@@ -201,7 +201,50 @@ optimized_program = teleprompter.compile(YOUR_PROGRAM_HERE, trainset=YOUR_TRAINS
```

An informal run similar to this on DSPy 2.5.29 raises GPT-4o-mini's score from 66% to 87%.

=== "Optimizing prompts with EvilTwin"
The `EvilTwin` optimizer generates **evil twin prompts**—inputs that may appear garbled or obfuscated but still induce similar outputs in a DSPy program. This is based on the ["Prompts have evil twins"](https://arxiv.org/abs/2311.07064) paper. It uses the **Greedy Coordinate Gradient (GCG)** algorithm to iteratively modify a prompt while minimizing KL divergence from the original response distribution.

EvilTwin is useful for exploring the resilience of language models to perturbations, identifying potential vulnerabilities, or simply generating non-human-like prompts that behave similarly to natural prompts.

```python linenums="1"
from dspy.teleprompt.evil_twin import EvilTwin

# Declare DSPy module
predictor = dspy.Predict('question -> answer')

q = "Describe the definition of artificial intelligence in one sentence."

# Construct the optimizer, passing the prompt via the module's input field name as a keyword argument.
optimizer = EvilTwin(question=q)
optimized_predictor = optimizer.compile(program=predictor)

# Retrieve the final optimized (evil twin) prompt
print("Optimized Evil Twin Prompt:", optimizer.optimized_prompt)

# Test outputs
original_response = predictor(question=q)
evil_twin_response = optimized_predictor(question=q)

print("Original Output:", original_response.answer)
print("Evil Twin Output:", evil_twin_response.answer)
```

**How it works**
EvilTwin first runs the DSPy program with the given prompt to **sample output documents**. It then computes the **log-probability of those outputs** under the local model and iteratively replaces tokens in the prompt to minimize the **KL divergence** between the responses induced by the original prompt and the candidate prompt.
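As a rough illustration of this greedy coordinate search, here is a toy, self-contained sketch (not EvilTwin's actual implementation): the real GCG algorithm proposes its `top_k` candidate swaps from the gradient of the loss with respect to token embeddings and scores them by KL divergence from the local model's logits, whereas this toy samples candidates at random and uses a stand-in scoring function.

```python
import random

def gcg_step(prompt_tokens, vocab, score, top_k=4):
    """One greedy-coordinate step: try top_k candidate token swaps at a
    random position and keep the candidate with the lowest score."""
    pos = random.randrange(len(prompt_tokens))
    best, best_score = prompt_tokens, score(prompt_tokens)
    for tok in random.sample(vocab, min(top_k, len(vocab))):
        cand = prompt_tokens[:pos] + [tok] + prompt_tokens[pos + 1:]
        if score(cand) < best_score:
            best, best_score = cand, score(cand)
    return best, best_score

# Toy objective: token mismatches against a hidden target stand in for the
# KL divergence EvilTwin actually computes from the local model's logits.
target = ["describe", "ai", "briefly"]
vocab = target + ["the", "cat", "sat"]
score = lambda toks: sum(a != b for a, b in zip(toks, target))

random.seed(0)
prompt = ["the", "cat", "sat"]
for _ in range(50):  # plays the role of n_epochs
    prompt, best_score = gcg_step(prompt, vocab, score)
print(prompt, best_score)  # greedy: the score never increases across steps
```

Because each step only accepts strict improvements, the loop monotonically drives the score down, mirroring how EvilTwin's epochs progressively reduce the KL objective.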

**Why EvilTwin requires a local model (GPU recommended)**
Unlike other DSPy optimizers, EvilTwin **does not use API-based LLMs** (e.g., OpenAI, Anthropic) because its algorithm **requires access to model internals**—including **gradients, logits, and token likelihoods**, which APIs do not expose. Instead, EvilTwin runs a **local model** (default: `"EleutherAI/gpt-neo-125M"`) that allows full control over token replacements and optimization. Note that the DSPy program/module itself runs with your configured LM via the API as normal to generate its outputs.
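The internals in question boil down to raw logits: given them, a per-token log-likelihood is just a log-softmax lookup, which hosted APIs generally do not expose for arbitrary prompts. A minimal sketch of that computation in plain Python (no real model, a hypothetical 4-token vocabulary):

```python
import math

def token_log_prob(logits, token_id):
    """Log-probability of one token: log-softmax over the vocabulary logits."""
    log_z = math.log(sum(math.exp(x) for x in logits))  # log partition function
    return logits[token_id] - log_z

# Toy 4-token vocabulary; exponentiating the log-probs recovers a distribution.
logits = [2.0, 1.0, 0.5, -1.0]
probs = [math.exp(token_log_prob(logits, i)) for i in range(len(logits))]
print(probs)  # sums to 1; the highest logit gets the highest probability
```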

Running EvilTwin on a **CPU is possible but slow**, especially for large models or high `n_epochs`. If a GPU is available, it will be used automatically.

**Customization options:**
- `n_epochs`: Number of optimization iterations (default: 500).
- `batch_size`: Number of samples evaluated per iteration (default: 5).
- `top_k`: Number of token candidates considered for replacement per iteration (default: 256).
- `gamma`: Fluency penalty coefficient, controlling preference for natural-looking prompts (default: 0.0).
- `local_model_name`: The Hugging Face model used for tokenization and log probability estimation (default: `"EleutherAI/gpt-neo-125M"`).
- **(Planned feature)** Warm start initialization, as described in the Evil Twins paper.
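The `gamma` knob trades the KL term off against a fluency penalty: prompts the local model itself finds likely (higher log-probability, lower perplexity) look more natural. A hedged sketch of how such a combined objective might be formed; the function and variable names here are illustrative, not EvilTwin's internals.

```python
def objective(kl_divergence, prompt_log_prob, gamma=0.0):
    """Per-candidate loss: the KL term plus a fluency penalty.
    gamma = 0.0 (the default) optimizes KL alone; larger gamma prefers
    prompts with higher log-probability under the local model."""
    return kl_divergence - gamma * prompt_log_prob

# With gamma = 0 the fluency term is ignored entirely.
kl_only = objective(kl_divergence=0.8, prompt_log_prob=-42.0)
# With gamma > 0, an unlikely (very negative log-prob) prompt is penalized.
with_fluency = objective(kl_divergence=0.8, prompt_log_prob=-42.0, gamma=0.1)
print(kl_only, with_fluency)
```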

## Saving and loading optimizer output

2 changes: 2 additions & 0 deletions dspy/teleprompt/__init__.py
@@ -5,6 +5,7 @@
from dspy.teleprompt.copro_optimizer import COPRO
from dspy.teleprompt.ensemble import Ensemble
from dspy.teleprompt.knn_fewshot import KNNFewShot
from dspy.teleprompt.evil_twin import EvilTwin

from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch
@@ -21,6 +22,7 @@
"BootstrapFinetune",
"COPRO",
"Ensemble",
"EvilTwin",
"KNNFewShot",
"MIPROv2",
"BootstrapFewShotWithRandomSearch",