Skip to content
120 changes: 117 additions & 3 deletions lib/EmailObfuscator.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public static function obfuscate($content) {

// Ersetze E-Mailadressen
if (!$emailobfuscator->getConfig('mailto_only')) {
$content = preg_replace_callback('/(?<![\/\w])([\w\-\+\.]+)@([\w\-\.]+\.[\w]{2,})(?![\w\/])/', 'emailobfuscator::encodeEmailUnicorn', $content);
$content = self::obfuscateEmailsNotInAttributes($content);
}

// Injiziere CSS vors schließende </head> im Seitenkopf
Expand Down Expand Up @@ -150,6 +150,70 @@ private static function encodeEmailLinksUnicorn($matches) {
return 'javascript:decryptUnicorn(' . $mail . ')';
}

/**
 * Obfuscate e-mail addresses in text content, leaving those that start inside an HTML tag untouched.
 *
 * Every span matched by `<[^>]*>` is treated as a tag. An address whose first character lies
 * inside such a span (typically an attribute value such as src, href or data-*) is skipped.
 * Addresses found in $_POST on POST requests, or in the class whitelist, are skipped as well.
 * All other addresses get their "@" replaced by the "unicorn" span markup.
 *
 * All matching is done against the unmodified input and the output is assembled piece by piece,
 * so the recorded tag offsets stay valid no matter how many replacements are made.
 *
 * Limitations:
 * - This is not an HTML parser. A literal ">" inside an attribute value ends the detected tag
 *   early, so an address following it within the same tag is treated as text content.
 * - Only the start position of the address is tested against the tag ranges.
 * - Filename-like strings in plain text (e.g. "image@2x.png") still match the pattern and are
 *   obfuscated; only occurrences inside tags are protected.
 *
 * @param string $content Content to process
 * @return string Processed content
 */
private static function obfuscateEmailsNotInAttributes($content) {
    $pattern = '/(?<![\/\w])([\w\-\+\.]+)@([\w\-\.]+\.[\w]{2,})(?![\w\/])/';

    // Collect all candidate addresses up front; return early when there is none (or PCRE failed).
    if (!preg_match_all($pattern, $content, $emailMatches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        return $content;
    }

    // Half-open byte ranges [start, end) of every tag, in ascending order.
    $tagRanges = [];
    if (preg_match_all('/<[^>]*>/', $content, $tagMatches, PREG_OFFSET_CAPTURE)) {
        foreach ($tagMatches[0] as $tagMatch) {
            $tagRanges[] = [$tagMatch[1], $tagMatch[1] + strlen($tagMatch[0])];
        }
    }
    $tagCount = count($tagRanges);
    $tagIndex = 0;

    // Loop-invariant: submitted form values are only consulted on POST requests.
    $isPost = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' && isset($_POST);

    $result = '';
    $cursor = 0; // end of the part of $content that has already been copied to $result

    foreach ($emailMatches as $match) {
        $email = $match[0][0];
        $pos = $match[0][1];

        // Tags and addresses are both sorted by offset, so tags ending at or before this
        // address can be dropped for good instead of rescanning the whole list per match.
        while ($tagIndex < $tagCount && $tagRanges[$tagIndex][1] <= $pos) {
            $tagIndex++;
        }
        $insideTag = $tagIndex < $tagCount && $tagRanges[$tagIndex][0] <= $pos;

        $shouldObfuscate = !$insideTag;
        if ($shouldObfuscate) {
            // Whitelist: values the visitor just submitted and explicitly whitelisted addresses
            if (($isPost && self::in_array_r($email, $_POST)) || self::in_array_r($email, self::$whitelist)) {
                $shouldObfuscate = false;
            }
        }

        // Copy the untouched text between the previous match and this one.
        $result .= substr($content, $cursor, $pos - $cursor);

        if ($shouldObfuscate) {
            $result .= $match[1][0] . '<span class="unicorn"><span>_at_</span></span>' . $match[2][0];
        } else {
            $result .= $email;
        }

        $cursor = $pos + strlen($email);
    }

    // Append whatever follows the last address.
    $result .= (string) substr($content, $cursor);

    return $result;
}

/**
* Encode E-Mail address
* @param string[] $matches
Expand Down Expand Up @@ -179,8 +243,58 @@ private static function in_array_r($needle, $haystack, $strict = false) {
*/
private static function makeEmailClickable($ret) {
$ret = ' ' . $ret;
// in testing, using arrays here was found to be faster
$ret = preg_replace_callback('#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', 'emailobfuscator::make_email_clickable_callback', $ret);

// Precompute all HTML tag ranges in the content
$tagRanges = [];
if (preg_match_all('/<[^>]*>/', $ret, $tagMatches, PREG_OFFSET_CAPTURE)) {
foreach ($tagMatches[0] as $tagMatch) {
$tagStart = $tagMatch[1];
$tagEnd = $tagStart + strlen($tagMatch[0]);
$tagRanges[] = [$tagStart, $tagEnd];
}
}

// Process emails but skip those in HTML attributes
$pattern = '#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i';
$offset = 0;

while (preg_match($pattern, $ret, $matches, PREG_OFFSET_CAPTURE, $offset)) {
$fullMatch = $matches[0][0];
$pos = $matches[0][1];
$email = $matches[2][0] . '@' . $matches[3][0];

// Skip retina image patterns like @2x.png, @3x.jpg, etc.
if (preg_match('/^[^@]+@\d+x\./i', $email)) {
$offset = $pos + strlen($fullMatch);
continue;
}

// Check if the email is inside any HTML tag
$shouldMakeClickable = true;
foreach ($tagRanges as $range) {
if ($pos >= $range[0] && $pos < $range[1]) {
$shouldMakeClickable = false;
break;
}
}

if ($shouldMakeClickable) {
// Check whitelist for consistency with obfuscateEmailsNotInAttributes
if (self::in_array_r($email, self::$whitelist)) {
$shouldMakeClickable = false;
}
}

if ($shouldMakeClickable) {
// Make clickable
$replacement = $matches[1][0] . "<a href=\"mailto:$email\">$email</a>";
$ret = substr_replace($ret, $replacement, $pos, strlen($fullMatch));
$offset = $pos + strlen($replacement);
} else {
// Skip this match
$offset = $pos + strlen($fullMatch);
}
}

// this one is not in an array because we need it to run last, for cleanup of accidental links within links
$ret = preg_replace("#(<a( [^>]+?>|>))<a [^>]+?>([^>]+?)</a></a>#i", "$1$3</a>", $ret);
Expand Down