Skip to content
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ vendor/
phpunit.xml
phpcs.xml
.phpcs.xml
.phpunit.result.cache
.phpunit.cache
47 changes: 47 additions & 0 deletions features/distignore.feature
Original file line number Diff line number Diff line change
Expand Up @@ -379,3 +379,50 @@ Feature: Generate a distribution archive of a project with .distignore
"""
Error: Broken symlink at /symlink. Target missing at
"""

Scenario: Efficiently ignores directories with many files
# Performance test: ensure ignored directories are not scanned
# @see https://github.com/wp-cli/dist-archive-command/issues/XXX
Given an empty directory
And a foo/.distignore file:
"""
node_modules
.git
"""
And a foo/plugin.php file:
"""
<?php
/**
* Plugin Name: Test Plugin
* Version: 1.0.0
*/
"""
And a foo/readme.txt file:
"""
=== Test Plugin ===
"""

When I run `mkdir -p foo/node_modules/package1 foo/node_modules/package2 foo/node_modules/package3`
Then STDERR should be empty

When I run `sh -c 'i=1; while [ $i -le 50 ]; do touch foo/node_modules/package1/file$i.js; i=$((i+1)); done'`
Then STDERR should be empty

When I run `sh -c 'i=1; while [ $i -le 50 ]; do touch foo/node_modules/package2/file$i.js; i=$((i+1)); done'`
Then STDERR should be empty

When I run `sh -c 'i=1; while [ $i -le 50 ]; do touch foo/node_modules/package3/file$i.js; i=$((i+1)); done'`
Then STDERR should be empty

When I run `wp dist-archive foo`
Then STDOUT should match /^Success: Created foo\.[^ ]+ \(Size: \d+(?:\.\d*)? [a-zA-Z]{1,3}\)$/
And STDERR should be empty

When I run `rm -rf foo`
Then the foo directory should not exist

When I try `unzip foo.*.zip`
Then the foo directory should exist
And the foo/plugin.php file should exist
And the foo/readme.txt file should exist
And the foo/node_modules directory should not exist
26 changes: 26 additions & 0 deletions phpunit.xml.dist
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/4.8/phpunit.xsd"
bootstrap="vendor/autoload.php"
backupGlobals="false"
beStrictAboutCoversAnnotation="true"
beStrictAboutOutputDuringTests="true"
beStrictAboutTestsThatDoNotTestAnything="true"
beStrictAboutTodoAnnotatedTests="true"
convertErrorsToExceptions="true"
convertWarningsToExceptions="true"
convertNoticesToExceptions="true"
convertDeprecationsToExceptions="true"
colors="true"
verbose="true">
<testsuites>
<testsuite name="wp-cli/dist-archive-command tests">
<directory suffix="Test.php">tests</directory>
</testsuite>
</testsuites>

<coverage processUncoveredFiles="false">
<include>
<directory suffix=".php">src</directory>
</include>
</coverage>
</phpunit>
50 changes: 30 additions & 20 deletions src/Dist_Archive_Command.php
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,11 @@ protected function is_path_contains_symlink( $source_dir_path ) {
private function get_file_list( $source_dir_path, $excluded = false ) {

$included_files = [];
$excluded_files = [];

$iterator = new RecursiveIteratorIterator(
new RecursiveDirectoryIterator( $source_dir_path, RecursiveDirectoryIterator::SKIP_DOTS ),
$directory_iterator = new RecursiveDirectoryIterator( $source_dir_path, RecursiveDirectoryIterator::SKIP_DOTS );
$filter_iterator = new Distignore_Filter_Iterator( $directory_iterator, $this->checker, $source_dir_path );
$iterator = new RecursiveIteratorIterator(
$filter_iterator,
RecursiveIteratorIterator::SELF_FIRST
);

Expand All @@ -505,34 +506,43 @@ private function get_file_list( $source_dir_path, $excluded = false ) {
*/
foreach ( $iterator as $item ) {
$relative_filepath = str_replace( $source_dir_path, '', $item->getPathname() );
try {
if ( $this->checker->isPathIgnored( $relative_filepath ) ) {
$excluded_files[] = $relative_filepath;
} else {
$included_files[] = $relative_filepath;
}
} catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) {

// Check if this item had an error during filtering.
$error = $filter_iterator->getErrorForItem( $relative_filepath );
if ( $error ) {
if ( $item->isLink() && ! file_exists( (string) readlink( $item->getPathname() ) ) ) {
WP_CLI::error( "Broken symlink at {$relative_filepath}. Target missing at {$item->getLinkTarget()}." );
} else {
WP_CLI::error( $exception->getMessage() );
WP_CLI::error( $error->getMessage() );
}
}
}

// Check all excluded directories and remove them from the excluded list if they contain included files.
foreach ( $excluded_files as $excluded_file_index => $excluded_relative_path ) {
if ( ! is_dir( $source_dir_path . $excluded_relative_path ) ) {
continue;
// Check if this item is ignored (directories may still be yielded even if ignored).
if ( ! $filter_iterator->isPathIgnoredCached( $relative_filepath ) ) {
$included_files[] = $relative_filepath;
}
foreach ( $included_files as $included_relative_path ) {
if ( 0 === strpos( $included_relative_path, $excluded_relative_path ) ) {
unset( $excluded_files[ $excluded_file_index ] );
}

if ( $excluded ) {
// Get excluded files from the filter iterator.
$excluded_files = $filter_iterator->getExcludedFiles();

// Check all excluded directories and remove them from the excluded list if they contain included files.
foreach ( $excluded_files as $excluded_file_index => $excluded_relative_path ) {
if ( ! is_dir( $source_dir_path . $excluded_relative_path ) ) {
continue;
}
foreach ( $included_files as $included_relative_path ) {
if ( 0 === strpos( $included_relative_path, $excluded_relative_path ) ) {
unset( $excluded_files[ $excluded_file_index ] );
}
}
}

return $excluded_files;
}

return $excluded ? $excluded_files : $included_files;
return $included_files;
}

/**
Expand Down
229 changes: 229 additions & 0 deletions src/Distignore_Filter_Iterator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
<?php

use Inmarelibero\GitIgnoreChecker\GitIgnoreChecker;

/**
* Filter iterator that skips descending into ignored directories to improve performance.
*
* This filter prevents RecursiveIteratorIterator from descending into
* directories that are marked as ignored in .distignore, avoiding unnecessary
* iteration through thousands of files in directories like node_modules.
* However, it still yields the ignored directories themselves so they can
* be properly tracked in exclude lists.
*/
class Distignore_Filter_Iterator extends RecursiveFilterIterator {
	/**
	 * Checker used to decide whether a relative path is ignored.
	 *
	 * @var GitIgnoreChecker
	 */
	private $checker;

	/**
	 * Base directory; stripped from each item's pathname to build the relative path.
	 *
	 * @var string
	 */
	private $source_dir_path;

	/**
	 * Cache for ignored status to avoid duplicate checks.
	 *
	 * Shared by reference with every child iterator created in getChildren().
	 *
	 * @var array<string, bool>
	 */
	private $ignored_cache = [];

	/**
	 * List of excluded file paths (relative), in the order they were visited.
	 *
	 * Shared by reference with every child iterator created in getChildren().
	 *
	 * @var string[]
	 */
	private $excluded_files = [];

	/**
	 * Exceptions raised by the checker, keyed by relative path.
	 *
	 * Shared by reference with every child iterator created in getChildren().
	 *
	 * @var array<string, \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException>
	 */
	private $error_items = [];

	/**
	 * Constructor.
	 *
	 * @param RecursiveIterator<string, SplFileInfo> $iterator        Iterator to filter.
	 * @param GitIgnoreChecker                       $checker         GitIgnore checker instance.
	 * @param string                                 $source_dir_path Base directory path.
	 */
	public function __construct( RecursiveIterator $iterator, GitIgnoreChecker $checker, $source_dir_path ) {
		parent::__construct( $iterator );
		$this->checker         = $checker;
		$this->source_dir_path = $source_dir_path;
	}

	/**
	 * Check whether the current element of the iterator is acceptable.
	 *
	 * Every ignored path is recorded in the excluded list. Ignored files are
	 * then dropped from the iteration, while ignored directories are still
	 * yielded so the consumer can see them; whether they are descended into
	 * is decided separately by hasChildren(). Items for which the checker
	 * throws are yielded too, with the exception stored for getErrorForItem().
	 *
	 * @return bool True if the element should be included, false otherwise.
	 */
	#[\ReturnTypeWillChange]
	public function accept() {
		/** @var SplFileInfo $item */
		$item              = $this->current();
		$relative_filepath = $this->getRelativePath( $item );

		try {
			if ( $this->isPathIgnoredCached( $relative_filepath ) ) {
				$this->excluded_files[] = $relative_filepath;

				// Ignored files are filtered out; ignored directories stay visible.
				return $item->isDir();
			}
		} catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) {
			// Keep the error and yield the item so the consumer can report it.
			$this->error_items[ $relative_filepath ] = $exception;
		}

		return true;
	}

	/**
	 * Check if a path is ignored, with caching to avoid duplicate checks.
	 *
	 * Only successful results are cached; if the checker throws, nothing is
	 * stored and the next call for the same path asks the checker again.
	 *
	 * @param string $relative_filepath Relative file path to check.
	 * @return bool True if the path is ignored, false otherwise.
	 * @throws \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException
	 */
	public function isPathIgnoredCached( $relative_filepath ) {
		if ( ! isset( $this->ignored_cache[ $relative_filepath ] ) ) {
			$this->ignored_cache[ $relative_filepath ] = $this->checker->isPathIgnored( $relative_filepath );
		}
		return $this->ignored_cache[ $relative_filepath ];
	}

	/**
	 * Check whether the current element has children that should be recursed into.
	 *
	 * The wrapped iterator is consulted first, so its own rules (for example
	 * RecursiveDirectoryIterator not entering symlinked directories unless
	 * FOLLOW_SYMLINKS is set) keep applying. On top of that, descent is skipped
	 * for an ignored directory only when it sits directly under the source
	 * directory AND a probe child path inside it is reported as ignored as well.
	 * Nested ignored directories are always descended into, so patterns such as
	 * `frontend/*` combined with `!/frontend/build/` are still evaluated per item.
	 *
	 * @return bool True if we should descend into this directory, false otherwise.
	 */
	#[\ReturnTypeWillChange]
	public function hasChildren() {
		// Never descend where the inner iterator itself would not.
		if ( ! parent::hasChildren() ) {
			return false;
		}

		/** @var SplFileInfo $item */
		$item              = $this->current();
		$relative_filepath = $this->getRelativePath( $item );

		try {
			if ( ! $this->isPathIgnoredCached( $relative_filepath ) ) {
				// Not ignored, so descend.
				return true;
			}

			// A slash left after trimming means the directory is nested: stay conservative.
			if ( false !== strpos( trim( $relative_filepath, '/' ), '/' ) ) {
				return true;
			}

			// 'test' is only a probe name: it checks that the ignore rule also covers
			// entries inside the directory before the whole subtree is skipped.
			return ! $this->isPathIgnoredCached( $relative_filepath . '/test' );
		} catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) {
			// On any checker error, descend to be safe; accept() records per-item errors.
			return true;
		}
	}

	/**
	 * Return the inner iterator's children wrapped in this filter.
	 *
	 * @return RecursiveFilterIterator
	 */
	#[\ReturnTypeWillChange]
	public function getChildren() {
		/** @var RecursiveDirectoryIterator $inner */
		$inner = $this->getInnerIterator();
		$child = new self( $inner->getChildren(), $this->checker, $this->source_dir_path );

		// Bind the child's state to this instance's arrays by reference so
		// results accumulate across all levels of the tree.
		$child->excluded_files = &$this->excluded_files;
		$child->ignored_cache  = &$this->ignored_cache;
		$child->error_items    = &$this->error_items;

		return $child;
	}

	/**
	 * Get the list of excluded paths recorded so far by accept().
	 *
	 * @return string[]
	 */
	public function getExcludedFiles() {
		return $this->excluded_files;
	}

	/**
	 * Get the checker error recorded for an item, if any.
	 *
	 * @param string $relative_filepath Relative file path to check.
	 * @return \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException|null
	 */
	public function getErrorForItem( $relative_filepath ) {
		return $this->error_items[ $relative_filepath ] ?? null;
	}

	/**
	 * Build the path of an item relative to the source directory.
	 *
	 * @param SplFileInfo $item Item being visited.
	 * @return string Pathname with the source directory prefix removed, or the
	 *                unchanged pathname when it does not start with that prefix.
	 */
	private function getRelativePath( SplFileInfo $item ) {
		$pathname = $item->getPathname();

		if ( 0 === strpos( $pathname, $this->source_dir_path ) ) {
			return substr( $pathname, strlen( $this->source_dir_path ) );
		}

		return $pathname;
	}
}
Loading
Loading