Skip to content

v2.0.0

Compare
Choose a tag to compare
@SUPERCILEX SUPERCILEX released this 18 Mar 18:08
· 50 commits to master since this release
64d5be9

The quality of directory shapes was dramatically improved.

Results

  • The per-directory file count now follows a much broader, non-normal distribution, resulting in more varied directories.
  • The total file count now converges tightly on the true mean instead of skewing upwards in proportion to the max depth (atypical parameters still result in skew, but the result is at least a normal-ish distribution).

| | Old | New |
|---|---|---|
| Files per directory | image | image |
| Total file count | image | image |

| Total file count | Old | New |
|---|---|---|
| Low file count | image | image |
| High depth | image | image |

Appendix

Sample collection code:

Subject: [PATCH] Upgrade deps
---
Index: src/core/scheduler.rs
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/core/scheduler.rs b/src/core/scheduler.rs
--- a/src/core/scheduler.rs	(revision 6b7e88c4d05ed8fd247178bd9ec86440fa923d5f)
+++ b/src/core/scheduler.rs	(date 1679085314728)
@@ -11,6 +11,7 @@
     core::{
         files::GeneratorTaskOutcome,
         tasks::{QueueErrors, QueueOutcome, TaskGenerator},
+        TOTAL_FILE_COUNT,
     },
     generator::Error,
     utils::{with_dir_name, FastPathBuf},
@@ -165,6 +166,11 @@
         handle_task_result(task, &mut stats)?;
     }
 
+    {
+        use std::io::Write;
+        let mut count = TOTAL_FILE_COUNT.lock().unwrap();
+        writeln!(count, "{}", stats.files).unwrap();
+    }
     Ok(stats)
 }
 
Index: src/core/mod.rs
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/core/mod.rs b/src/core/mod.rs
--- a/src/core/mod.rs	(revision 6b7e88c4d05ed8fd247178bd9ec86440fa923d5f)
+++ b/src/core/mod.rs	(date 1679085426452)
@@ -1,3 +1,9 @@
+use std::{
+    cell::LazyCell,
+    fs::File,
+    sync::{LazyLock, Mutex},
+};
+
 pub use scheduler::{run, GeneratorStats};
 pub use tasks::{DynamicGenerator, GeneratorBytes, StaticGenerator};
 
@@ -5,3 +11,31 @@
 mod files;
 mod scheduler;
 mod tasks;
+
+static FILE_COUNT: LazyLock<Mutex<File>, fn() -> Mutex<File>> = LazyLock::new(|| {
+    Mutex::new(
+        File::options()
+            .create(true)
+            .append(true)
+            .open("file_count.samples")
+            .unwrap(),
+    )
+});
+static DIR_COUNT: LazyLock<Mutex<File>, fn() -> Mutex<File>> = LazyLock::new(|| {
+    Mutex::new(
+        File::options()
+            .create(true)
+            .append(true)
+            .open("dir_count.samples")
+            .unwrap(),
+    )
+});
+static TOTAL_FILE_COUNT: LazyLock<Mutex<File>, fn() -> Mutex<File>> = LazyLock::new(|| {
+    Mutex::new(
+        File::options()
+            .create(true)
+            .append(true)
+            .open("total_file_count.samples")
+            .unwrap(),
+    )
+});
Index: src/lib.rs
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs	(revision 6b7e88c4d05ed8fd247178bd9ec86440fa923d5f)
+++ b/src/lib.rs	(date 1679084880740)
@@ -3,6 +3,7 @@
 #![feature(let_chains)]
 #![feature(const_option)]
 #![feature(inline_const)]
+#![feature(once_cell)]
 #![allow(clippy::multiple_crate_versions)]
 #![allow(clippy::module_name_repetitions)]
 
Index: src/core/tasks.rs
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/core/tasks.rs b/src/core/tasks.rs
--- a/src/core/tasks.rs	(revision 6b7e88c4d05ed8fd247178bd9ec86440fa923d5f)
+++ b/src/core/tasks.rs	(date 1679085314852)
@@ -12,6 +12,7 @@
             PreDefinedGeneratedFileContents,
         },
         files::{create_files_and_dirs, GeneratorTaskOutcome, GeneratorTaskParams},
+        DIR_COUNT, FILE_COUNT,
     },
     utils::FastPathBuf,
 };
@@ -122,6 +123,16 @@
         } else {
             0
         };
+        {
+            use std::io::Write;
+            let mut count = FILE_COUNT.lock().unwrap();
+            writeln!(count, "{}", num_files).unwrap();
+        }
+        {
+            use std::io::Write;
+            let mut count = DIR_COUNT.lock().unwrap();
+            writeln!(count, "{}", num_dirs).unwrap();
+        }
 
         macro_rules! build_params {
             ($file_contents:expr) => {{

Collection:

cargo b --features dry_run
for d in [0, 5, 20] { for n in [10, 10_000, 1_000_000] { for i in 0..1000 { ./target/debug/ftzz g -n $n /tmp/foo -d $d --seed $i out> /dev/null }; mkdir $"($d)d_($n)n"; mv *.samples $"($d)d_($n)n/" } }

Plots:

!unzip samples.zip
import numpy as np
import matplotlib.pyplot as plt
import os

# Plot a histogram for every sample file found in each collection directory
# of the current working directory. Collection directories are the ones
# created by the collection step and are named like "5d_10000n".

# Sample files to plot from each collection directory.
SAMPLE_FILES = ['file_count.samples', 'total_file_count.samples']

# Sorted so the plots are produced in a reproducible order.
for sample_dir in sorted(os.listdir()):
    # Skip anything that is not a collection directory, including regular
    # files whose name merely happens to end in 'n'.
    if not sample_dir.endswith('n') or not os.path.isdir(sample_dir):
        continue

    for sample_file in SAMPLE_FILES:
        path = os.path.join(sample_dir, sample_file)

        # One integer sample per line; blank lines are ignored.
        with open(path, 'r') as f:
            samples = [int(line) for line in f if line.strip()]

        # Nothing to plot (and the mean would be NaN) for an empty file.
        if not samples:
            continue

        mean = np.mean(samples)

        # density=True normalizes the histogram so its area integrates to 1.
        plt.hist(samples, bins=50, density=True, alpha=0.5)

        # Mark the mean of the distribution with a dashed vertical line.
        plt.axvline(x=mean, color='red', linestyle='--')

        plt.xlabel('Values')
        # The y axis shows probability density, not raw counts.
        plt.ylabel('Density')
        plt.title(f'Distribution of samples from {path}')

        plt.show()