Skip to content

Commit

Permalink
Rename ShardedFileInfo as ShardedPath and shorten a few method names.
Browse files Browse the repository at this point in the history

This is so that we can use the same APIs for sharding directories as well.

PiperOrigin-RevId: 690827035
  • Loading branch information
fniksic authored and copybara-github committed Oct 31, 2024
1 parent 345eef1 commit 6e7cc14
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 121 deletions.
6 changes: 3 additions & 3 deletions centipede/analyze_corpora.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ std::vector<CorpusRecord> ReadCorpora(std::string_view binary_name,
std::string(binary_hash), /*my_shard_index=*/0);
std::vector<std::string> corpus_paths;
CHECK_OK(
RemoteGlobMatch(workdir.CorpusFiles().AllShardsGlob(), corpus_paths));
RemoteGlobMatch(workdir.CorpusFilePaths().AllShardsGlob(), corpus_paths));
std::vector<std::string> features_paths;
CHECK_OK(
RemoteGlobMatch(workdir.FeaturesFiles().AllShardsGlob(), features_paths));
CHECK_OK(RemoteGlobMatch(workdir.FeaturesFilePaths().AllShardsGlob(),
features_paths));

CHECK_EQ(corpus_paths.size(), features_paths.size());
std::vector<CorpusRecord> corpus;
Expand Down
28 changes: 14 additions & 14 deletions centipede/centipede.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,

void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) {
std::vector<std::string> sharded_corpus_files;
CHECK_OK(RemoteGlobMatch(WorkDir{env}.CorpusFiles().AllShardsGlob(),
CHECK_OK(RemoteGlobMatch(WorkDir{env}.CorpusFilePaths().AllShardsGlob(),
sharded_corpus_files));
ExportCorpus(sharded_corpus_files, dir);
}
Expand All @@ -158,25 +158,25 @@ void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) {
// Iterate over all shards.
size_t inputs_added = 0;
size_t inputs_ignored = 0;
const auto corpus_files = WorkDir{env}.CorpusFiles();
const auto corpus_file_paths = WorkDir{env}.CorpusFilePaths();
for (size_t shard = 0; shard < env.total_shards; shard++) {
const std::string corpus_path = corpus_files.ShardPath(shard);
const std::string corpus_file_path = corpus_file_paths.Shard(shard);
size_t num_shard_bytes = 0;
// Read the shard (if it exists), collect input hashes from it.
absl::flat_hash_set<std::string> existing_hashes;
if (RemotePathExists(corpus_path)) {
if (RemotePathExists(corpus_file_path)) {
auto reader = DefaultBlobFileReaderFactory();
// May fail to open if file doesn't exist.
reader->Open(corpus_path).IgnoreError();
reader->Open(corpus_file_path).IgnoreError();
ByteSpan blob;
while (reader->Read(blob).ok()) {
existing_hashes.insert(Hash(blob));
}
}
// Add inputs to the current shard, if the shard doesn't have them already.
auto appender = DefaultBlobFileWriterFactory(env.riegeli);
CHECK_OK(appender->Open(corpus_path, "a"))
<< "Failed to open corpus file: " << corpus_path;
CHECK_OK(appender->Open(corpus_file_path, "a"))
<< "Failed to open corpus file: " << corpus_file_path;
ByteArray shard_data;
for (const auto &path : sharded_paths[shard]) {
std::string input;
Expand Down Expand Up @@ -455,8 +455,8 @@ void Centipede::LoadShard(const Environment &load_env, size_t shard_index,
// See serialize_shard_loads on why we may want to serialize shard loads.
// TODO(kcc): remove serialize_shard_loads when LoadShards() uses less RAM.
const WorkDir wd{load_env};
const std::string corpus_path = wd.CorpusFiles().ShardPath(shard_index);
const std::string features_path = wd.FeaturesFiles().ShardPath(shard_index);
const std::string corpus_path = wd.CorpusFilePaths().Shard(shard_index);
const std::string features_path = wd.FeaturesFilePaths().Shard(shard_index);
if (env_.serialize_shard_loads) {
ABSL_CONST_INIT static absl::Mutex load_shard_mu{absl::kConstInit};
absl::MutexLock lock(&load_shard_mu);
Expand Down Expand Up @@ -490,7 +490,7 @@ void Centipede::LoadAllShardsInRandomOrder(const Environment &load_env,

void Centipede::Rerun(std::vector<ByteArray> &to_rerun) {
if (to_rerun.empty()) return;
auto features_file_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index);
auto features_file_path = wd_.FeaturesFilePaths().Shard(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory(env_.riegeli);
CHECK_OK(features_file->Open(features_file_path, "a"));

Expand Down Expand Up @@ -636,7 +636,7 @@ void Centipede::MergeFromOtherCorpus(std::string_view merge_from_dir,
if (new_corpus_size > initial_corpus_size) {
auto appender = DefaultBlobFileWriterFactory(env_.riegeli);
CHECK_OK(
appender->Open(wd_.CorpusFiles().ShardPath(env_.my_shard_index), "a"));
appender->Open(wd_.CorpusFilePaths().Shard(env_.my_shard_index), "a"));
for (size_t idx = initial_corpus_size; idx < new_corpus_size; ++idx) {
CHECK_OK(appender->Write(corpus_.Get(idx)));
}
Expand All @@ -655,7 +655,7 @@ void Centipede::ReloadAllShardsAndWriteDistilledCorpus() {

// Save the distilled corpus to a file in workdir and possibly to a hashed
// file in the first corpus dir passed in `--corpus_dir`.
const auto distill_to_path = wd_.DistilledCorpusFiles().MyShardPath();
const auto distill_to_path = wd_.DistilledCorpusFilePaths().MyShard();
LOG(INFO) << "Distilling: shard: " << env_.my_shard_index
<< " output: " << distill_to_path << " "
<< " distilled size: " << corpus_.NumActive();
Expand Down Expand Up @@ -718,10 +718,10 @@ void Centipede::FuzzingLoop() {

if (env_.load_shards_only) return;

auto corpus_path = wd_.CorpusFiles().ShardPath(env_.my_shard_index);
auto corpus_path = wd_.CorpusFilePaths().Shard(env_.my_shard_index);
auto corpus_file = DefaultBlobFileWriterFactory(env_.riegeli);
CHECK_OK(corpus_file->Open(corpus_path, "a"));
auto features_path = wd_.FeaturesFiles().ShardPath(env_.my_shard_index);
auto features_path = wd_.FeaturesFilePaths().Shard(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory(env_.riegeli);
CHECK_OK(features_file->Open(features_path, "a"));

Expand Down
7 changes: 4 additions & 3 deletions centipede/centipede_interface.cc
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ SeedCorpusConfig GetSeedCorpusConfig(const Environment &env,
// We're using the previously distilled corpus files as seeds.
.shard_rel_glob =
std::filesystem::path{
workdir.DistilledCorpusFiles().AllShardsGlob()}
workdir.DistilledCorpusFilePaths().AllShardsGlob()}
.filename(),
.sampled_fraction_or_count = 1.0f,
}},
Expand All @@ -462,7 +462,8 @@ SeedCorpusConfig GetSeedCorpusConfig(const Environment &env,
.dir_path = env.workdir,
// We're seeding the current corpus files.
.shard_rel_glob =
std::filesystem::path{workdir.CorpusFiles().AllShardsGlob()}
std::filesystem::path{
workdir.CorpusFilePaths().AllShardsGlob()}
.filename(),
.shard_index_digits = WorkDir::kDigitsInShardIndex,
.num_shards = static_cast<uint32_t>(env.num_threads),
Expand Down Expand Up @@ -593,7 +594,7 @@ int UpdateCorpusDatabaseForFuzzTests(
}
CHECK_OK(RemoteMkdir(coverage_dir.c_str()));
std::vector<std::string> distilled_corpus_files;
CHECK_OK(RemoteGlobMatch(workdir.DistilledCorpusFiles().AllShardsGlob(),
CHECK_OK(RemoteGlobMatch(workdir.DistilledCorpusFilePaths().AllShardsGlob(),
distilled_corpus_files));
for (const std::string &corpus_file : distilled_corpus_files) {
const std::string file_name =
Expand Down
8 changes: 4 additions & 4 deletions centipede/corpus_io_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ TEST(ExportCorpusTest, ExportsCorpusToIndividualFiles) {
CHECK(std::filesystem::create_directory(out_dir));
const WorkDir workdir{temp_dir.c_str(), "fake_binary_name",
"fake_binary_hash", /*my_shard_index=*/0};
const auto corpus_files = workdir.CorpusFiles();
WriteBlobsToFile(corpus_files.ShardPath(0), {ByteArray{1, 2}, ByteArray{3}});
WriteBlobsToFile(corpus_files.ShardPath(1), {ByteArray{4}, ByteArray{5, 6}});
const auto corpus_file_paths = workdir.CorpusFilePaths();
WriteBlobsToFile(corpus_file_paths.Shard(0), {ByteArray{1, 2}, ByteArray{3}});
WriteBlobsToFile(corpus_file_paths.Shard(1), {ByteArray{4}, ByteArray{5, 6}});

ExportCorpus({corpus_files.ShardPath(0), corpus_files.ShardPath(1)},
ExportCorpus({corpus_file_paths.Shard(0), corpus_file_paths.Shard(1)},
out_dir.c_str());

EXPECT_THAT(ReadInputsFromFiles(out_dir.c_str()),
Expand Down
12 changes: 6 additions & 6 deletions centipede/distill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ class InputCorpusShardReader {
: workdir_{env}, log_prefix_{LogPrefix(env)} {}

perf::MemSize EstimateRamFootprint(size_t shard_idx) const {
const auto corpus_path = workdir_.CorpusFiles().ShardPath(shard_idx);
const auto features_path = workdir_.FeaturesFiles().ShardPath(shard_idx);
const auto corpus_path = workdir_.CorpusFilePaths().Shard(shard_idx);
const auto features_path = workdir_.FeaturesFilePaths().Shard(shard_idx);
const perf::MemSize corpus_file_size =
ValueOrDie(RemoteFileGetSize(corpus_path));
const perf::MemSize features_file_size =
Expand All @@ -134,8 +134,8 @@ class InputCorpusShardReader {

// Reads and returns a single shard's elements. Thread-safe.
CorpusEltVec ReadShard(size_t shard_idx) {
const auto corpus_path = workdir_.CorpusFiles().ShardPath(shard_idx);
const auto features_path = workdir_.FeaturesFiles().ShardPath(shard_idx);
const auto corpus_path = workdir_.CorpusFilePaths().Shard(shard_idx);
const auto features_path = workdir_.FeaturesFilePaths().Shard(shard_idx);
VLOG(1) << log_prefix_ << "reading input shard " << shard_idx << ":\n"
<< VV(corpus_path) << "\n"
<< VV(features_path);
Expand Down Expand Up @@ -167,8 +167,8 @@ class CorpusShardWriter {
CorpusShardWriter(const Environment &env, bool append)
: workdir_{env},
log_prefix_{LogPrefix(env)},
corpus_path_{workdir_.DistilledCorpusFiles().MyShardPath()},
features_path_{workdir_.DistilledFeaturesFiles().MyShardPath()},
corpus_path_{workdir_.DistilledCorpusFilePaths().MyShard()},
features_path_{workdir_.DistilledFeaturesFilePaths().MyShard()},
corpus_writer_{DefaultBlobFileWriterFactory()},
feature_writer_{DefaultBlobFileWriterFactory()} {
CHECK_OK(corpus_writer_->Open(corpus_path_, append ? "a" : "w"));
Expand Down
8 changes: 4 additions & 4 deletions centipede/distill_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ using InputVec = std::vector<ByteArray>;
void WriteToShard(const Environment &env, const TestCorpusRecord &record,
size_t shard_index) {
const WorkDir wd{env};
const auto corpus_path = wd.CorpusFiles().ShardPath(shard_index);
const auto features_path = wd.FeaturesFiles().ShardPath(shard_index);
const auto corpus_path = wd.CorpusFilePaths().Shard(shard_index);
const auto features_path = wd.FeaturesFilePaths().Shard(shard_index);
const auto corpus_appender = DefaultBlobFileWriterFactory(env.riegeli);
const auto features_appender = DefaultBlobFileWriterFactory(env.riegeli);
CHECK_OK(corpus_appender->Open(corpus_path, "a"));
Expand All @@ -80,9 +80,9 @@ void WriteToShard(const Environment &env, const TestCorpusRecord &record,
// Reads and returns the distilled corpus records from
// `wd.DistilledCorpusFilePaths().MyShard()` and
// `wd.DistilledFeaturesFilePaths().MyShard()`.
std::vector<TestCorpusRecord> ReadFromDistilled(const WorkDir &wd) {
const auto distilled_corpus_path = wd.DistilledCorpusFiles().MyShardPath();
const auto distilled_corpus_path = wd.DistilledCorpusFilePaths().MyShard();
const auto distilled_features_path =
wd.DistilledFeaturesFiles().MyShardPath();
wd.DistilledFeaturesFilePaths().MyShard();

std::vector<TestCorpusRecord> result;
auto shard_reader_callback = [&result](ByteArray input, FeatureVec features) {
Expand Down
22 changes: 11 additions & 11 deletions centipede/seed_corpus_maker_lib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ absl::Status SampleSeedCorpusElementsFromSource( //
const auto work_dir = WorkDir::FromCorpusShardPath( //
corpus_fname, coverage_binary_name, coverage_binary_hash);
const std::string features_fname =
work_dir.CorpusFiles().IsShardPath(corpus_fname)
? work_dir.FeaturesFiles().MyShardPath()
: work_dir.DistilledCorpusFiles().IsShardPath(corpus_fname)
? work_dir.DistilledFeaturesFiles().MyShardPath()
work_dir.CorpusFilePaths().IsShard(corpus_fname)
? work_dir.FeaturesFilePaths().MyShard()
: work_dir.DistilledCorpusFilePaths().IsShard(corpus_fname)
? work_dir.DistilledFeaturesFilePaths().MyShard()
: "";

VLOG(2) << "Reading elements from source shard " << shard
Expand Down Expand Up @@ -356,21 +356,21 @@ absl::Status WriteSeedCorpusElementsToDestination( //
const auto work_dir = WorkDir::FromCorpusShardPath( //
corpus_fname, coverage_binary_name, coverage_binary_hash);

if (corpus_fname != work_dir.CorpusFiles().MyShardPath() &&
corpus_fname != work_dir.DistilledCorpusFiles().MyShardPath()) {
if (corpus_fname != work_dir.CorpusFilePaths().MyShard() &&
corpus_fname != work_dir.DistilledCorpusFilePaths().MyShard()) {
return absl::InvalidArgumentError(absl::StrCat(
"Bad config: generated destination corpus filename '",
corpus_fname, "' doesn't match one of two expected forms '",
work_dir.CorpusFiles().MyShardPath(), "' or '",
work_dir.DistilledCorpusFiles().MyShardPath(),
work_dir.CorpusFilePaths().MyShard(), "' or '",
work_dir.DistilledCorpusFilePaths().MyShard(),
"'; make sure binary name in config matches explicitly passed '",
coverage_binary_name, "'"));
}

const std::string features_fname =
work_dir.CorpusFiles().IsShardPath(corpus_fname)
? work_dir.FeaturesFiles().MyShardPath()
: work_dir.DistilledFeaturesFiles().MyShardPath();
work_dir.CorpusFilePaths().IsShard(corpus_fname)
? work_dir.FeaturesFilePaths().MyShard()
: work_dir.DistilledFeaturesFilePaths().MyShard();
CHECK(!features_fname.empty());

VLOG(2) << "Writing " << std::distance(elt_range_begin, elt_range_end)
Expand Down
26 changes: 14 additions & 12 deletions centipede/seed_corpus_maker_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,23 @@ void VerifyShardsExist( //
std::string{binary_hash},
/*my_shard_index=*/0,
};
const WorkDir::ShardedFileInfo corpus_files =
shard_type == kNormal ? wd.CorpusFiles() : wd.DistilledCorpusFiles();
const WorkDir::ShardedFileInfo features_files =
shard_type == kNormal ? wd.FeaturesFiles() : wd.DistilledFeaturesFiles();
const WorkDir::ShardedPath corpus_file_paths =
shard_type == kNormal ? wd.CorpusFilePaths()
: wd.DistilledCorpusFilePaths();
const WorkDir::ShardedPath features_file_paths =
shard_type == kNormal ? wd.FeaturesFilePaths()
: wd.DistilledFeaturesFilePaths();
for (int shard = 0; shard < num_shards + 2; ++shard) {
if (shard < num_shards) {
ASSERT_TRUE(fs::exists(corpus_files.ShardPath(shard)))
<< VV(shard) << VV(corpus_files.ShardPath(shard));
ASSERT_TRUE(fs::exists(features_files.ShardPath(shard)))
<< VV(shard) << VV(features_files.ShardPath(shard));
ASSERT_TRUE(fs::exists(corpus_file_paths.Shard(shard)))
<< VV(shard) << VV(corpus_file_paths.Shard(shard));
ASSERT_TRUE(fs::exists(features_file_paths.Shard(shard)))
<< VV(shard) << VV(features_file_paths.Shard(shard));
} else {
ASSERT_FALSE(fs::exists(corpus_files.ShardPath(shard)))
<< VV(shard) << VV(corpus_files.ShardPath(shard));
ASSERT_FALSE(fs::exists(features_files.ShardPath(shard)))
<< VV(shard) << VV(features_files.ShardPath(shard));
ASSERT_FALSE(fs::exists(corpus_file_paths.Shard(shard)))
<< VV(shard) << VV(corpus_file_paths.Shard(shard));
ASSERT_FALSE(fs::exists(features_file_paths.Shard(shard)))
<< VV(shard) << VV(features_file_paths.Shard(shard));
}
}
}
Expand Down
26 changes: 14 additions & 12 deletions centipede/seed_corpus_maker_proto_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,23 @@ void VerifyShardsExist( //
std::string{binary_hash},
/*my_shard_index=*/0,
};
const WorkDir::ShardedFileInfo corpus_files =
shard_type == kNormal ? wd.CorpusFiles() : wd.DistilledCorpusFiles();
const WorkDir::ShardedFileInfo features_files =
shard_type == kNormal ? wd.FeaturesFiles() : wd.DistilledFeaturesFiles();
const WorkDir::ShardedPath corpus_file_paths =
shard_type == kNormal ? wd.CorpusFilePaths()
: wd.DistilledCorpusFilePaths();
const WorkDir::ShardedPath features_file_paths =
shard_type == kNormal ? wd.FeaturesFilePaths()
: wd.DistilledFeaturesFilePaths();
for (int shard = 0; shard < num_shards + 2; ++shard) {
if (shard < num_shards) {
ASSERT_TRUE(fs::exists(corpus_files.ShardPath(shard)))
<< VV(shard) << VV(corpus_files.ShardPath(shard));
ASSERT_TRUE(fs::exists(features_files.ShardPath(shard)))
<< VV(shard) << VV(features_files.ShardPath(shard));
ASSERT_TRUE(fs::exists(corpus_file_paths.Shard(shard)))
<< VV(shard) << VV(corpus_file_paths.Shard(shard));
ASSERT_TRUE(fs::exists(features_file_paths.Shard(shard)))
<< VV(shard) << VV(features_file_paths.Shard(shard));
} else {
ASSERT_FALSE(fs::exists(corpus_files.ShardPath(shard)))
<< VV(shard) << VV(corpus_files.ShardPath(shard));
ASSERT_FALSE(fs::exists(features_files.ShardPath(shard)))
<< VV(shard) << VV(features_files.ShardPath(shard));
ASSERT_FALSE(fs::exists(corpus_file_paths.Shard(shard)))
<< VV(shard) << VV(corpus_file_paths.Shard(shard));
ASSERT_FALSE(fs::exists(features_file_paths.Shard(shard)))
<< VV(shard) << VV(features_file_paths.Shard(shard));
}
}
}
Expand Down
Loading

0 comments on commit 6e7cc14

Please sign in to comment.