Skip to content

Commit 17fcd27

Browse files
committed
[Index] Add option to compress unit and record files
This option reduces the size of the index in a mixed clang + Swift project I have tested by about 70%, while increasing the indexing overhead by about 10-15% (from 1.07% to ~1.2%).
1 parent 9db20e6 commit 17fcd27

File tree

13 files changed

+189
-41
lines changed

13 files changed

+189
-41
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,11 @@ def index_ignore_pcms : Flag<["-"], "index-ignore-pcms">,
736736
Visibility<[ClangOption, CC1Option]>,
737737
HelpText<"Ignore symbols from imported pcm modules">,
738738
MarshallingInfoFlag<FrontendOpts<"IndexIgnorePcms">>;
739+
def index_store_record_compression
740+
: Flag<["-"], "index-store-compress">,
741+
Visibility<[ClangOption, CC1Option]>,
742+
HelpText<"Whether to compress unit and record files in the index store">,
743+
MarshallingInfoFlag<FrontendOpts<"IndexStoreCompress">>;
739744

740745
// Make sure all other -ccc- options are rejected.
741746
def ccc_ : Joined<["-"], "ccc-">, Group<internal_Group>, Flags<[Unsupported]>;

clang/include/clang/Frontend/FrontendOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,9 @@ class FrontendOptions {
477477
std::string IndexStorePath;
478478
std::string IndexUnitOutputPath;
479479

480+
/// Whether to compress the unit and record files in the index store.
481+
bool IndexStoreCompress = false;
482+
480483
/// The input kind, either specified via -x argument or deduced from the input
481484
/// file name.
482485
InputKind DashX;

clang/include/clang/Index/IndexRecordWriter.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ typedef llvm::function_ref<Symbol(OpaqueDecl, SmallVectorImpl<char> &Scratch)>
5353
/// beginRecord, and if the file does not already exist, then proceed to add
5454
/// all symbol occurrences (addOccurrence) and finally finish with endRecord.
5555
class IndexRecordWriter {
56+
/// Whether to compress the index record using zlib.
57+
bool Compress;
5658
SmallString<64> RecordsPath; ///< The records directory path.
5759
void *Record = nullptr; ///< The state of the current record.
5860
public:
59-
IndexRecordWriter(StringRef IndexPath);
61+
IndexRecordWriter(StringRef IndexPath, bool Compress);
6062

6163
enum class Result {
6264
Success,

clang/include/clang/Index/IndexUnitWriter.h

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class IndexUnitWriter {
5050
SmallString<64> UnitsPath;
5151
std::string ProviderIdentifier;
5252
std::string ProviderVersion;
53+
/// Whether to compress the index unit using zlib.
54+
bool Compress;
5355
std::string OutputFile;
5456
std::string ModuleName;
5557
OptionalFileEntryRef MainFile;
@@ -92,17 +94,12 @@ class IndexUnitWriter {
9294
/// \param IsSystem true for system module units, false otherwise.
9395
/// \param Remapper Remapper to use to standardize file paths to make them
9496
/// hermetic/reproducible. This applies to all paths emitted in the unit file.
95-
IndexUnitWriter(FileManager &FileMgr,
96-
StringRef StorePath,
97+
IndexUnitWriter(FileManager &FileMgr, StringRef StorePath,
9798
StringRef ProviderIdentifier, StringRef ProviderVersion,
98-
StringRef OutputFile,
99-
StringRef ModuleName,
100-
OptionalFileEntryRef MainFile,
101-
bool IsSystem,
102-
bool IsModuleUnit,
103-
bool IsDebugCompilation,
104-
StringRef TargetTriple,
105-
StringRef SysrootPath,
99+
bool Compress, StringRef OutputFile, StringRef ModuleName,
100+
OptionalFileEntryRef MainFile, bool IsSystem,
101+
bool IsModuleUnit, bool IsDebugCompilation,
102+
StringRef TargetTriple, StringRef SysrootPath,
106103
const PathRemapper &Remapper,
107104
writer::ModuleInfoWriterCallback GetInfoForModule);
108105
~IndexUnitWriter();

clang/lib/Index/ClangIndexRecordWriter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ StringRef ClangIndexRecordWriter::getUSRNonCached(const IdentifierInfo *Name,
6161
return StringRef(Ptr, USR.size());
6262
}
6363

64-
ClangIndexRecordWriter::ClangIndexRecordWriter(ASTContext &Ctx,
64+
ClangIndexRecordWriter::ClangIndexRecordWriter(ASTContext &Ctx, bool Compress,
6565
RecordingOptions Opts)
66-
: Impl(Opts.DataDirPath), Ctx(Ctx), RecordOpts(std::move(Opts)) {
66+
: Impl(Opts.DataDirPath, Compress), Ctx(Ctx), RecordOpts(std::move(Opts)) {
6767
if (Opts.RecordSymbolCodeGenName)
6868
ASTNameGen.reset(new ASTNameGenerator(Ctx));
6969
}

clang/lib/Index/ClangIndexRecordWriter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class ClangIndexRecordWriter {
3535
llvm::DenseMap<const void *, StringRef> USRByDecl;
3636

3737
public:
38-
ClangIndexRecordWriter(ASTContext &Ctx, RecordingOptions Opts);
38+
ClangIndexRecordWriter(ASTContext &Ctx, bool Compress, RecordingOptions Opts);
3939
~ClangIndexRecordWriter();
4040

4141
ASTContext &getASTContext() { return Ctx; }

clang/lib/Index/IndexRecordReader.cpp

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "clang/Index/IndexRecordReader.h"
10-
#include "IndexDataStoreUtils.h"
1110
#include "BitstreamVisitor.h"
11+
#include "IndexDataStoreUtils.h"
1212
#include "clang/Index/IndexDataStoreSymbolUtils.h"
1313
#include "llvm/ADT/ArrayRef.h"
1414
#include "llvm/Bitstream/BitstreamReader.h"
15+
#include "llvm/Support/Compression.h"
1516
#include "llvm/Support/FileSystem.h"
1617
#include "llvm/Support/MemoryBuffer.h"
1718
#include "llvm/Support/Path.h"
@@ -368,7 +369,42 @@ IndexRecordReader::createWithBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer,
368369
std::unique_ptr<IndexRecordReader> Reader;
369370
Reader.reset(new IndexRecordReader());
370371
auto &Impl = Reader->Impl;
371-
Impl.Buffer = std::move(Buffer);
372+
if (Buffer->getBuffer().starts_with("CIDXR")) {
373+
if (!llvm::compression::zlib::isAvailable()) {
374+
Error = "zlib not available to decompress compressed index record";
375+
return nullptr;
376+
}
377+
378+
ArrayRef compressedBuffer =
379+
llvm::arrayRefFromStringRef(Buffer->getBuffer());
380+
381+
// Slice off the `CIDXR` marker we checked above.
382+
compressedBuffer = compressedBuffer.slice(5);
383+
384+
// Read the uncompressed size of the record.
385+
if (compressedBuffer.size() < 4) {
386+
Error = "Unexpectedly found end of record file";
387+
return nullptr;
388+
}
389+
size_t uncompressedSize =
390+
llvm::support::endian::read32le(compressedBuffer.data());
391+
compressedBuffer = compressedBuffer.slice(4);
392+
393+
// Decompress the record
394+
llvm::SmallVector<uint8_t, 0> decompressed;
395+
llvm::Error decompressError = llvm::compression::zlib::decompress(
396+
compressedBuffer, decompressed, uncompressedSize);
397+
if (decompressError) {
398+
llvm::raw_string_ostream ErrorOS(Error);
399+
ErrorOS << "Failed to decompress index record: " << decompressError;
400+
return nullptr;
401+
}
402+
Impl.Buffer = llvm::MemoryBuffer::getMemBufferCopy(
403+
llvm::toStringRef(decompressed),
404+
Buffer->getBufferIdentifier() + " decompressed");
405+
} else {
406+
Impl.Buffer = std::move(Buffer);
407+
}
372408
llvm::BitstreamCursor Stream(*Impl.Buffer);
373409

374410
if (Stream.AtEndOfStream()) {

clang/lib/Index/IndexRecordWriter.cpp

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "llvm/ADT/DenseMap.h"
1515
#include "llvm/ADT/StringSet.h"
1616
#include "llvm/Bitstream/BitstreamWriter.h"
17+
#include "llvm/Support/Compression.h"
1718
#include "llvm/Support/Errc.h"
1819
#include "llvm/Support/FileSystem.h"
1920
#include "llvm/Support/Path.h"
@@ -234,8 +235,8 @@ static void writeDecls(BitstreamWriter &Stream, ArrayRef<DeclInfo> Decls,
234235
Stream.ExitBlock();
235236
}
236237

237-
IndexRecordWriter::IndexRecordWriter(StringRef IndexPath)
238-
: RecordsPath(IndexPath) {
238+
IndexRecordWriter::IndexRecordWriter(StringRef IndexPath, bool Compress)
239+
: Compress(Compress), RecordsPath(IndexPath) {
239240
store::appendRecordSubDir(RecordsPath);
240241
}
241242

@@ -319,7 +320,44 @@ IndexRecordWriter::endRecord(std::string &Error,
319320
}
320321

321322
raw_fd_ostream OS(TempFD, /*shouldClose=*/true);
322-
OS.write(State.Buffer.data(), State.Buffer.size());
323+
if (Compress) {
324+
if (!llvm::compression::zlib::isAvailable()) {
325+
Error = "Zlib not available to compress record file";
326+
return Result::Failure;
327+
}
328+
329+
// Higher compression levels add marginal improvements to the compressed
330+
// size while having a measurable impact on compile time. An analysis on a
331+
// mixed clang / Swift project showed the following results:
332+
// - BestSpeed: Compresses the index store by 66% while increasing the
333+
// index-while-building overhead by 15% (from 1.07% to 1.23%)
334+
// - Default: Compression of 68.1%, increases index-while-building overhead
335+
// by 23%
336+
// - BestSize: Compression of 68.2%, increases index-while-building
337+
// overhead by 37%
338+
// Based on those numbers, BestSpeed seems like the best choice. If clients
339+
// need to compress the index store further, they should run a compression
340+
// algorithm across all files in the index store.
341+
auto compressionLevel = compression::zlib::BestSpeedCompression;
342+
ArrayRef<uint8_t> bufferRef = llvm::arrayRefFromStringRef(State.Buffer);
343+
llvm::SmallVector<uint8_t, 0> compressed;
344+
llvm::compression::zlib::compress(bufferRef, compressed, compressionLevel);
345+
346+
// Write the `CIDXR` (compressed index record) marker to indicate that this
347+
// is a compressed record file.
348+
OS << "CIDXR";
349+
350+
// Write the size of the uncompressed record so that we can allocate a
351+
// buffer of the corresponding size when decompressing it.
352+
char Buf[4];
353+
llvm::support::endian::write32le(Buf, bufferRef.size());
354+
OS.write(Buf, sizeof(Buf));
355+
356+
// Write the actual compressed data
357+
OS << llvm::toStringRef(compressed);
358+
} else {
359+
OS << State.Buffer;
360+
}
323361
OS.close();
324362

325363
if (OS.has_error()) {

clang/lib/Index/IndexUnitReader.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,42 @@ bool IndexUnitReaderImpl::init(std::unique_ptr<MemoryBuffer> Buf,
282282
sys::TimePoint<> ModTime,
283283
std::string &Error) {
284284
this->ModTime = ModTime;
285-
this->MemBuf = std::move(Buf);
285+
286+
if (Buf->getBuffer().starts_with("CIDXU")) {
287+
if (!llvm::compression::zlib::isAvailable()) {
288+
Error = "zlib not available to decompress compressed index unit";
289+
return true;
290+
}
291+
292+
ArrayRef compressedBuffer = llvm::arrayRefFromStringRef(Buf->getBuffer());
293+
294+
// Slice off the `CIDXU` marker we checked above.
295+
compressedBuffer = compressedBuffer.slice(5);
296+
297+
// Read the uncompressed size of the unit.
298+
if (compressedBuffer.size() < 4) {
299+
Error = "Unexpectedly found end of record unit";
300+
return true;
301+
}
302+
size_t uncompressedSize =
303+
llvm::support::endian::read32le(compressedBuffer.data());
304+
compressedBuffer = compressedBuffer.slice(4);
305+
306+
// Decompress the unit
307+
llvm::SmallVector<uint8_t, 0> decompressed;
308+
llvm::Error decompressError = llvm::compression::zlib::decompress(
309+
compressedBuffer, decompressed, uncompressedSize);
310+
if (decompressError) {
311+
llvm::raw_string_ostream ErrorOS(Error);
312+
ErrorOS << "Failed to decompress index unit: " << decompressError;
313+
return true;
314+
}
315+
this->MemBuf = llvm::MemoryBuffer::getMemBufferCopy(
316+
llvm::toStringRef(decompressed),
317+
Buf->getBufferIdentifier() + " decompressed");
318+
} else {
319+
this->MemBuf = std::move(Buf);
320+
}
286321
llvm::BitstreamCursor Stream(*MemBuf);
287322

288323
if (Stream.AtEndOfStream()) {

clang/lib/Index/IndexUnitWriter.cpp

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "llvm/ADT/StringMap.h"
1616
#include "llvm/Bitstream/BitstreamWriter.h"
1717
#include "llvm/Support/Allocator.h"
18+
#include "llvm/Support/Compression.h"
1819
#include "llvm/Support/Errc.h"
1920
#include "llvm/Support/FileSystem.h"
2021
#include "llvm/Support/Path.h"
@@ -116,21 +117,14 @@ class IndexUnitWriter::PathStorage {
116117
}
117118
};
118119

119-
IndexUnitWriter::IndexUnitWriter(FileManager &FileMgr,
120-
StringRef StorePath,
121-
StringRef ProviderIdentifier,
122-
StringRef ProviderVersion,
123-
StringRef OutputFile,
124-
StringRef ModuleName,
125-
OptionalFileEntryRef MainFile,
126-
bool IsSystem,
127-
bool IsModuleUnit,
128-
bool IsDebugCompilation,
129-
StringRef TargetTriple,
130-
StringRef SysrootPath,
131-
const PathRemapper &Remapper,
132-
writer::ModuleInfoWriterCallback GetInfoForModule)
133-
: FileMgr(FileMgr), Remapper(Remapper) {
120+
IndexUnitWriter::IndexUnitWriter(
121+
FileManager &FileMgr, StringRef StorePath, StringRef ProviderIdentifier,
122+
StringRef ProviderVersion, bool Compress, StringRef OutputFile,
123+
StringRef ModuleName, OptionalFileEntryRef MainFile, bool IsSystem,
124+
bool IsModuleUnit, bool IsDebugCompilation, StringRef TargetTriple,
125+
StringRef SysrootPath, const PathRemapper &Remapper,
126+
writer::ModuleInfoWriterCallback GetInfoForModule)
127+
: FileMgr(FileMgr), Compress(Compress), Remapper(Remapper) {
134128
this->UnitsPath = StorePath;
135129
store::appendUnitSubDir(this->UnitsPath);
136130
this->ProviderIdentifier = std::string(ProviderIdentifier);
@@ -393,7 +387,34 @@ bool IndexUnitWriter::write(std::string &Error) {
393387
}
394388

395389
raw_fd_ostream OS(TempFD, /*shouldClose=*/true);
396-
OS.write(Buffer.data(), Buffer.size());
390+
if (Compress) {
391+
if (!llvm::compression::zlib::isAvailable()) {
392+
Error = "Zlib not available to compress record file";
393+
return true;
394+
}
395+
396+
// See comment in `IndexRecordWriter::endRecord` for the rationale behind using
397+
// `BestSpeed`.
398+
auto compressionLevel = compression::zlib::BestSpeedCompression;
399+
ArrayRef<uint8_t> bufferRef = llvm::arrayRefFromStringRef(Buffer);
400+
llvm::SmallVector<uint8_t, 0> compressed;
401+
llvm::compression::zlib::compress(bufferRef, compressed, compressionLevel);
402+
403+
// Write the `CIDXU` (compressed index unit) marker to indicate that this
404+
// is a compressed unit file.
405+
OS << "CIDXU";
406+
407+
// Write the size of the uncompressed unit so that we can allocate a
408+
// buffer of the corresponding size when decompressing it.
409+
char Buf[4];
410+
llvm::support::endian::write32le(Buf, bufferRef.size());
411+
OS.write(Buf, sizeof(Buf));
412+
413+
// Write the actual compressed data
414+
OS << llvm::toStringRef(compressed);
415+
} else {
416+
OS << Buffer;
417+
}
397418
OS.close();
398419

399420
if (OS.has_error()) {

0 commit comments

Comments
 (0)