Skip to content

Commit 30d5946

Browse files
committed
[clang][AST] Support AST files larger than 512M
Summary: Clang uses 32-bit integers for storing bit offsets from the beginning of the file, which results in a 512M limit on AST file size. This diff replaces absolute offsets with offsets relative to the beginning of the corresponding data structure where possible, and uses 64-bit offsets for DeclOffsets and TypeOffsets because these core AST sections may easily exceed 512M on their own. This diff breaks AST file format compatibility, so VERSION_MAJOR is bumped. Test Plan: Existing clang AST serialization tests. Tested on clangd with ~700M and ~900M preamble files. Reviewers: rsmith, dexonsmith Subscribers: ilya-biryukov, kadircet, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76594
1 parent f701d8f commit 30d5946

File tree

8 files changed

+81
-40
lines changed

8 files changed

+81
-40
lines changed

clang/include/clang/Serialization/ASTBitCodes.h

+21-6
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ namespace serialization {
4141
/// Version 4 of AST files also requires that the version control branch and
4242
/// revision match exactly, since there is no backward compatibility of
4343
/// AST files at this time.
44-
const unsigned VERSION_MAJOR = 9;
44+
const unsigned VERSION_MAJOR = 10;
4545

4646
/// AST file minor version number supported by this version of
4747
/// Clang.
@@ -181,7 +181,7 @@ namespace serialization {
181181
/// Raw source location of end of range.
182182
unsigned End;
183183

184-
/// Offset in the AST file.
184+
/// Offset in the AST file relative to ModuleFile::MacroOffsetsBase.
185185
uint32_t BitOffset;
186186

187187
PPEntityOffset(SourceRange R, uint32_t BitOffset)
@@ -221,12 +221,18 @@ namespace serialization {
221221
/// Raw source location.
222222
unsigned Loc = 0;
223223

224-
/// Offset in the AST file.
225-
uint32_t BitOffset = 0;
224+
/// Offset in the AST file. Split 64-bit integer into low/high parts
225+
/// to keep the structure's alignment 32-bit and avoid a padding gap.
226+
/// This structure is serialized "as is" to the AST file and undefined
227+
/// values in the padding would affect the AST hash.
228+
uint32_t BitOffsetLow = 0;
229+
uint32_t BitOffsetHigh = 0;
226230

227231
DeclOffset() = default;
228-
DeclOffset(SourceLocation Loc, uint32_t BitOffset)
229-
: Loc(Loc.getRawEncoding()), BitOffset(BitOffset) {}
232+
DeclOffset(SourceLocation Loc, uint64_t BitOffset) {
233+
setLocation(Loc);
234+
setBitOffset(BitOffset);
235+
}
230236

231237
void setLocation(SourceLocation L) {
232238
Loc = L.getRawEncoding();
@@ -235,6 +241,15 @@ namespace serialization {
235241
SourceLocation getLocation() const {
236242
return SourceLocation::getFromRawEncoding(Loc);
237243
}
244+
245+
void setBitOffset(uint64_t Offset) {
246+
BitOffsetLow = Offset;
247+
BitOffsetHigh = Offset >> 32;
248+
}
249+
250+
uint64_t getBitOffset() const {
251+
return BitOffsetLow | (uint64_t(BitOffsetHigh) << 32);
252+
}
238253
};
239254

240255
/// The number of predefined preprocessed entity IDs.

clang/include/clang/Serialization/ASTReader.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -723,9 +723,10 @@ class ASTReader
723723

724724
struct PendingMacroInfo {
725725
ModuleFile *M;
726-
uint64_t MacroDirectivesOffset;
726+
/// Offset relative to ModuleFile::MacroOffsetsBase.
727+
uint32_t MacroDirectivesOffset;
727728

728-
PendingMacroInfo(ModuleFile *M, uint64_t MacroDirectivesOffset)
729+
PendingMacroInfo(ModuleFile *M, uint32_t MacroDirectivesOffset)
729730
: M(M), MacroDirectivesOffset(MacroDirectivesOffset) {}
730731
};
731732

@@ -2205,7 +2206,7 @@ class ASTReader
22052206
/// \param MacroDirectivesOffset Offset of the serialized macro directive
22062207
/// history.
22072208
void addPendingMacro(IdentifierInfo *II, ModuleFile *M,
2208-
uint64_t MacroDirectivesOffset);
2209+
uint32_t MacroDirectivesOffset);
22092210

22102211
/// Read the set of macros defined by this external macro source.
22112212
void ReadDefinedMacros() override;

clang/include/clang/Serialization/ASTWriter.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ class ASTWriter : public ASTDeserializationListener,
243243

244244
/// Offset of each type in the bitstream, indexed by
245245
/// the type's ID.
246-
std::vector<uint32_t> TypeOffsets;
246+
std::vector<uint64_t> TypeOffsets;
247247

248248
/// The first ID number we can use for our own identifiers.
249249
serialization::IdentID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS;
@@ -277,7 +277,8 @@ class ASTWriter : public ASTDeserializationListener,
277277
/// The macro infos to emit.
278278
std::vector<MacroInfoToEmitData> MacroInfosToEmit;
279279

280-
llvm::DenseMap<const IdentifierInfo *, uint64_t> IdentMacroDirectivesOffsetMap;
280+
llvm::DenseMap<const IdentifierInfo *, uint32_t>
281+
IdentMacroDirectivesOffsetMap;
281282

282283
/// @name FlushStmt Caches
283284
/// @{
@@ -464,7 +465,8 @@ class ASTWriter : public ASTDeserializationListener,
464465
const Preprocessor &PP);
465466
void WritePreprocessor(const Preprocessor &PP, bool IsModule);
466467
void WriteHeaderSearch(const HeaderSearch &HS);
467-
void WritePreprocessorDetail(PreprocessingRecord &PPRec);
468+
void WritePreprocessorDetail(PreprocessingRecord &PPRec,
469+
uint64_t MacroOffsetsBase);
468470
void WriteSubmodules(Module *WritingModule);
469471

470472
void WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag,
@@ -588,7 +590,7 @@ class ASTWriter : public ASTDeserializationListener,
588590
/// Determine the ID of an already-emitted macro.
589591
serialization::MacroID getMacroID(MacroInfo *MI);
590592

591-
uint64_t getMacroDirectivesOffset(const IdentifierInfo *Name);
593+
uint32_t getMacroDirectivesOffset(const IdentifierInfo *Name);
592594

593595
/// Emit a reference to a type.
594596
void AddTypeRef(QualType T, RecordDataImpl &Record);

clang/include/clang/Serialization/ModuleFile.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,10 @@ class ModuleFile {
251251
/// The base offset in the source manager's view of this module.
252252
unsigned SLocEntryBaseOffset = 0;
253253

254+
/// Base file offset for the offsets in SLocEntryOffsets. Real file offset
255+
/// for the entry is SLocEntryOffsetsBase + SLocEntryOffsets[i].
256+
uint64_t SLocEntryOffsetsBase = 0;
257+
254258
/// Offsets for all of the source location entries in the
255259
/// AST file.
256260
const uint32_t *SLocEntryOffsets = nullptr;
@@ -302,6 +306,10 @@ class ModuleFile {
302306
/// The number of macros in this AST file.
303307
unsigned LocalNumMacros = 0;
304308

309+
/// Base file offset for the offsets in MacroOffsets. Real file offset for
310+
/// the entry is MacroOffsetsBase + MacroOffsets[i].
311+
uint64_t MacroOffsetsBase = 0;
312+
305313
/// Offsets of macros in the preprocessor block.
306314
///
307315
/// This array is indexed by the macro ID (-1), and provides
@@ -450,7 +458,7 @@ class ModuleFile {
450458

451459
/// Offset of each type within the bitstream, indexed by the
452460
/// type ID, or the representation of a Type*.
453-
const uint32_t *TypeOffsets = nullptr;
461+
const uint64_t *TypeOffsets = nullptr;
454462

455463
/// Base type ID for types local to this module as represented in
456464
/// the global type ID space.

clang/lib/Serialization/ASTReader.cpp

+12-8
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,7 @@ bool ASTReader::ReadSLocEntry(int ID) {
14701470

14711471
ModuleFile *F = GlobalSLocEntryMap.find(-ID)->second;
14721472
if (llvm::Error Err = F->SLocEntryCursor.JumpToBit(
1473+
F->SLocEntryOffsetsBase +
14731474
F->SLocEntryOffsets[ID - F->SLocEntryBaseID])) {
14741475
Error(std::move(Err));
14751476
return true;
@@ -1932,9 +1933,8 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d,
19321933
return HFI;
19331934
}
19341935

1935-
void ASTReader::addPendingMacro(IdentifierInfo *II,
1936-
ModuleFile *M,
1937-
uint64_t MacroDirectivesOffset) {
1936+
void ASTReader::addPendingMacro(IdentifierInfo *II, ModuleFile *M,
1937+
uint32_t MacroDirectivesOffset) {
19381938
assert(NumCurrentElementsDeserializing > 0 &&"Missing deserialization guard");
19391939
PendingMacroIDs[II].push_back(PendingMacroInfo(M, MacroDirectivesOffset));
19401940
}
@@ -2099,7 +2099,8 @@ void ASTReader::resolvePendingMacro(IdentifierInfo *II,
20992099

21002100
BitstreamCursor &Cursor = M.MacroCursor;
21012101
SavedStreamPosition SavedPosition(Cursor);
2102-
if (llvm::Error Err = Cursor.JumpToBit(PMInfo.MacroDirectivesOffset)) {
2102+
if (llvm::Error Err =
2103+
Cursor.JumpToBit(M.MacroOffsetsBase + PMInfo.MacroDirectivesOffset)) {
21032104
Error(std::move(Err));
21042105
return;
21052106
}
@@ -3098,7 +3099,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
30983099
Error("duplicate TYPE_OFFSET record in AST file");
30993100
return Failure;
31003101
}
3101-
F.TypeOffsets = (const uint32_t *)Blob.data();
3102+
F.TypeOffsets = reinterpret_cast<const uint64_t *>(Blob.data());
31023103
F.LocalNumTypes = Record[0];
31033104
unsigned LocalBaseTypeIndex = Record[1];
31043105
F.BaseTypeIndex = getTotalNumTypes();
@@ -3376,6 +3377,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
33763377
F.SLocEntryOffsets = (const uint32_t *)Blob.data();
33773378
F.LocalNumSLocEntries = Record[0];
33783379
unsigned SLocSpaceSize = Record[1];
3380+
F.SLocEntryOffsetsBase = Record[2];
33793381
std::tie(F.SLocEntryBaseID, F.SLocEntryBaseOffset) =
33803382
SourceMgr.AllocateLoadedSLocEntries(F.LocalNumSLocEntries,
33813383
SLocSpaceSize);
@@ -3694,6 +3696,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
36943696
F.MacroOffsets = (const uint32_t *)Blob.data();
36953697
F.LocalNumMacros = Record[0];
36963698
unsigned LocalBaseMacroID = Record[1];
3699+
F.MacroOffsetsBase = Record[2];
36973700
F.BaseMacroID = getTotalNumMacros();
36983701

36993702
if (F.LocalNumMacros > 0) {
@@ -5907,8 +5910,8 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
59075910
}
59085911

59095912
SavedStreamPosition SavedPosition(M.PreprocessorDetailCursor);
5910-
if (llvm::Error Err =
5911-
M.PreprocessorDetailCursor.JumpToBit(PPOffs.BitOffset)) {
5913+
if (llvm::Error Err = M.PreprocessorDetailCursor.JumpToBit(
5914+
M.MacroOffsetsBase + PPOffs.BitOffset)) {
59125915
Error(std::move(Err));
59135916
return nullptr;
59145917
}
@@ -8427,7 +8430,8 @@ MacroInfo *ASTReader::getMacro(MacroID ID) {
84278430
assert(I != GlobalMacroMap.end() && "Corrupted global macro map");
84288431
ModuleFile *M = I->second;
84298432
unsigned Index = ID - M->BaseMacroID;
8430-
MacrosLoaded[ID] = ReadMacroRecord(*M, M->MacroOffsets[Index]);
8433+
MacrosLoaded[ID] =
8434+
ReadMacroRecord(*M, M->MacroOffsetsBase + M->MacroOffsets[Index]);
84318435

84328436
if (DeserializationListener)
84338437
DeserializationListener->MacroRead(ID + NUM_PREDEF_MACRO_IDS,

clang/lib/Serialization/ASTReaderDecl.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2870,7 +2870,7 @@ ASTReader::DeclCursorForID(DeclID ID, SourceLocation &Loc) {
28702870
const DeclOffset &DOffs =
28712871
M->DeclOffsets[ID - M->BaseDeclID - NUM_PREDEF_DECL_IDS];
28722872
Loc = TranslateSourceLocation(*M, DOffs.getLocation());
2873-
return RecordLocation(M, DOffs.BitOffset);
2873+
return RecordLocation(M, DOffs.getBitOffset());
28742874
}
28752875

28762876
ASTReader::RecordLocation ASTReader::getLocalBitOffset(uint64_t GlobalOffset) {

clang/lib/Serialization/ASTWriter.cpp

+26-15
Original file line numberDiff line numberDiff line change
@@ -1893,6 +1893,7 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
18931893
// Write out the source location entry table. We skip the first
18941894
// entry, which is always the same dummy entry.
18951895
std::vector<uint32_t> SLocEntryOffsets;
1896+
uint64_t SLocEntryOffsetsBase = Stream.GetCurrentBitNo();
18961897
RecordData PreloadSLocs;
18971898
SLocEntryOffsets.reserve(SourceMgr.local_sloc_entry_size() - 1);
18981899
for (unsigned I = 1, N = SourceMgr.local_sloc_entry_size();
@@ -1903,7 +1904,9 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
19031904
assert(&SourceMgr.getSLocEntry(FID) == SLoc);
19041905

19051906
// Record the offset of this source-location entry.
1906-
SLocEntryOffsets.push_back(Stream.GetCurrentBitNo());
1907+
uint64_t Offset = Stream.GetCurrentBitNo() - SLocEntryOffsetsBase;
1908+
assert((Offset >> 32) == 0 && "SLocEntry offset too large");
1909+
SLocEntryOffsets.push_back(Offset);
19071910

19081911
// Figure out which record code to use.
19091912
unsigned Code;
@@ -2011,12 +2014,14 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
20112014
Abbrev->Add(BitCodeAbbrevOp(SOURCE_LOCATION_OFFSETS));
20122015
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // # of slocs
20132016
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // total size
2017+
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset
20142018
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // offsets
20152019
unsigned SLocOffsetsAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
20162020
{
20172021
RecordData::value_type Record[] = {
20182022
SOURCE_LOCATION_OFFSETS, SLocEntryOffsets.size(),
2019-
SourceMgr.getNextLocalOffset() - 1 /* skip dummy */};
2023+
SourceMgr.getNextLocalOffset() - 1 /* skip dummy */,
2024+
SLocEntryOffsetsBase};
20202025
Stream.EmitRecordWithBlob(SLocOffsetsAbbrev, Record,
20212026
bytes(SLocEntryOffsets));
20222027
}
@@ -2093,9 +2098,11 @@ static bool shouldIgnoreMacro(MacroDirective *MD, bool IsModule,
20932098
/// Writes the block containing the serialized form of the
20942099
/// preprocessor.
20952100
void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
2101+
uint64_t MacroOffsetsBase = Stream.GetCurrentBitNo();
2102+
20962103
PreprocessingRecord *PPRec = PP.getPreprocessingRecord();
20972104
if (PPRec)
2098-
WritePreprocessorDetail(*PPRec);
2105+
WritePreprocessorDetail(*PPRec, MacroOffsetsBase);
20992106

21002107
RecordData Record;
21012108
RecordData ModuleMacroRecord;
@@ -2156,7 +2163,8 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
21562163
// identifier they belong to.
21572164
for (const IdentifierInfo *Name : MacroIdentifiers) {
21582165
MacroDirective *MD = PP.getLocalMacroDirectiveHistory(Name);
2159-
auto StartOffset = Stream.GetCurrentBitNo();
2166+
uint64_t StartOffset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
2167+
assert((StartOffset >> 32) == 0 && "Macro identifiers offset too large");
21602168

21612169
// Emit the macro directives in reverse source order.
21622170
for (; MD; MD = MD->getPrevious()) {
@@ -2229,14 +2237,12 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
22292237

22302238
// Record the local offset of this macro.
22312239
unsigned Index = ID - FirstMacroID;
2232-
if (Index == MacroOffsets.size())
2233-
MacroOffsets.push_back(Stream.GetCurrentBitNo());
2234-
else {
2235-
if (Index > MacroOffsets.size())
2236-
MacroOffsets.resize(Index + 1);
2240+
if (Index >= MacroOffsets.size())
2241+
MacroOffsets.resize(Index + 1);
22372242

2238-
MacroOffsets[Index] = Stream.GetCurrentBitNo();
2239-
}
2243+
uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
2244+
assert((Offset >> 32) == 0 && "Macro offset too large");
2245+
MacroOffsets[Index] = Offset;
22402246

22412247
AddIdentifierRef(Name, Record);
22422248
AddSourceLocation(MI->getDefinitionLoc(), Record);
@@ -2287,17 +2293,20 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
22872293
Abbrev->Add(BitCodeAbbrevOp(MACRO_OFFSET));
22882294
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of macros
22892295
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
2296+
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset
22902297
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
22912298

22922299
unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
22932300
{
22942301
RecordData::value_type Record[] = {MACRO_OFFSET, MacroOffsets.size(),
2295-
FirstMacroID - NUM_PREDEF_MACRO_IDS};
2302+
FirstMacroID - NUM_PREDEF_MACRO_IDS,
2303+
MacroOffsetsBase};
22962304
Stream.EmitRecordWithBlob(MacroOffsetAbbrev, Record, bytes(MacroOffsets));
22972305
}
22982306
}
22992307

2300-
void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
2308+
void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
2309+
uint64_t MacroOffsetsBase) {
23012310
if (PPRec.local_begin() == PPRec.local_end())
23022311
return;
23032312

@@ -2334,8 +2343,10 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
23342343
(void)++E, ++NumPreprocessingRecords, ++NextPreprocessorEntityID) {
23352344
Record.clear();
23362345

2346+
uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
2347+
assert((Offset >> 32) == 0 && "Preprocessed entity offset too large");
23372348
PreprocessedEntityOffsets.push_back(
2338-
PPEntityOffset((*E)->getSourceRange(), Stream.GetCurrentBitNo()));
2349+
PPEntityOffset((*E)->getSourceRange(), Offset));
23392350

23402351
if (auto *MD = dyn_cast<MacroDefinitionRecord>(*E)) {
23412352
// Record this macro definition's ID.
@@ -5144,7 +5155,7 @@ MacroID ASTWriter::getMacroID(MacroInfo *MI) {
51445155
return MacroIDs[MI];
51455156
}
51465157

5147-
uint64_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) {
5158+
uint32_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) {
51485159
return IdentMacroDirectivesOffsetMap.lookup(Name);
51495160
}
51505161

clang/lib/Serialization/ASTWriterDecl.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -2434,12 +2434,12 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
24342434
SourceLocation Loc = D->getLocation();
24352435
unsigned Index = ID - FirstDeclID;
24362436
if (DeclOffsets.size() == Index)
2437-
DeclOffsets.push_back(DeclOffset(Loc, Offset));
2437+
DeclOffsets.emplace_back(Loc, Offset);
24382438
else if (DeclOffsets.size() < Index) {
24392439
// FIXME: Can/should this happen?
24402440
DeclOffsets.resize(Index+1);
24412441
DeclOffsets[Index].setLocation(Loc);
2442-
DeclOffsets[Index].BitOffset = Offset;
2442+
DeclOffsets[Index].setBitOffset(Offset);
24432443
} else {
24442444
llvm_unreachable("declarations should be emitted in ID order");
24452445
}

0 commit comments

Comments
 (0)