From aa6fbabeecd08de504640f3832e0299d65d78aca Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Fri, 8 Jan 2021 19:15:50 -0800 Subject: [PATCH 1/6] Deduplicate strings in binlogs When writing out a binary log we now deduplicate strings and dictionaries. This results in a significant performance increase and binlog size reduction. Performance increase is about 2x on average, size reduction is about 4x on average, but up to 30x for large binlogs. Add two new record kinds: String and NameValueList. A string record is written the first time we encounter a string we need to serialize. The next time we see the string we only write its index in the total list of strings. Similarly, NameValueList is a list of key and value strings, used for Properties, environment variables and Item metadata. The first time we're writing out a list we write a record, and subsequent times just the index. This keeps the binlog format streaming, so if the logging is interrupted midway, we will be able to read everything up to that point. We do not hold on to strings we encountered. Instead we hash them and only preserve the hash code. We rely on the hash function resulting in no collisions, otherwise it could happen that a random string in a binlog would be substituted instead of another one. The hashtables do not significantly add to MSBuild memory usage (20-30 MB for our largest binlogs). FNV-1a (64-bit hash size) was a carefully chosen hash function for its simplicity, performance, and lack of collisions on all binlogs tested so far. 32-bit hash size (such as string.GetHashCode() was not sufficient and resulted in ~800 collisions for our largest binlog with 2.7 million strings. This change includes other performance improvements such as inserting a BufferedStream around the stream we're reading or writing. This results in a significant performance improvement. We introduce a new StringStorage data structure in the binlog reader, for storing the strings on disk instead of reading them all into memory. Strings are loaded into memory on demand. This prevents OOM in 32-bit MSBuild processes when playing back large binlogs. This keeps the memory usage relatively flat when reading. --- ref/Microsoft.Build/net/Microsoft.Build.cs | 3 +- .../netstandard/Microsoft.Build.cs | 3 +- .../BuildEventArgsSerialization_Tests.cs | 4 +- .../BinaryLogger/BinaryLogRecordKind.cs | 2 + .../BinaryLogReplayEventSource.cs | 9 +- .../Logging/BinaryLogger/BinaryLogger.cs | 11 +- .../BinaryLogger/BuildEventArgsReader.cs | 445 ++++++++++++--- .../BinaryLogger/BuildEventArgsWriter.cs | 528 ++++++++++++++---- 8 files changed, 821 insertions(+), 184 deletions(-) diff --git a/ref/Microsoft.Build/net/Microsoft.Build.cs b/ref/Microsoft.Build/net/Microsoft.Build.cs index 9d481a020ac..951fa5884dc 100644 --- a/ref/Microsoft.Build/net/Microsoft.Build.cs +++ b/ref/Microsoft.Build/net/Microsoft.Build.cs @@ -1591,9 +1591,10 @@ public sealed partial class BinaryLogReplayEventSource : Microsoft.Build.Logging public void Replay(string sourceFilePath) { } public void Replay(string sourceFilePath, System.Threading.CancellationToken cancellationToken) { } } - public partial class BuildEventArgsReader + public partial class BuildEventArgsReader : System.IDisposable { public BuildEventArgsReader(System.IO.BinaryReader binaryReader, int fileFormatVersion) { } + public void Dispose() { } public Microsoft.Build.Framework.BuildEventArgs Read() { throw null; } } public delegate void ColorResetter(); diff --git a/ref/Microsoft.Build/netstandard/Microsoft.Build.cs b/ref/Microsoft.Build/netstandard/Microsoft.Build.cs index 705dfafe932..96374243b39 100644 --- a/ref/Microsoft.Build/netstandard/Microsoft.Build.cs +++ b/ref/Microsoft.Build/netstandard/Microsoft.Build.cs @@ -1585,9 +1585,10 @@ public sealed partial class BinaryLogReplayEventSource : Microsoft.Build.Logging public void Replay(string sourceFilePath) { } public void Replay(string sourceFilePath, System.Threading.CancellationToken cancellationToken) { } } - public partial class BuildEventArgsReader + public partial class BuildEventArgsReader : System.IDisposable { public BuildEventArgsReader(System.IO.BinaryReader binaryReader, int fileFormatVersion) { } + public void Dispose() { } public Microsoft.Build.Framework.BuildEventArgs Read() { throw null; } } public delegate void ColorResetter(); diff --git a/src/Build.UnitTests/BuildEventArgsSerialization_Tests.cs b/src/Build.UnitTests/BuildEventArgsSerialization_Tests.cs index ca2de8f8f3b..afd56aea915 100644 --- a/src/Build.UnitTests/BuildEventArgsSerialization_Tests.cs +++ b/src/Build.UnitTests/BuildEventArgsSerialization_Tests.cs @@ -513,7 +513,7 @@ public void ReadingCorruptedStreamThrows() memoryStream.Position = 0; var binaryReader = new BinaryReader(memoryStream); - var buildEventArgsReader = new BuildEventArgsReader(binaryReader, BinaryLogger.FileFormatVersion); + using var buildEventArgsReader = new BuildEventArgsReader(binaryReader, BinaryLogger.FileFormatVersion); Assert.Throws(() => buildEventArgsReader.Read()); } @@ -570,7 +570,7 @@ private void Roundtrip(T args, params Func[] fieldsToCompare) memoryStream.Position = 0; var binaryReader = new BinaryReader(memoryStream); - var buildEventArgsReader = new BuildEventArgsReader(binaryReader, BinaryLogger.FileFormatVersion); + using var buildEventArgsReader = new BuildEventArgsReader(binaryReader, BinaryLogger.FileFormatVersion); var deserializedArgs = (T)buildEventArgsReader.Read(); Assert.Equal(length, memoryStream.Position); diff --git a/src/Build/Logging/BinaryLogger/BinaryLogRecordKind.cs b/src/Build/Logging/BinaryLogger/BinaryLogRecordKind.cs index 61ad2576bb6..d8c90800210 100644 --- a/src/Build/Logging/BinaryLogger/BinaryLogRecordKind.cs +++ b/src/Build/Logging/BinaryLogger/BinaryLogRecordKind.cs @@ -25,5 +25,7 @@ internal enum BinaryLogRecordKind UninitializedPropertyRead, EnvironmentVariableRead, PropertyInitialValueSet, + NameValueList, + String, } } diff --git a/src/Build/Logging/BinaryLogger/BinaryLogReplayEventSource.cs b/src/Build/Logging/BinaryLogger/BinaryLogReplayEventSource.cs index 264eba384ca..c6456c1a759 100644 --- a/src/Build/Logging/BinaryLogger/BinaryLogReplayEventSource.cs +++ b/src/Build/Logging/BinaryLogger/BinaryLogReplayEventSource.cs @@ -33,7 +33,12 @@ public void Replay(string sourceFilePath, CancellationToken cancellationToken) using (var stream = new FileStream(sourceFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) { var gzipStream = new GZipStream(stream, CompressionMode.Decompress, leaveOpen: true); - var binaryReader = new BinaryReader(gzipStream); + + // wrapping the GZipStream in a buffered stream significantly improves performance + // and the max throughput is reached with a 32K buffer. See details here: + // https://github.com/dotnet/runtime/issues/39233#issuecomment-745598847 + var bufferedStream = new BufferedStream(gzipStream, 32768); + var binaryReader = new BinaryReader(bufferedStream); int fileFormatVersion = binaryReader.ReadInt32(); @@ -45,7 +50,7 @@ public void Replay(string sourceFilePath, CancellationToken cancellationToken) throw new NotSupportedException(text); } - var reader = new BuildEventArgsReader(binaryReader, fileFormatVersion); + using var reader = new BuildEventArgsReader(binaryReader, fileFormatVersion); while (true) { if (cancellationToken.IsCancellationRequested) diff --git a/src/Build/Logging/BinaryLogger/BinaryLogger.cs b/src/Build/Logging/BinaryLogger/BinaryLogger.cs index 8af5b386041..ab021e133c8 100644 --- a/src/Build/Logging/BinaryLogger/BinaryLogger.cs +++ b/src/Build/Logging/BinaryLogger/BinaryLogger.cs @@ -35,7 +35,13 @@ public sealed class BinaryLogger : ILogger // - This was used in a now-reverted change but is the same as 9. // version 9: // - new record kinds: EnvironmentVariableRead, PropertyReassignment, UninitializedPropertyRead - internal const int FileFormatVersion = 9; + // version 10: + // - new record kinds: + // * String - deduplicate strings by hashing and write a string record before it's used + // * NameValueList - deduplicate arrays of name-value pairs such as properties, items and metadata + // in a separate record and refer to those records from regular records + // where a list used to be written in-place + internal const int FileFormatVersion = 10; private Stream stream; private BinaryWriter binaryWriter; @@ -137,6 +143,7 @@ public void Initialize(IEventSource eventSource) } stream = new GZipStream(stream, CompressionLevel.Optimal); + stream = new BufferedStream(stream, bufferSize: 32768); binaryWriter = new BinaryWriter(stream); eventArgsWriter = new BuildEventArgsWriter(binaryWriter); @@ -175,8 +182,8 @@ public void Shutdown() { eventArgsWriter.WriteBlob(BinaryLogRecordKind.ProjectImportArchive, projectImportsCollector.GetAllBytes()); } - projectImportsCollector.Close(); + projectImportsCollector.Close(); projectImportsCollector = null; } diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs index 5ee2af9780c..c565119258d 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.IO; using System.Reflection; +using System.Text; using Microsoft.Build.Framework; using Microsoft.Build.Framework.Profiler; @@ -11,11 +12,30 @@ namespace Microsoft.Build.Logging /// /// Deserializes and returns BuildEventArgs-derived objects from a BinaryReader /// - public class BuildEventArgsReader + public class BuildEventArgsReader : IDisposable { private readonly BinaryReader binaryReader; private readonly int fileFormatVersion; + /// + /// A list of string records we've encountered so far. If it's a small string, it will be the string directly. + /// If it's a large string, it will be a pointer into a temporary page file where the string content will be + /// written out to. This is necessary so we don't keep all the strings in memory when reading large binlogs. + /// We will OOM otherwise. + /// + private readonly List stringRecords = new List(); + + /// + /// A list of dictionaries we've encountered so far. Dictionaries are referred to by their order in this list. + /// + private readonly List<(int keyIndex, int valueIndex)[]> nameValueListRecords = new List<(int, int)[]>(); + + /// + /// A "page-file" for storing strings we've read so far. Keeping them in memory would OOM the 32-bit MSBuild + /// when reading large binlogs. This is a no-op in a 64-bit process. + /// + private StringStorage stringStorage = new StringStorage(); + // reflection is needed to set these three fields because public constructors don't provide // a way to set these from the outside private static FieldInfo buildEventArgsFieldThreadId = @@ -36,6 +56,15 @@ public BuildEventArgsReader(BinaryReader binaryReader, int fileFormatVersion) this.fileFormatVersion = fileFormatVersion; } + public void Dispose() + { + if (stringStorage != null) + { + stringStorage.Dispose(); + stringStorage = null; + } + } + /// /// Raised when the log reader encounters a binary blob embedded in the stream. /// The arguments include the blob kind and the byte buffer with the contents. @@ -49,9 +78,23 @@ public BuildEventArgs Read() { BinaryLogRecordKind recordKind = (BinaryLogRecordKind)ReadInt32(); - while (IsBlob(recordKind)) + // Skip over data storage records since they don't result in a BuildEventArgs. + // just ingest their data and continue. + while (IsAuxiliaryRecord(recordKind)) { - ReadBlob(recordKind); + // these are ordered by commonality + if (recordKind == BinaryLogRecordKind.String) + { + ReadStringRecord(); + } + else if (recordKind == BinaryLogRecordKind.NameValueList) + { + ReadNameValueList(); + } + else if (recordKind == BinaryLogRecordKind.ProjectImportArchive) + { + ReadBlob(recordKind); + } recordKind = (BinaryLogRecordKind)ReadInt32(); } @@ -131,12 +174,11 @@ public BuildEventArgs Read() return result; } - /// - /// For now it's just the ProjectImportArchive. - /// - private static bool IsBlob(BinaryLogRecordKind recordKind) + private static bool IsAuxiliaryRecord(BinaryLogRecordKind recordKind) { - return recordKind == BinaryLogRecordKind.ProjectImportArchive; + return recordKind == BinaryLogRecordKind.String + || recordKind == BinaryLogRecordKind.NameValueList + || recordKind == BinaryLogRecordKind.ProjectImportArchive; } private void ReadBlob(BinaryLogRecordKind kind) @@ -146,6 +188,52 @@ private void ReadBlob(BinaryLogRecordKind kind) OnBlobRead?.Invoke(kind, bytes); } + private void ReadNameValueList() + { + int count = ReadInt32(); + + var list = new (int, int)[count]; + for (int i = 0; i < count; i++) + { + int key = ReadInt32(); + int value = ReadInt32(); + list[i] = (key, value); + } + + nameValueListRecords.Add(list); + } + + private IDictionary GetNameValueList(int id) + { + id -= BuildEventArgsWriter.NameValueRecordStartIndex; + if (id >= 0 && id < nameValueListRecords.Count) + { + var list = nameValueListRecords[id]; + + var dictionary = new Dictionary(list.Length); + for (int i = 0; i < list.Length; i++) + { + string key = GetStringFromRecord(list[i].keyIndex); + string value = GetStringFromRecord(list[i].valueIndex); + if (key != null) + { + dictionary[key] = value; + } + } + + return dictionary; + } + + return new Dictionary(); + } + + private void ReadStringRecord() + { + string text = ReadString(); + object storedString = stringStorage.Add(text); + stringRecords.Add(storedString); + } + private BuildEventArgs ReadProjectImportedEventArgs() { var fields = ReadBuildEventArgsFields(); @@ -232,7 +320,7 @@ private BuildEventArgs ReadBuildFinishedEventArgs() private BuildEventArgs ReadProjectEvaluationStartedEventArgs() { var fields = ReadBuildEventArgsFields(); - var projectFile = ReadString(); + var projectFile = ReadDeduplicatedString(); var e = new ProjectEvaluationStartedEventArgs(fields.Message) { @@ -245,7 +333,7 @@ private BuildEventArgs ReadProjectEvaluationStartedEventArgs() private BuildEventArgs ReadProjectEvaluationFinishedEventArgs() { var fields = ReadBuildEventArgsFields(); - var projectFile = ReadString(); + var projectFile = ReadDeduplicatedString(); var e = new ProjectEvaluationFinishedEventArgs(fields.Message) { @@ -264,8 +352,11 @@ private BuildEventArgs ReadProjectEvaluationFinishedEventArgs() var d = new Dictionary(count); for (int i = 0; i < count; i++) { - d.Add(ReadEvaluationLocation(), ReadProfiledLocation()); + var evaluationLocation = ReadEvaluationLocation(); + var profiledLocation = ReadProfiledLocation(); + d[evaluationLocation] = profiledLocation; } + e.ProfilerResult = new ProfilerResult(d); } } @@ -284,10 +375,10 @@ private BuildEventArgs ReadProjectStartedEventArgs() var projectFile = ReadOptionalString(); var projectId = ReadInt32(); - var targetNames = ReadString(); + var targetNames = ReadDeduplicatedString(); var toolsVersion = ReadOptionalString(); - Dictionary globalProperties = null; + IDictionary globalProperties = null; if (fileFormatVersion > 6) { @@ -298,7 +389,7 @@ private BuildEventArgs ReadProjectStartedEventArgs() } var propertyList = ReadPropertyList(); - var itemList = ReadItems(); + var itemList = ReadProjectItems(); var e = new ProjectStartedEventArgs( projectId, @@ -361,7 +452,7 @@ private BuildEventArgs ReadTargetFinishedEventArgs() var projectFile = ReadOptionalString(); var targetFile = ReadOptionalString(); var targetName = ReadOptionalString(); - var targetOutputItemList = ReadItemList(); + var targetOutputItemList = ReadTaskItemList(); var e = new TargetFinishedEventArgs( fields.Message, @@ -525,7 +616,7 @@ private BuildEventArgs ReadEnvironmentVariableReadEventArgs() var fields = ReadBuildEventArgsFields(); var importance = (MessageImportance)ReadInt32(); - var environmentVariableName = ReadString(); + var environmentVariableName = ReadDeduplicatedString(); var e = new EnvironmentVariableReadEventArgs( environmentVariableName, @@ -542,10 +633,10 @@ private BuildEventArgs ReadPropertyReassignmentEventArgs() { var fields = ReadBuildEventArgsFields(); var importance = (MessageImportance)ReadInt32(); - string propertyName = ReadString(); - string previousValue = ReadString(); - string newValue = ReadString(); - string location = ReadString(); + string propertyName = ReadDeduplicatedString(); + string previousValue = ReadDeduplicatedString(); + string newValue = ReadDeduplicatedString(); + string location = ReadDeduplicatedString(); var e = new PropertyReassignmentEventArgs( propertyName, @@ -565,7 +656,7 @@ private BuildEventArgs ReadUninitializedPropertyReadEventArgs() { var fields = ReadBuildEventArgsFields(); var importance = (MessageImportance)ReadInt32(); - string propertyName = ReadString(); + string propertyName = ReadDeduplicatedString(); var e = new UninitializedPropertyReadEventArgs( propertyName, @@ -582,9 +673,9 @@ private BuildEventArgs ReadPropertyInitialValueSetEventArgs() { var fields = ReadBuildEventArgsFields(); var importance = (MessageImportance)ReadInt32(); - string propertyName = ReadString(); - string propertyValue = ReadString(); - string propertySource = ReadString(); + string propertyName = ReadDeduplicatedString(); + string propertyValue = ReadDeduplicatedString(); + string propertySource = ReadDeduplicatedString(); var e = new PropertyInitialValueSetEventArgs( propertyName, @@ -625,7 +716,7 @@ private BuildEventArgsFields ReadBuildEventArgsFields() if ((flags & BuildEventArgsFieldFlags.Message) != 0) { - result.Message = ReadString(); + result.Message = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.BuildEventContext) != 0) @@ -640,12 +731,12 @@ private BuildEventArgsFields ReadBuildEventArgsFields() if ((flags & BuildEventArgsFieldFlags.HelpHeyword) != 0) { - result.HelpKeyword = ReadString(); + result.HelpKeyword = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.SenderName) != 0) { - result.SenderName = ReadString(); + result.SenderName = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.Timestamp) != 0) @@ -655,22 +746,22 @@ private BuildEventArgsFields ReadBuildEventArgsFields() if ((flags & BuildEventArgsFieldFlags.Subcategory) != 0) { - result.Subcategory = ReadString(); + result.Subcategory = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.Code) != 0) { - result.Code = ReadString(); + result.Code = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.File) != 0) { - result.File = ReadString(); + result.File = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.ProjectFile) != 0) { - result.ProjectFile = ReadString(); + result.ProjectFile = ReadDeduplicatedString(); } if ((flags & BuildEventArgsFieldFlags.LineNumber) != 0) @@ -761,16 +852,33 @@ private BuildEventContext ReadBuildEventContext() return result; } - private Dictionary ReadStringDictionary() + private IDictionary ReadStringDictionary() { - int count = ReadInt32(); + if (fileFormatVersion < 10) + { + return ReadLegacyStringDictionary(); + } + + int index = ReadInt32(); + if (index == 0) + { + return null; + } + var record = GetNameValueList(index); + return record; + } + + private IDictionary ReadLegacyStringDictionary() + { + int count = ReadInt32(); if (count == 0) { return null; } - Dictionary result = new Dictionary(count); + var result = new Dictionary(count); + for (int i = 0; i < count; i++) { string key = ReadString(); @@ -783,16 +891,29 @@ private BuildEventContext ReadBuildEventContext() private class TaskItem : ITaskItem { + private static readonly Dictionary emptyMetadata = new Dictionary(); + public string ItemSpec { get; set; } - public Dictionary Metadata { get; } = new Dictionary(); + public IDictionary Metadata { get; } + + public TaskItem() + { + Metadata = new Dictionary(); + } + + public TaskItem(string itemSpec, IDictionary metadata) + { + ItemSpec = itemSpec; + Metadata = metadata ?? emptyMetadata; + } public int MetadataCount => Metadata.Count; - public ICollection MetadataNames => Metadata.Keys; + public ICollection MetadataNames => (ICollection)Metadata.Keys; public IDictionary CloneCustomMetadata() { - return Metadata; + return (IDictionary)Metadata; } public void CopyMetadataTo(ITaskItem destinationItem) @@ -814,25 +935,23 @@ public void SetMetadata(string metadataName, string metadataValue) { throw new NotImplementedException(); } - } - private ITaskItem ReadItem() - { - var item = new TaskItem(); - item.ItemSpec = ReadString(); - - int count = ReadInt32(); - for (int i = 0; i < count; i++) + public override string ToString() { - string name = ReadString(); - string value = ReadString(); - item.Metadata[name] = value; + return $"{ItemSpec} Metadata: {MetadataCount}"; } + } + + private ITaskItem ReadTaskItem() + { + string itemSpec = ReadDeduplicatedString(); + var metadata = ReadStringDictionary(); - return item; + var taskItem = new TaskItem(itemSpec, metadata); + return taskItem; } - private IEnumerable ReadItems() + private IEnumerable ReadProjectItems() { int count = ReadInt32(); if (count == 0) @@ -840,19 +959,40 @@ private IEnumerable ReadItems() return null; } - var list = new List(count); + List list; - for (int i = 0; i < count; i++) + // starting with format version 10 project items are grouped by name + // so we only have to write the name once, and then the count of items + // with that name. When reading a legacy binlog we need to read the + // old style flat list where the name is duplicated for each item. + if (fileFormatVersion < 10) { - string key = ReadString(); - ITaskItem item = ReadItem(); - list.Add(new DictionaryEntry(key, item)); + list = new List(count); + for (int i = 0; i < count; i++) + { + string itemName = ReadString(); + ITaskItem item = ReadTaskItem(); + list.Add(new DictionaryEntry(itemName, item)); + } + } + else + { + list = new List(); + for (int i = 0; i < count; i++) + { + string itemName = ReadDeduplicatedString(); + var items = ReadTaskItemList(); + foreach (var item in items) + { + list.Add(new DictionaryEntry(itemName, item)); + } + } } return list; } - private IEnumerable ReadItemList() + private IEnumerable ReadTaskItemList() { int count = ReadInt32(); if (count == 0) @@ -864,28 +1004,68 @@ private IEnumerable ReadItemList() for (int i = 0; i < count; i++) { - ITaskItem item = ReadItem(); + ITaskItem item = ReadTaskItem(); list.Add(item); } return list; } + private string ReadString() + { + return binaryReader.ReadString(); + } + private string ReadOptionalString() { - if (ReadBoolean()) + if (fileFormatVersion < 10) { - return ReadString(); + if (ReadBoolean()) + { + return ReadString(); + } + else + { + return null; + } } - else + + return ReadDeduplicatedString(); + } + + private string ReadDeduplicatedString() + { + if (fileFormatVersion < 10) { - return null; + return ReadString(); } + + int index = ReadInt32(); + return GetStringFromRecord(index); } - private string ReadString() + private string GetStringFromRecord(int index) { - return binaryReader.ReadString(); + if (index == 0) + { + return null; + } + else if (index == 1) + { + return string.Empty; + } + + // we reserve numbers 2-9 for future use. + // the writer assigns 10 as the index of the first string + index -= BuildEventArgsWriter.StringStartIndex; + if (index >= 0 && index < this.stringRecords.Count) + { + object storedString = stringRecords[index]; + string result = stringStorage.Get(storedString); + return result; + } + + return string.Empty; } private int ReadInt32() @@ -972,10 +1152,149 @@ private EvaluationLocation ReadEvaluationLocation() { parentId = ReadInt64(); } + return new EvaluationLocation(id, parentId, evaluationPass, evaluationDescription, file, line, elementName, description, kind); } return new EvaluationLocation(0, null, evaluationPass, evaluationDescription, file, line, elementName, description, kind); } + + /// + /// Locates the string in the page file. + /// + internal class StringPosition + { + /// + /// Offset in the file. + /// + public long FilePosition; + + /// + /// The length of the string in chars (not bytes). + /// + public int StringLength; + } + + /// + /// Stores large strings in a temp file on disk, to avoid keeping all strings in memory. + /// Only creates a file for 32-bit MSBuild.exe, just returns the string directly on 64-bit. + /// + internal class StringStorage : IDisposable + { + private string filePath; + private FileStream stream; + private StreamWriter streamWriter; + private StreamReader streamReader; + private StringBuilder stringBuilder; + + public const int StringSizeThreshold = 1024; + + public StringStorage() + { + if (!Environment.Is64BitProcess) + { + filePath = Path.GetTempFileName(); + var utf8noBom = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false); + stream = new FileStream( + filePath, + FileMode.OpenOrCreate, + FileAccess.ReadWrite, + FileShare.None, + bufferSize: 4096, + FileOptions.RandomAccess | FileOptions.DeleteOnClose); + streamWriter = new StreamWriter(stream, utf8noBom, 65536); + streamWriter.AutoFlush = true; + streamReader = new StreamReader(stream, utf8noBom); + stringBuilder = new StringBuilder(); + } + } + + private long totalAllocatedShortStrings = 0; + + public object Add(string text) + { + if (filePath == null) + { + // on 64-bit, we have as much memory as we want + // so no need to write to the file at all + return text; + } + + // Tradeoff between not crashing with OOM on large binlogs and + // keeping the playback of smaller binlogs relatively fast. + // It is slow to store all small strings in the file and constantly + // seek to retrieve them. Instead we'll keep storing small strings + // in memory until we allocate 2 GB. After that, all strings go to + // the file. + // Win-win: small binlog playback is fast and large binlog playback + // doesn't OOM. + if (text.Length <= StringSizeThreshold && totalAllocatedShortStrings < 2_000_000_000) + { + // note that we write strings in UTF8 so we don't need to multiply by 2 as chars + // will be 1 byte on average + totalAllocatedShortStrings += text.Length; + return text; + } + + var stringPosition = new StringPosition(); + + stringPosition.FilePosition = stream.Position; + + streamWriter.Write(text); + + stringPosition.StringLength = text.Length; + return stringPosition; + } + + public string Get(object storedString) + { + if (storedString is string text) + { + return text; + } + + var position = (StringPosition)storedString; + + stream.Position = position.FilePosition; + stringBuilder.Length = position.StringLength; + for (int i = 0; i < position.StringLength; i++) + { + char ch = (char)streamReader.Read(); + stringBuilder[i] = ch; + } + + stream.Position = stream.Length; + streamReader.DiscardBufferedData(); + + string result = stringBuilder.ToString(); + stringBuilder.Clear(); + return result; + } + + public void Dispose() + { + try + { + if (streamWriter != null) + { + streamWriter.Dispose(); + streamWriter = null; + } + + if (stream != null) + { + stream.Dispose(); + stream = null; + } + } + catch + { + // The StringStorage class is not crucial for other functionality and if + // there are exceptions when closing the temp file, it's too late to do anything about it. + // Since we don't want to disrupt anything and the file is in the TEMP directory, it will + // get cleaned up at some point anyway. + } + } + } } } diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs index bc8f203903e..8c1ba1abaed 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs @@ -7,6 +7,7 @@ using Microsoft.Build.Exceptions; using Microsoft.Build.Framework; using Microsoft.Build.Framework.Profiler; +using Microsoft.Build.Internal; namespace Microsoft.Build.Logging { @@ -15,7 +16,85 @@ namespace Microsoft.Build.Logging /// internal class BuildEventArgsWriter { - private readonly BinaryWriter binaryWriter; + private readonly Stream originalStream; + + /// + /// When writing the current record, first write it to a memory stream, + /// then flush to the originalStream. This is needed so that if we discover + /// that we need to write a string record in the middle of writing the + /// current record, we will write the string record to the original stream + /// and the current record will end up after the string record. + /// + private readonly MemoryStream currentRecordStream; + + /// + /// The binary writer around the originalStream. + /// + private readonly BinaryWriter originalBinaryWriter; + + /// + /// The binary writer around the currentRecordStream. + /// + private readonly BinaryWriter currentRecordWriter; + + /// + /// The binary writer we're currently using. Is pointing at the currentRecordWriter usually, + /// but sometimes we repoint it to the originalBinaryWriter temporarily, when writing string + /// and name-value records. + /// + private BinaryWriter binaryWriter; + + /// + /// Hashtable used for deduplicating strings. When we need to write a string, + /// we check in this hashtable first, and if we've seen the string before, + /// just write out its index. Otherwise write out a string record, and then + /// write the string index. A string record is guaranteed to precede its first + /// usage. + /// The reader will read the string records first and then be able to retrieve + /// a string by its index. This allows us to keep the format streaming instead + /// of writing one giant string table at the end. If a binlog is interrupted + /// we'll be able to use all the information we've discovered thus far. + /// + private readonly Dictionary stringHashes = new Dictionary(); + + /// + /// Hashtable used for deduplicating name-value lists. Same as strings. + /// + private readonly Dictionary nameValueListHashes = new Dictionary(); + + /// + /// Index 0 is null, Index 1 is the empty string. + /// Reserve indices 2-9 for future use. Start indexing actual strings at 10. + /// + internal const int StringStartIndex = 10; + + /// + /// Let's reserve a few indices for future use. + /// + internal const int NameValueRecordStartIndex = 10; + + /// + /// 0 is null, 1 is empty string + /// 2-9 are reserved for future use. + /// Start indexing at 10. + /// + private int stringRecordId = StringStartIndex; + + /// + /// The index of the next record to be written. + /// + private int nameValueRecordId = NameValueRecordStartIndex; + + /// + /// A temporary buffer we use when writing a NameValueList record. Avoids allocating a list each time. + /// + private readonly List> nameValueListBuffer = new List>(1024); + + /// + /// A temporary buffer we use when hashing a NameValueList record. Stores the indices of hashed strings + /// instead of the actual names and values. + /// + private readonly List> nameValueIndexListBuffer = new List>(1024); /// /// Initializes a new instance of BuildEventArgsWriter with a BinaryWriter @@ -23,13 +102,31 @@ internal class BuildEventArgsWriter /// A BinaryWriter to write the BuildEventArgs instances to public BuildEventArgsWriter(BinaryWriter binaryWriter) { - this.binaryWriter = binaryWriter; + this.originalStream = binaryWriter.BaseStream; + + // this doesn't exceed 30K for smaller binlogs so seems like a reasonable + // starting point to avoid reallocations in the common case + this.currentRecordStream = new MemoryStream(65536); + + this.originalBinaryWriter = binaryWriter; + this.currentRecordWriter = new BinaryWriter(currentRecordStream); + + this.binaryWriter = currentRecordWriter; } /// /// Write a provided instance of BuildEventArgs to the BinaryWriter /// public void Write(BuildEventArgs e) + { + WriteCore(e); + + // flush the current record and clear the MemoryStream to prepare for next use + currentRecordStream.WriteTo(originalStream); + currentRecordStream.SetLength(0); + } + + private void WriteCore(BuildEventArgs e) { // the cases are ordered by most used first for performance if (e is BuildMessageEventArgs) @@ -101,9 +198,15 @@ public void Write(BuildEventArgs e) public void WriteBlob(BinaryLogRecordKind kind, byte[] bytes) { + // write the blob directly to the underlying writer, + // bypassing the memory stream + binaryWriter = originalBinaryWriter; + Write(kind); Write(bytes.Length); Write(bytes); + + binaryWriter = currentRecordWriter; } private void Write(BuildStartedEventArgs e) @@ -124,15 +227,15 @@ private void Write(ProjectEvaluationStartedEventArgs e) { Write(BinaryLogRecordKind.ProjectEvaluationStarted); WriteBuildEventArgsFields(e); - Write(e.ProjectFile); + WriteDeduplicatedString(e.ProjectFile); } private void Write(ProjectEvaluationFinishedEventArgs e) { Write(BinaryLogRecordKind.ProjectEvaluationFinished); - + WriteBuildEventArgsFields(e); - Write(e.ProjectFile); + WriteDeduplicatedString(e.ProjectFile); Write(e.ProfilerResult.HasValue); if (e.ProfilerResult.HasValue) @@ -162,11 +265,11 @@ private void Write(ProjectStartedEventArgs e) Write(e.ParentProjectBuildEventContext); } - WriteOptionalString(e.ProjectFile); + WriteDeduplicatedString(e.ProjectFile); Write(e.ProjectId); - Write(e.TargetNames); - WriteOptionalString(e.ToolsVersion); + WriteDeduplicatedString(e.TargetNames); + WriteDeduplicatedString(e.ToolsVersion); if (e.GlobalProperties == null) { @@ -180,14 +283,14 @@ private void Write(ProjectStartedEventArgs e) WriteProperties(e.Properties); - WriteItems(e.Items); + WriteProjectItems(e.Items); } private void Write(ProjectFinishedEventArgs e) { Write(BinaryLogRecordKind.ProjectFinished); WriteBuildEventArgsFields(e); - WriteOptionalString(e.ProjectFile); + WriteDeduplicatedString(e.ProjectFile); Write(e.Succeeded); } @@ -195,11 +298,11 @@ private void Write(TargetStartedEventArgs e) { Write(BinaryLogRecordKind.TargetStarted); WriteBuildEventArgsFields(e); - WriteOptionalString(e.TargetName); - WriteOptionalString(e.ProjectFile); - WriteOptionalString(e.TargetFile); - WriteOptionalString(e.ParentTarget); - Write((int) e.BuildReason); + WriteDeduplicatedString(e.TargetName); + WriteDeduplicatedString(e.ProjectFile); + WriteDeduplicatedString(e.TargetFile); + WriteDeduplicatedString(e.ParentTarget); + Write((int)e.BuildReason); } private void Write(TargetFinishedEventArgs e) @@ -207,19 +310,19 @@ private void Write(TargetFinishedEventArgs e) Write(BinaryLogRecordKind.TargetFinished); WriteBuildEventArgsFields(e); Write(e.Succeeded); - WriteOptionalString(e.ProjectFile); - WriteOptionalString(e.TargetFile); - WriteOptionalString(e.TargetName); - WriteItemList(e.TargetOutputs); + WriteDeduplicatedString(e.ProjectFile); + WriteDeduplicatedString(e.TargetFile); + WriteDeduplicatedString(e.TargetName); + WriteTaskItemList(e.TargetOutputs); } private void Write(TaskStartedEventArgs e) { Write(BinaryLogRecordKind.TaskStarted); WriteBuildEventArgsFields(e); - WriteOptionalString(e.TaskName); - WriteOptionalString(e.ProjectFile); - WriteOptionalString(e.TaskFile); + WriteDeduplicatedString(e.TaskName); + WriteDeduplicatedString(e.ProjectFile); + WriteDeduplicatedString(e.TaskFile); } private void Write(TaskFinishedEventArgs e) @@ -227,19 +330,19 @@ private void Write(TaskFinishedEventArgs e) Write(BinaryLogRecordKind.TaskFinished); WriteBuildEventArgsFields(e); Write(e.Succeeded); - WriteOptionalString(e.TaskName); - WriteOptionalString(e.ProjectFile); - WriteOptionalString(e.TaskFile); + WriteDeduplicatedString(e.TaskName); + WriteDeduplicatedString(e.ProjectFile); + WriteDeduplicatedString(e.TaskFile); } private void Write(BuildErrorEventArgs e) { Write(BinaryLogRecordKind.Error); WriteBuildEventArgsFields(e); - WriteOptionalString(e.Subcategory); - WriteOptionalString(e.Code); - WriteOptionalString(e.File); - WriteOptionalString(e.ProjectFile); + WriteDeduplicatedString(e.Subcategory); + WriteDeduplicatedString(e.Code); + WriteDeduplicatedString(e.File); + WriteDeduplicatedString(e.ProjectFile); Write(e.LineNumber); Write(e.ColumnNumber); Write(e.EndLineNumber); @@ -250,10 +353,10 @@ private void Write(BuildWarningEventArgs e) { Write(BinaryLogRecordKind.Warning); WriteBuildEventArgsFields(e); - WriteOptionalString(e.Subcategory); - WriteOptionalString(e.Code); - WriteOptionalString(e.File); - WriteOptionalString(e.ProjectFile); + WriteDeduplicatedString(e.Subcategory); + WriteDeduplicatedString(e.Code); + WriteDeduplicatedString(e.File); + WriteDeduplicatedString(e.ProjectFile); Write(e.LineNumber); Write(e.ColumnNumber); Write(e.EndLineNumber); @@ -319,17 +422,17 @@ private void Write(ProjectImportedEventArgs e) Write(BinaryLogRecordKind.ProjectImported); WriteMessageFields(e); Write(e.ImportIgnored); - WriteOptionalString(e.ImportedProjectFile); - WriteOptionalString(e.UnexpandedProject); + WriteDeduplicatedString(e.ImportedProjectFile); + WriteDeduplicatedString(e.UnexpandedProject); } private void Write(TargetSkippedEventArgs e) { Write(BinaryLogRecordKind.TargetSkipped); WriteMessageFields(e); - WriteOptionalString(e.TargetFile); - WriteOptionalString(e.TargetName); - WriteOptionalString(e.ParentTarget); + WriteDeduplicatedString(e.TargetFile); + WriteDeduplicatedString(e.TargetName); + WriteDeduplicatedString(e.ParentTarget); Write((int)e.BuildReason); } @@ -343,41 +446,41 @@ private void Write(PropertyReassignmentEventArgs e) { Write(BinaryLogRecordKind.PropertyReassignment); WriteMessageFields(e); - Write(e.PropertyName); - Write(e.PreviousValue); - Write(e.NewValue); - Write(e.Location); + WriteDeduplicatedString(e.PropertyName); + WriteDeduplicatedString(e.PreviousValue); + WriteDeduplicatedString(e.NewValue); + WriteDeduplicatedString(e.Location); } private void Write(UninitializedPropertyReadEventArgs e) { Write(BinaryLogRecordKind.UninitializedPropertyRead); WriteMessageFields(e); - Write(e.PropertyName); + WriteDeduplicatedString(e.PropertyName); } private void Write(PropertyInitialValueSetEventArgs e) { Write(BinaryLogRecordKind.PropertyInitialValueSet); WriteMessageFields(e); - Write(e.PropertyName); - Write(e.PropertyValue); - Write(e.PropertySource); + WriteDeduplicatedString(e.PropertyName); + WriteDeduplicatedString(e.PropertyValue); + WriteDeduplicatedString(e.PropertySource); } private void Write(EnvironmentVariableReadEventArgs e) { Write(BinaryLogRecordKind.EnvironmentVariableRead); WriteMessageFields(e); - Write(e.EnvironmentVariableName); + WriteDeduplicatedString(e.EnvironmentVariableName); } private void Write(TaskCommandLineEventArgs e) { Write(BinaryLogRecordKind.TaskCommandLine); WriteMessageFields(e); - WriteOptionalString(e.CommandLine); - WriteOptionalString(e.TaskName); + WriteDeduplicatedString(e.CommandLine); + WriteDeduplicatedString(e.TaskName); } private void WriteBuildEventArgsFields(BuildEventArgs e) @@ -391,7 +494,7 @@ private void WriteBaseFields(BuildEventArgs e, BuildEventArgsFieldFlags flags) { if ((flags & BuildEventArgsFieldFlags.Message) != 0) { - Write(e.Message); + WriteDeduplicatedString(e.Message); } if ((flags & BuildEventArgsFieldFlags.BuildEventContext) != 0) @@ -406,12 +509,12 @@ private void WriteBaseFields(BuildEventArgs e, BuildEventArgsFieldFlags flags) if ((flags & BuildEventArgsFieldFlags.HelpHeyword) != 0) { - Write(e.HelpKeyword); + WriteDeduplicatedString(e.HelpKeyword); } if ((flags & BuildEventArgsFieldFlags.SenderName) != 0) { - Write(e.SenderName); + WriteDeduplicatedString(e.SenderName); } if ((flags & BuildEventArgsFieldFlags.Timestamp) != 0) @@ -431,22 +534,22 @@ private void WriteMessageFields(BuildMessageEventArgs e) if ((flags & BuildEventArgsFieldFlags.Subcategory) != 0) { - Write(e.Subcategory); + WriteDeduplicatedString(e.Subcategory); } if ((flags & BuildEventArgsFieldFlags.Code) != 0) { - Write(e.Code); + WriteDeduplicatedString(e.Code); } if ((flags & BuildEventArgsFieldFlags.File) != 0) { - Write(e.File); + WriteDeduplicatedString(e.File); } if ((flags & BuildEventArgsFieldFlags.ProjectFile) != 0) { - Write(e.ProjectFile); + WriteDeduplicatedString(e.ProjectFile); } if ((flags & BuildEventArgsFieldFlags.LineNumber) != 0) @@ -554,25 +657,24 @@ private static BuildEventArgsFieldFlags GetBuildEventArgsFieldFlags(BuildEventAr return flags; } - private void WriteItemList(IEnumerable items) + private void WriteTaskItemList(IEnumerable items) { var taskItems = items as IEnumerable; - if (taskItems != null) + if (taskItems == null) { - Write(taskItems.Count()); - - foreach (var item in taskItems) - { - Write(item); - } - + Write(false); return; } - Write(0); + Write(taskItems.Count()); + + foreach (var item in taskItems) + { + Write(item); + } } - private void WriteItems(IEnumerable items) + private void WriteProjectItems(IEnumerable items) { if (items == null) { @@ -580,29 +682,34 @@ private void WriteItems(IEnumerable items) return; } - var entries = items.OfType() - .Where(e => e.Key is string && e.Value is ITaskItem) + var groups = items + .OfType() + .GroupBy(entry => entry.Key as string, entry => entry.Value as ITaskItem) + .Where(group => !string.IsNullOrEmpty(group.Key)) .ToArray(); - Write(entries.Length); - foreach (DictionaryEntry entry in entries) + Write(groups.Length); + + foreach (var group in groups) { - string key = entry.Key as string; - ITaskItem item = entry.Value as ITaskItem; - Write(key); - Write(item); + WriteDeduplicatedString(group.Key); + WriteTaskItemList(group); } } private void Write(ITaskItem item) { - Write(item.ItemSpec); + WriteDeduplicatedString(item.ItemSpec); + + if (nameValueListBuffer.Count > 0) + { + nameValueListBuffer.Clear(); + } + IDictionary customMetadata = item.CloneCustomMetadata(); - Write(customMetadata.Count); foreach (string metadataName in customMetadata.Keys) { - Write(metadataName); string valueOrError; try @@ -623,8 +730,10 @@ private void Write(ITaskItem item) Debug.Fail(e.ToString()); } - Write(valueOrError); + nameValueListBuffer.Add(new KeyValuePair(metadataName, valueOrError)); } + + WriteNameValueList(); } private void WriteProperties(IEnumerable properties) @@ -635,26 +744,29 @@ private void WriteProperties(IEnumerable properties) return; } + if (nameValueListBuffer.Count > 0) + { + nameValueListBuffer.Clear(); + } + // there are no guarantees that the properties iterator won't change, so // take a snapshot and work with the readonly copy var propertiesArray = properties.OfType().ToArray(); - Write(propertiesArray.Length); - - foreach (DictionaryEntry entry in propertiesArray) + for (int i = 0; i < propertiesArray.Length; i++) { - if (entry.Key is string && entry.Value is string) + DictionaryEntry entry = propertiesArray[i]; + if (entry.Key is string key && entry.Value is string value) { - Write((string)entry.Key); - Write((string)entry.Value); + nameValueListBuffer.Add(new KeyValuePair(key, value)); } else { - // to keep the count accurate - Write(""); - Write(""); + nameValueListBuffer.Add(new KeyValuePair(string.Empty, string.Empty)); } } + + WriteNameValueList(); } private void Write(BuildEventContext buildEventContext) @@ -668,21 +780,98 @@ private void Write(BuildEventContext buildEventContext) Write(buildEventContext.EvaluationId); } - private void Write(IEnumerable> keyValuePairs) + private void Write(IEnumerable> keyValuePairs) { - if (keyValuePairs?.Any() == true) + if (nameValueListBuffer.Count > 0) + { + nameValueListBuffer.Clear(); + } + + if (keyValuePairs != null) { - Write(keyValuePairs.Count()); foreach (var kvp in keyValuePairs) { - Write(kvp.Key.ToString()); - Write(kvp.Value.ToString()); + nameValueListBuffer.Add(kvp); } } - else + + WriteNameValueList(); + } + + private void WriteNameValueList() + { + if (nameValueListBuffer.Count == 0) { - Write(false); + Write((byte)0); + return; + } + + HashKey hash = HashAllStrings(nameValueListBuffer); + if (!nameValueListHashes.TryGetValue(hash, out var recordId)) + { + recordId = nameValueRecordId; + nameValueListHashes[hash] = nameValueRecordId; + + WriteNameValueListRecord(); + + nameValueRecordId += 1; } + + Write(recordId); + } + + /// + /// In the middle of writing the current record we may discover that we want to write another record + /// preceding the current one, specifically the list of names and values we want to reuse in the + /// future. As we are writing the current record to a MemoryStream first, it's OK to temporarily + /// switch to the direct underlying stream and write the NameValueList record first. + /// When the current record is done writing, the MemoryStream will flush to the underlying stream + /// and the current record will end up after the NameValueList record, as desired. + /// + private void WriteNameValueListRecord() + { + // Switch the binaryWriter used by the Write* methods to the direct underlying stream writer. + // We want this record to precede the record we're currently writing to currentRecordWriter + // which is backed by a MemoryStream buffer + binaryWriter = this.originalBinaryWriter; + + Write(BinaryLogRecordKind.NameValueList); + Write(nameValueIndexListBuffer.Count); + for (int i = 0; i < nameValueListBuffer.Count; i++) + { + var kvp = nameValueIndexListBuffer[i]; + Write(kvp.Key); + Write(kvp.Value); + } + + // switch back to continue writing the current record to the memory stream + binaryWriter = this.currentRecordWriter; + } + + /// + /// Compute the total hash of all items in the nameValueList + /// while simultaneously filling the nameValueIndexListBuffer with the individual + /// hashes of the strings, mirroring the strings in the original nameValueList. + /// This helps us avoid hashing strings twice (once to hash the string individually + /// and the second time when hashing it as part of the nameValueList) + /// + private HashKey HashAllStrings(List> nameValueList) + { + HashKey hash = new HashKey(); + + nameValueIndexListBuffer.Clear(); + + for (int i = 0; i < nameValueList.Count; i++) + { + var kvp = nameValueList[i]; + var (keyIndex, keyHash) = HashString(kvp.Key); + var (valueIndex, valueHash) = HashString(kvp.Value); + hash = hash.Add(keyHash); + hash = hash.Add(valueHash); + nameValueIndexListBuffer.Add(new KeyValuePair(keyIndex, valueIndex)); + } + + return hash; } private void Write(BinaryLogRecordKind kind) @@ -718,34 +907,63 @@ private void Write(byte[] bytes) binaryWriter.Write(bytes); } + private void Write(byte b) + { + binaryWriter.Write(b); + } + private void Write(bool boolean) { binaryWriter.Write(boolean); } - private void Write(string text) + private void WriteDeduplicatedString(string text) { - if (text != null) - { - binaryWriter.Write(text); - } - else - { - binaryWriter.Write(false); - } + var (recordId, _) = HashString(text); + Write(recordId); } - private void WriteOptionalString(string text) + /// + /// Hash the string and write a String record if not already hashed. + /// + /// Returns the string record index as well as the hash. + private (int index, HashKey hash) HashString(string text) { if (text == null) { - Write(false); + return (0, default); } - else + else if (text.Length == 0) { - Write(true); - Write(text); + return (1, default); + } + + var hash = new HashKey(text); + if (!stringHashes.TryGetValue(hash, out var recordId)) + { + recordId = stringRecordId; + stringHashes[hash] = stringRecordId; + + WriteStringRecord(text); + + stringRecordId += 1; } + + return (recordId, hash); + } + + private void WriteStringRecord(string text) + { + // Switch the binaryWriter used by the Write* methods to the direct underlying stream writer. + // We want this record to precede the record we're currently writing to currentRecordWriter + // which is backed by a MemoryStream buffer + binaryWriter = this.originalBinaryWriter; + + Write(BinaryLogRecordKind.String); + binaryWriter.Write(text); + + // switch back to continue writing the current record to the memory stream + binaryWriter = this.currentRecordWriter; } private void Write(DateTime timestamp) @@ -761,10 +979,10 @@ private void Write(TimeSpan timeSpan) private void Write(EvaluationLocation item) { - WriteOptionalString(item.ElementName); - WriteOptionalString(item.ElementDescription); - WriteOptionalString(item.EvaluationPassDescription); - WriteOptionalString(item.File); + WriteDeduplicatedString(item.ElementName); + WriteDeduplicatedString(item.ElementDescription); + WriteDeduplicatedString(item.EvaluationPassDescription); + WriteDeduplicatedString(item.File); Write((int)item.Kind); Write((int)item.EvaluationPass); @@ -788,5 +1006,89 @@ private void Write(ProfiledLocation e) Write(e.ExclusiveTime); Write(e.InclusiveTime); } + + internal readonly struct HashKey : IEquatable + { + private readonly ulong value; + + private HashKey(ulong i) + { + value = i; + } + + public HashKey(string text) + { + if (text == null) + { + value = 0; + } + else + { + value = FnvHash64.GetHashCode(text); + } + } + + public static HashKey Combine(HashKey left, HashKey right) + { + return new HashKey(FnvHash64.Combine(left.value, right.value)); + } + + public HashKey Add(HashKey other) => Combine(this, other); + + public bool Equals(HashKey other) + { + return value == other.value; + } + + public override bool Equals(object obj) + { + if (obj is HashKey other) + { + return Equals(other); + } + + return false; + } + + public override int GetHashCode() + { + return unchecked((int)value); + } + + public override string ToString() + { + return value.ToString(); + } + } + + internal static class FnvHash64 + { + public const ulong Offset = 14695981039346656037; + public const ulong Prime = 1099511628211; + + public static ulong GetHashCode(string text) + { + ulong hash = Offset; + + unchecked + { + for (int i = 0; i < text.Length; i++) + { + char ch = text[i]; + hash = (hash ^ ch) * Prime; + } + } + + return hash; + } + + public static ulong Combine(ulong left, ulong right) + { + unchecked + { + return (left ^ right) * Prime; + } + } + } } } From b324bad839816b26f916957152116b5ae73ff510 Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Thu, 14 Jan 2021 16:26:08 -0800 Subject: [PATCH 2/6] Add some comments. --- src/Build/Logging/BinaryLogger/BinaryLogger.cs | 4 ++++ src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Build/Logging/BinaryLogger/BinaryLogger.cs b/src/Build/Logging/BinaryLogger/BinaryLogger.cs index ab021e133c8..f9e9cb0b295 100644 --- a/src/Build/Logging/BinaryLogger/BinaryLogger.cs +++ b/src/Build/Logging/BinaryLogger/BinaryLogger.cs @@ -143,6 +143,10 @@ public void Initialize(IEventSource eventSource) } stream = new GZipStream(stream, CompressionLevel.Optimal); + + // wrapping the GZipStream in a buffered stream significantly improves performance + // and the max throughput is reached with a 32K buffer. See details here: + // https://github.com/dotnet/runtime/issues/39233#issuecomment-745598847 stream = new BufferedStream(stream, bufferSize: 32768); binaryWriter = new BinaryWriter(stream); eventArgsWriter = new BuildEventArgsWriter(binaryWriter); diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs index c565119258d..9e79cf4b51e 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs @@ -1200,8 +1200,11 @@ public StringStorage() FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None, - bufferSize: 4096, + bufferSize: 4096, // 4096 seems to have the best performance on SSD FileOptions.RandomAccess | FileOptions.DeleteOnClose); + + // 65536 has no particular significance, and maybe could be tuned + // but 65536 performs well enough and isn't a lot of memory for a singleton streamWriter = new StreamWriter(stream, utf8noBom, 65536); streamWriter.AutoFlush = true; streamReader = new StreamReader(stream, utf8noBom); From e6ee9a5ac26df58fce81ac18a2d9f91912105189 Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Thu, 14 Jan 2021 18:01:45 -0800 Subject: [PATCH 3/6] Throw exception for invalid string and NameValueTable index --- .../Logging/BinaryLogger/BuildEventArgsReader.cs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs index 9e79cf4b51e..d04a219eea0 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs @@ -16,6 +16,7 @@ public class BuildEventArgsReader : IDisposable { private readonly BinaryReader binaryReader; private readonly int fileFormatVersion; + private long recordNumber = 0; /// /// A list of string records we've encountered so far. If it's a small string, it will be the string directly. @@ -96,6 +97,8 @@ public BuildEventArgs Read() ReadBlob(recordKind); } + recordNumber += 1; + recordKind = (BinaryLogRecordKind)ReadInt32(); } @@ -171,6 +174,8 @@ public BuildEventArgs Read() break; } + recordNumber += 1; + return result; } @@ -224,7 +229,9 @@ private void ReadNameValueList() return dictionary; } - return new Dictionary(); + // this should never happen for valid binlogs + throw new InvalidDataException( + $"NameValueList record number {recordNumber} is invalid: index {id} is not within {stringRecords.Count}."); } private void ReadStringRecord() @@ -1065,7 +1072,9 @@ private string GetStringFromRecord(int index) return result; } - return string.Empty; + // this should never happen for valid binlogs + throw new InvalidDataException( + $"String record number {recordNumber} is invalid: string index {index} is not within {stringRecords.Count}."); } private int ReadInt32() From 9d8f857101e9b67099e64ebe125ca8787d00bf99 Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Sat, 16 Jan 2021 19:14:38 -0800 Subject: [PATCH 4/6] Make some fields readonly. Reduce maximum strings allocated in memory to 2GB (1 billion chars). --- src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs index d04a219eea0..bd9ae6e9481 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsReader.cs @@ -1190,11 +1190,11 @@ internal class StringPosition /// internal class StringStorage : IDisposable { - private string filePath; + private readonly string filePath; private FileStream stream; private StreamWriter streamWriter; - private StreamReader streamReader; - private StringBuilder stringBuilder; + private readonly StreamReader streamReader; + private readonly StringBuilder stringBuilder; public const int StringSizeThreshold = 1024; @@ -1240,10 +1240,8 @@ public object Add(string text) // the file. // Win-win: small binlog playback is fast and large binlog playback // doesn't OOM. - if (text.Length <= StringSizeThreshold && totalAllocatedShortStrings < 2_000_000_000) + if (text.Length <= StringSizeThreshold && totalAllocatedShortStrings < 1_000_000_000) { - // note that we write strings in UTF8 so we don't need to multiply by 2 as chars - // will be 1 byte on average totalAllocatedShortStrings += text.Length; return text; } From 9ba9c9e767e2d091b25b27d8e8287386b1021e34 Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Sat, 16 Jan 2021 19:18:35 -0800 Subject: [PATCH 5/6] Introduce a RedirectionScope in BuildEventArgsWriter This avoids manually switching from currentRecordWriter to originalBinaryWriter in three different places. It's also easier this way to find the places where the switch happens. --- .../BinaryLogger/BuildEventArgsWriter.cs | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs index 8c1ba1abaed..e35e6043b0f 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs @@ -200,13 +200,37 @@ public void WriteBlob(BinaryLogRecordKind kind, byte[] bytes) { // write the blob directly to the underlying writer, // bypassing the memory stream - binaryWriter = originalBinaryWriter; + using var redirection = RedirectWritesToOriginalWriter(); Write(kind); Write(bytes.Length); Write(bytes); + } + + /// + /// Switches the binaryWriter used by the Write* methods to the direct underlying stream writer + /// until the disposable is disposed. Useful to bypass the currentRecordWriter to write a string, + /// blob or NameValueRecord that should precede the record being currently written. + /// + private IDisposable RedirectWritesToOriginalWriter() + { + binaryWriter = originalBinaryWriter; + return new RedirectionScope(this); + } + + private struct RedirectionScope : IDisposable + { + private readonly BuildEventArgsWriter _writer; - binaryWriter = currentRecordWriter; + public RedirectionScope(BuildEventArgsWriter buildEventArgsWriter) + { + _writer = buildEventArgsWriter; + } + + public void Dispose() + { + _writer.binaryWriter = _writer.currentRecordWriter; + } } private void Write(BuildStartedEventArgs e) @@ -833,7 +857,7 @@ private void WriteNameValueListRecord() // Switch the binaryWriter used by the Write* methods to the direct underlying stream writer. // We want this record to precede the record we're currently writing to currentRecordWriter // which is backed by a MemoryStream buffer - binaryWriter = this.originalBinaryWriter; + using var redirectionScope = RedirectWritesToOriginalWriter(); Write(BinaryLogRecordKind.NameValueList); Write(nameValueIndexListBuffer.Count); @@ -843,9 +867,6 @@ private void WriteNameValueListRecord() Write(kvp.Key); Write(kvp.Value); } - - // switch back to continue writing the current record to the memory stream - binaryWriter = this.currentRecordWriter; } /// @@ -954,16 +975,10 @@ private void WriteDeduplicatedString(string text) private void WriteStringRecord(string text) { - // Switch the binaryWriter used by the Write* methods to the direct underlying stream writer. - // We want this record to precede the record we're currently writing to currentRecordWriter - // which is backed by a MemoryStream buffer - binaryWriter = this.originalBinaryWriter; + using var redirectionScope = RedirectWritesToOriginalWriter(); Write(BinaryLogRecordKind.String); binaryWriter.Write(text); - - // switch back to continue writing the current record to the memory stream - binaryWriter = this.currentRecordWriter; } private void Write(DateTime timestamp) From 77a4ff36bfa49cff3a7f8d4f9f92708ea277c337 Mon Sep 17 00:00:00 2001 From: Kirill Osenkov Date: Sat, 16 Jan 2021 19:21:02 -0800 Subject: [PATCH 6/6] Clear the list unconditionally. --- .../Logging/BinaryLogger/BuildEventArgsWriter.cs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs index e35e6043b0f..3b37d1404cf 100644 --- a/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs +++ b/src/Build/Logging/BinaryLogger/BuildEventArgsWriter.cs @@ -725,10 +725,7 @@ private void Write(ITaskItem item) { WriteDeduplicatedString(item.ItemSpec); - if (nameValueListBuffer.Count > 0) - { - nameValueListBuffer.Clear(); - } + nameValueListBuffer.Clear(); IDictionary customMetadata = item.CloneCustomMetadata(); @@ -768,10 +765,7 @@ private void WriteProperties(IEnumerable properties) return; } - if (nameValueListBuffer.Count > 0) - { - nameValueListBuffer.Clear(); - } + nameValueListBuffer.Clear(); // there are no guarantees that the properties iterator won't change, so // take a snapshot and work with the readonly copy @@ -806,10 +800,7 @@ private void Write(BuildEventContext buildEventContext) private void Write(IEnumerable> keyValuePairs) { - if (nameValueListBuffer.Count > 0) - { - nameValueListBuffer.Clear(); - } + nameValueListBuffer.Clear(); if (keyValuePairs != null) {