diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index 2085304925..afe3be4282 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -202,6 +202,13 @@ public void SetName(string newName) /// The new length of the column protected internal virtual void Resize(long length) => throw new NotImplementedException(); + /// + /// Clone column to produce a copy + /// + /// + /// A new + public DataFrameColumn Clone(long numberOfNullsToAppend = 0) => CloneImplementation(numberOfNullsToAppend); + /// /// Clone column to produce a copy potentially changing the order of values by supplying mapIndices and an invert flag /// @@ -209,7 +216,7 @@ public void SetName(string newName) /// /// /// A new - public virtual DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0) => CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); + public DataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) => CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); /// /// Clone column to produce a copy potentially changing the order of values by supplying mapIndices and an invert flag @@ -218,7 +225,9 @@ public void SetName(string newName) /// /// /// A new - protected virtual DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) => throw new NotImplementedException(); + protected abstract DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend); + + protected abstract DataFrameColumn CloneImplementation(long numberOfNullsToAppend = 0); /// /// Returns a copy of this column sorted by its values diff --git a/src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs 
b/src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs index 49f88eab6b..ac14541549 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs @@ -62,7 +62,6 @@ public ArrowStringDataFrameColumn(string name, ReadOnlyMemory values, Read _nullBitMapBuffers.Add(nullBitMapBuffer); _nullCount = nullCount; - } private long _nullCount; @@ -371,8 +370,32 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int /// public override DataFrameColumn Sort(bool ascending = true) => throw new NotSupportedException(); + public new ArrowStringDataFrameColumn Clone(long numberOfNullsToAppend = 0) + { + return (ArrowStringDataFrameColumn)CloneImplementation(numberOfNullsToAppend); + } + + public new ArrowStringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) + { + return (ArrowStringDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); + } + /// - public override DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0) + protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend) + { + var ret = new ArrowStringDataFrameColumn(Name); + + for (long i = 0; i < Length; i++) + ret.Append(IsValid(i) ? 
GetBytes(i) : default(ReadOnlySpan)); + + for (long i = 0; i < numberOfNullsToAppend; i++) + ret.Append(default); + + return ret; + } + + /// + protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) { ArrowStringDataFrameColumn clone; if (!(mapIndices is null)) @@ -381,27 +404,28 @@ public override DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool in if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool)) throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices)); if (mapIndices.DataType == typeof(long)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else if (dataType == typeof(int)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else - clone = Clone(mapIndices as PrimitiveDataFrameColumn); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn); + + for (long i = 0; i < numberOfNullsToAppend; i++) + clone.Append(default); } else { - clone = Clone(); - } - for (long i = 0; i < numberOfNullsToAppend; i++) - { - clone.Append(default); + clone = Clone(numberOfNullsToAppend); } + return clone; } - private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn boolColumn) + private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn boolColumn) { if (boolColumn.Length > Length) throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn)); + ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name); for (long i = 0; i < boolColumn.Length; i++) { @@ -412,10 +436,11 @@ private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn boolColu return ret; } - 
private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn mapIndices, bool invertMapIndices = false) + private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn mapIndices, bool invertMapIndices) where U : unmanaged { ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name); + mapIndices.ApplyElementwise((U? mapIndex, long rowIndex) => { if (mapIndex == null) @@ -423,38 +448,14 @@ private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColu ret.Append(default); return mapIndex; } - if (invertMapIndices) - { - long index = mapIndices.Length - 1 - rowIndex; - ret.Append(IsValid(index) ? GetBytes(index) : default(ReadOnlySpan)); - } - else - { - ret.Append(IsValid(rowIndex) ? GetBytes(rowIndex) : default(ReadOnlySpan)); - } + + long index = invertMapIndices ? mapIndices.Length - 1 - rowIndex : rowIndex; + ret.Append(IsValid(index) ? GetBytes(index) : default(ReadOnlySpan)); + return mapIndex; }); - return ret; - } - private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices = null, bool invertMapIndex = false) - { - if (mapIndices is null) - { - ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name); - for (long i = 0; i < Length; i++) - { - ret.Append(IsValid(i) ? 
GetBytes(i) : default(ReadOnlySpan)); - } - return ret; - } - else - return CloneImplementation(mapIndices, invertMapIndex); - } - - private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices, bool invertMapIndex = false) - { - return CloneImplementation(mapIndices, invertMapIndex); + return ret; } /// diff --git a/src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs index 59ded9765b..e77a71e6f0 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs @@ -250,6 +250,28 @@ private PrimitiveDataFrameColumn GetSortIndices(Comparer comparer, } public new StringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) + { + return (StringDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); + } + + public new StringDataFrameColumn Clone(long numberOfNullsToAppend = 0) + { + return (StringDataFrameColumn)CloneImplementation(numberOfNullsToAppend); + } + + protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend) + { + StringDataFrameColumn ret = new StringDataFrameColumn(Name, Length); + for (long i = 0; i < Length; i++) + ret[i] = this[i]; + + for (long i = 0; i < numberOfNullsToAppend; i++) + ret.Append(null); + + return ret; + } + + protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) { StringDataFrameColumn clone; if (!(mapIndices is null)) @@ -258,29 +280,24 @@ public new StringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMa if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool)) throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices)); if 
(mapIndices.DataType == typeof(long)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else if (dataType == typeof(int)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else - clone = Clone(mapIndices as PrimitiveDataFrameColumn); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn); + + for (long i = 0; i < numberOfNullsToAppend; i++) + clone.Append(null); } else { - clone = Clone(); + clone = Clone(numberOfNullsToAppend); } - for (long i = 0; i < numberOfNullsToAppend; i++) - { - clone.Append(null); - } - return clone; - } - protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0) - { - return Clone(mapIndices, invertMapIndices, numberOfNullsToAppend); + return clone; } - private StringDataFrameColumn Clone(PrimitiveDataFrameColumn boolColumn) + private StringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn boolColumn) { if (boolColumn.Length > Length) throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn)); @@ -375,28 +392,6 @@ private StringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn return ret; } - private StringDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices = null, bool invertMapIndex = false) - { - if (mapIndices is null) - { - StringDataFrameColumn ret = new StringDataFrameColumn(Name, Length); - for (long i = 0; i < Length; i++) - { - ret[i] = this[i]; - } - return ret; - } - else - { - return CloneImplementation(mapIndices, invertMapIndex); - } - } - - private StringDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices, bool invertMapIndex = false) - { - return CloneImplementation(mapIndices, invertMapIndex); - } - internal static DataFrame 
ValueCountsImplementation(Dictionary> groupedValues) { StringDataFrameColumn keys = new StringDataFrameColumn("Values", 0); diff --git a/src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs index fab98ee005..4c190d5cbe 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs @@ -5,7 +5,6 @@ using System; using System.Collections; using System.Collections.Generic; -using System.Data; using System.Diagnostics; using System.Runtime.CompilerServices; using Microsoft.ML; @@ -210,7 +209,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D } } - private VBufferDataFrameColumn Clone(PrimitiveDataFrameColumn boolColumn) + private VBufferDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn boolColumn) { if (boolColumn.Length > Length) throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn)); @@ -224,28 +223,6 @@ private VBufferDataFrameColumn Clone(PrimitiveDataFrameColumn boolColum return ret; } - private VBufferDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices = null, bool invertMapIndex = false) - { - if (mapIndices is null) - { - VBufferDataFrameColumn ret = new VBufferDataFrameColumn(Name, Length); - for (long i = 0; i < Length; i++) - { - ret[i] = this[i]; - } - return ret; - } - else - { - return CloneImplementation(mapIndices, invertMapIndex); - } - } - - private VBufferDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices, bool invertMapIndex = false) - { - return CloneImplementation(mapIndices, invertMapIndex); - } - private VBufferDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) where U : unmanaged { @@ -314,6 +291,16 @@ private VBufferDataFrameColumn CloneImplementation(PrimitiveDataFrameColum } public 
new VBufferDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) + { + return (VBufferDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); + } + + public new VBufferDataFrameColumn Clone(long numberOfNullsToAppend = 0) + { + return (VBufferDataFrameColumn)CloneImplementation(numberOfNullsToAppend); + } + + protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) { VBufferDataFrameColumn clone; if (!(mapIndices is null)) @@ -322,11 +309,11 @@ public new VBufferDataFrameColumn Clone(DataFrameColumn mapIndices, bool inve if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool)) throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices)); if (mapIndices.DataType == typeof(long)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else if (dataType == typeof(int)) - clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else - clone = Clone(mapIndices as PrimitiveDataFrameColumn); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn); } else { @@ -336,9 +323,14 @@ public new VBufferDataFrameColumn Clone(DataFrameColumn mapIndices, bool inve return clone; } - protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0) + protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend) { - return Clone(mapIndices, invertMapIndices, numberOfNullsToAppend); + var ret = new VBufferDataFrameColumn(Name, Length); + + for (long i = 0; i < Length; i++) + ret[i] = 
this[i]; + + return ret; } private static VectorDataViewType GetDataViewType() diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index f29f4963d7..3d3f740318 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -63,8 +63,7 @@ public IEnumerable> GetReadOnlyDataBuffers() { for (int i = 0; i < _columnContainer.Buffers.Count; i++) { - ReadOnlyDataFrameBuffer buffer = _columnContainer.Buffers[i]; - yield return buffer.ReadOnlyMemory; + yield return _columnContainer.Buffers[i].ReadOnlyMemory; } } @@ -364,13 +363,29 @@ public override bool HasDescription() } /// - /// Returns a clone of this column + /// Returns a clone of this column. + /// + /// + /// + public new PrimitiveDataFrameColumn Clone(long numberOfNullsToAppend = 0) + { + return (PrimitiveDataFrameColumn)CloneImplementation(numberOfNullsToAppend); + } + + /// + /// Returns a clone of this column. 
/// /// A column who values are used as indices /// /// /// - public new PrimitiveDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) + public new PrimitiveDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) + { + return (PrimitiveDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend); + } + + /// + protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) { PrimitiveDataFrameColumn clone; if (!(mapIndices is null)) @@ -383,24 +398,31 @@ public new PrimitiveDataFrameColumn Clone(DataFrameColumn mapIndices, bool in else if (dataType == typeof(int)) clone = Clone(mapIndices as PrimitiveDataFrameColumn, invertMapIndices); else - clone = Clone(mapIndices as PrimitiveDataFrameColumn); + clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn); + + if (numberOfNullsToAppend != 0) + clone.AppendMany(null, numberOfNullsToAppend); } else { - clone = Clone(); + clone = Clone(numberOfNullsToAppend); } - Debug.Assert(!ReferenceEquals(clone, null)); - clone.AppendMany(null, numberOfNullsToAppend); + return clone; } - /// - protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) + protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend) { - return Clone(mapIndices, invertMapIndices, numberOfNullsToAppend); + var newColumnContainer = _columnContainer.Clone(); + var clone = CreateNewColumn(Name, newColumnContainer); + + if (numberOfNullsToAppend != 0) + clone.AppendMany(null, numberOfNullsToAppend); + + return clone; } - private PrimitiveDataFrameColumn Clone(PrimitiveDataFrameColumn boolColumn) + private PrimitiveDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn boolColumn) { if (boolColumn.Length > Length) throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, 
nameof(boolColumn)); @@ -436,21 +458,19 @@ private PrimitiveDataFrameColumn CloneImplementation(PrimitiveDataFrameCol return ret; } - public PrimitiveDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices = null, bool invertMapIndices = false) + public PrimitiveDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices, bool invertMapIndices = false) { if (mapIndices is null) - { - PrimitiveColumnContainer newColumnContainer = _columnContainer.Clone(); - return CreateNewColumn(Name, newColumnContainer); - } - else - { - return CloneImplementation(mapIndices, invertMapIndices); - } + return Clone(); + + return CloneImplementation(mapIndices, invertMapIndices); } public PrimitiveDataFrameColumn Clone(PrimitiveDataFrameColumn mapIndices, bool invertMapIndices = false) { + if (mapIndices is null) + return Clone(); + return CloneImplementation(mapIndices, invertMapIndices); }