Skip to content

Commit

Permalink
Fix inconsistency in DataFrameColumns Clone API implementation (#7100)
Browse files Browse the repository at this point in the history
* Rename all private Clone methods to CloneInternal to avoid ambiguity, make base Clone method not virtual and CloneImplementation abstract

* Final step

* Polishing

* Remove not required changes

* Undo some not necessary changes

* Minor renaming

* Rerun unit tests

* Fix comments
  • Loading branch information
asmirnov82 committed Apr 5, 2024
1 parent 0fd58cb commit 07eb681
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 128 deletions.
13 changes: 11 additions & 2 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Expand Up @@ -202,14 +202,21 @@ public void SetName(string newName)
/// <param name="length">The new length of the column</param>
protected internal virtual void Resize(long length) => throw new NotImplementedException();

/// <summary>
/// Clone column to produce a copy
/// </summary>
/// <param name="numberOfNullsToAppend"></param>
/// <returns>A new <see cref="DataFrameColumn"/></returns>
public DataFrameColumn Clone(long numberOfNullsToAppend = 0) => CloneImplementation(numberOfNullsToAppend);

/// <summary>
/// Clone column to produce a copy potentially changing the order of values by supplying mapIndices and an invert flag
/// </summary>
/// <param name="mapIndices"></param>
/// <param name="invertMapIndices"></param>
/// <param name="numberOfNullsToAppend"></param>
/// <returns>A new <see cref="DataFrameColumn"/></returns>
public virtual DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0) => CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);
public DataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0) => CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);

/// <summary>
/// Clone column to produce a copy potentially changing the order of values by supplying mapIndices and an invert flag
Expand All @@ -218,7 +225,9 @@ public void SetName(string newName)
/// <param name="invertMapIndices"></param>
/// <param name="numberOfNullsToAppend"></param>
/// <returns>A new <see cref="DataFrameColumn"/></returns>
protected virtual DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend) => throw new NotImplementedException();
protected abstract DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend);

protected abstract DataFrameColumn CloneImplementation(long numberOfNullsToAppend = 0);

/// <summary>
/// Returns a copy of this column sorted by its values
Expand Down
Expand Up @@ -62,7 +62,6 @@ public ArrowStringDataFrameColumn(string name, ReadOnlyMemory<byte> values, Read
_nullBitMapBuffers.Add(nullBitMapBuffer);

_nullCount = nullCount;

}

private long _nullCount;
Expand Down Expand Up @@ -371,8 +370,32 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int
/// <inheritdoc/>
public override DataFrameColumn Sort(bool ascending = true) => throw new NotSupportedException();

public new ArrowStringDataFrameColumn Clone(long numberOfNullsToAppend = 0)
{
return (ArrowStringDataFrameColumn)CloneImplementation(numberOfNullsToAppend);
}

public new ArrowStringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
return (ArrowStringDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);
}

/// <inheritdoc/>
public override DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend)
{
var ret = new ArrowStringDataFrameColumn(Name);

for (long i = 0; i < Length; i++)
ret.Append(IsValid(i) ? GetBytes(i) : default(ReadOnlySpan<byte>));

for (long i = 0; i < numberOfNullsToAppend; i++)
ret.Append(default);

return ret;
}

/// <inheritdoc/>
protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
ArrowStringDataFrameColumn clone;
if (!(mapIndices is null))
Expand All @@ -381,27 +404,28 @@ public override DataFrameColumn Clone(DataFrameColumn mapIndices = null, bool in
if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool))
throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices));
if (mapIndices.DataType == typeof(long))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
else if (dataType == typeof(int))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
else
clone = Clone(mapIndices as PrimitiveDataFrameColumn<bool>);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<bool>);

for (long i = 0; i < numberOfNullsToAppend; i++)
clone.Append(default);
}
else
{
clone = Clone();
}
for (long i = 0; i < numberOfNullsToAppend; i++)
{
clone.Append(default);
clone = Clone(numberOfNullsToAppend);
}

return clone;
}

private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn<bool> boolColumn)
private ArrowStringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn<bool> boolColumn)
{
if (boolColumn.Length > Length)
throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn));

ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < boolColumn.Length; i++)
{
Expand All @@ -412,49 +436,26 @@ private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn<bool> boolColu
return ret;
}

private ArrowStringDataFrameColumn CloneImplementation<U>(PrimitiveDataFrameColumn<U> mapIndices, bool invertMapIndices = false)
private ArrowStringDataFrameColumn CloneImplementation<U>(PrimitiveDataFrameColumn<U> mapIndices, bool invertMapIndices)
where U : unmanaged
{
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);

mapIndices.ApplyElementwise((U? mapIndex, long rowIndex) =>
{
if (mapIndex == null)
{
ret.Append(default);
return mapIndex;
}
if (invertMapIndices)
{
long index = mapIndices.Length - 1 - rowIndex;
ret.Append(IsValid(index) ? GetBytes(index) : default(ReadOnlySpan<byte>));
}
else
{
ret.Append(IsValid(rowIndex) ? GetBytes(rowIndex) : default(ReadOnlySpan<byte>));
}
long index = invertMapIndices ? mapIndices.Length - 1 - rowIndex : rowIndex;
ret.Append(IsValid(index) ? GetBytes(index) : default(ReadOnlySpan<byte>));
return mapIndex;
});
return ret;
}

private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn<long> mapIndices = null, bool invertMapIndex = false)
{
if (mapIndices is null)
{
ArrowStringDataFrameColumn ret = new ArrowStringDataFrameColumn(Name);
for (long i = 0; i < Length; i++)
{
ret.Append(IsValid(i) ? GetBytes(i) : default(ReadOnlySpan<byte>));
}
return ret;
}
else
return CloneImplementation(mapIndices, invertMapIndex);
}

private ArrowStringDataFrameColumn Clone(PrimitiveDataFrameColumn<int> mapIndices, bool invertMapIndex = false)
{
return CloneImplementation(mapIndices, invertMapIndex);
return ret;
}

/// <inheritdoc/>
Expand Down
Expand Up @@ -250,6 +250,28 @@ private PrimitiveDataFrameColumn<long> GetSortIndices(Comparer<string> comparer,
}

public new StringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend)
{
return (StringDataFrameColumn)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);
}

public new StringDataFrameColumn Clone(long numberOfNullsToAppend = 0)
{
return (StringDataFrameColumn)CloneImplementation(numberOfNullsToAppend);
}

protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend)
{
StringDataFrameColumn ret = new StringDataFrameColumn(Name, Length);
for (long i = 0; i < Length; i++)
ret[i] = this[i];

for (long i = 0; i < numberOfNullsToAppend; i++)
ret.Append(null);

return ret;
}

protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
StringDataFrameColumn clone;
if (!(mapIndices is null))
Expand All @@ -258,29 +280,24 @@ public new StringDataFrameColumn Clone(DataFrameColumn mapIndices, bool invertMa
if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool))
throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices));
if (mapIndices.DataType == typeof(long))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
else if (dataType == typeof(int))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
else
clone = Clone(mapIndices as PrimitiveDataFrameColumn<bool>);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<bool>);

for (long i = 0; i < numberOfNullsToAppend; i++)
clone.Append(null);
}
else
{
clone = Clone();
clone = Clone(numberOfNullsToAppend);
}
for (long i = 0; i < numberOfNullsToAppend; i++)
{
clone.Append(null);
}
return clone;
}

protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
return Clone(mapIndices, invertMapIndices, numberOfNullsToAppend);
return clone;
}

private StringDataFrameColumn Clone(PrimitiveDataFrameColumn<bool> boolColumn)
private StringDataFrameColumn CloneImplementation(PrimitiveDataFrameColumn<bool> boolColumn)
{
if (boolColumn.Length > Length)
throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn));
Expand Down Expand Up @@ -375,28 +392,6 @@ private StringDataFrameColumn CloneImplementation<U>(PrimitiveDataFrameColumn<U>
return ret;
}

private StringDataFrameColumn Clone(PrimitiveDataFrameColumn<long> mapIndices = null, bool invertMapIndex = false)
{
if (mapIndices is null)
{
StringDataFrameColumn ret = new StringDataFrameColumn(Name, Length);
for (long i = 0; i < Length; i++)
{
ret[i] = this[i];
}
return ret;
}
else
{
return CloneImplementation(mapIndices, invertMapIndex);
}
}

private StringDataFrameColumn Clone(PrimitiveDataFrameColumn<int> mapIndices, bool invertMapIndex = false)
{
return CloneImplementation(mapIndices, invertMapIndex);
}

internal static DataFrame ValueCountsImplementation(Dictionary<string, ICollection<long>> groupedValues)
{
StringDataFrameColumn keys = new StringDataFrameColumn("Values", 0);
Expand Down
Expand Up @@ -5,7 +5,6 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using Microsoft.ML;
Expand Down Expand Up @@ -210,7 +209,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D
}
}

private VBufferDataFrameColumn<T> Clone(PrimitiveDataFrameColumn<bool> boolColumn)
private VBufferDataFrameColumn<T> CloneImplementation(PrimitiveDataFrameColumn<bool> boolColumn)
{
if (boolColumn.Length > Length)
throw new ArgumentException(Strings.MapIndicesExceedsColumnLength, nameof(boolColumn));
Expand All @@ -224,28 +223,6 @@ private VBufferDataFrameColumn<T> Clone(PrimitiveDataFrameColumn<bool> boolColum
return ret;
}

private VBufferDataFrameColumn<T> Clone(PrimitiveDataFrameColumn<long> mapIndices = null, bool invertMapIndex = false)
{
if (mapIndices is null)
{
VBufferDataFrameColumn<T> ret = new VBufferDataFrameColumn<T>(Name, Length);
for (long i = 0; i < Length; i++)
{
ret[i] = this[i];
}
return ret;
}
else
{
return CloneImplementation(mapIndices, invertMapIndex);
}
}

private VBufferDataFrameColumn<T> Clone(PrimitiveDataFrameColumn<int> mapIndices, bool invertMapIndex = false)
{
return CloneImplementation(mapIndices, invertMapIndex);
}

private VBufferDataFrameColumn<T> CloneImplementation<U>(PrimitiveDataFrameColumn<U> mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
where U : unmanaged
{
Expand Down Expand Up @@ -314,6 +291,16 @@ private VBufferDataFrameColumn<T> CloneImplementation<U>(PrimitiveDataFrameColum
}

public new VBufferDataFrameColumn<T> Clone(DataFrameColumn mapIndices, bool invertMapIndices, long numberOfNullsToAppend)
{
return (VBufferDataFrameColumn<T>)CloneImplementation(mapIndices, invertMapIndices, numberOfNullsToAppend);
}

public new VBufferDataFrameColumn<T> Clone(long numberOfNullsToAppend = 0)
{
return (VBufferDataFrameColumn<T>)CloneImplementation(numberOfNullsToAppend);
}

protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
{
VBufferDataFrameColumn<T> clone;
if (!(mapIndices is null))
Expand All @@ -322,11 +309,11 @@ public new VBufferDataFrameColumn<T> Clone(DataFrameColumn mapIndices, bool inve
if (dataType != typeof(long) && dataType != typeof(int) && dataType != typeof(bool))
throw new ArgumentException(String.Format(Strings.MultipleMismatchedValueType, typeof(long), typeof(int), typeof(bool)), nameof(mapIndices));
if (mapIndices.DataType == typeof(long))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<long>, invertMapIndices);
else if (dataType == typeof(int))
clone = Clone(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<int>, invertMapIndices);
else
clone = Clone(mapIndices as PrimitiveDataFrameColumn<bool>);
clone = CloneImplementation(mapIndices as PrimitiveDataFrameColumn<bool>);
}
else
{
Expand All @@ -336,9 +323,14 @@ public new VBufferDataFrameColumn<T> Clone(DataFrameColumn mapIndices, bool inve
return clone;
}

protected override DataFrameColumn CloneImplementation(DataFrameColumn mapIndices = null, bool invertMapIndices = false, long numberOfNullsToAppend = 0)
protected override DataFrameColumn CloneImplementation(long numberOfNullsToAppend)
{
return Clone(mapIndices, invertMapIndices, numberOfNullsToAppend);
var ret = new VBufferDataFrameColumn<T>(Name, Length);

for (long i = 0; i < Length; i++)
ret[i] = this[i];

return ret;
}

private static VectorDataViewType GetDataViewType()
Expand Down

0 comments on commit 07eb681

Please sign in to comment.