Skip to content

Commit

Permalink
Add ability to merge using multiple columns in JOIN condition (#5838)
Browse files Browse the repository at this point in the history
* fix #5767 issue with DataFrame Merge method

* #5820 Extend DataFrame GroupBy operations

* #5820 fix code review findings

* Avoid code duplication in Merge DataFrame method (#5657)

* Add non generic DataFrame Merge method (#5657)

* Add support for multi columns join in DataFrame (#5657)

* Fix failing tests for inner, left and right joins with nulls

* #5657 fix DataFrame outer join failing tests

* rebase to the latest main

* Avoid code duplication in Merge DataFrame method (#5657)

* Add non generic DataFrame Merge method (#5657)

* Add support for multi columns join in DataFrame (#5657)

* Fix failing tests for inner, left and right joins with nulls

* #5657 fix DataFrame outer join failing tests

* #5657 fix merge issues

* Minor changes #5657

* Add self explanatory exception text (#5657)

* Add Asserts to new unit tests (#5657)

* Minor changes (#5657)

* Fix right merge by 3-columns test fails

* fixed typos (#5657)
  • Loading branch information
Aleksei Smirnov committed Sep 28, 2021
1 parent a5e6cd3 commit d13045f
Show file tree
Hide file tree
Showing 9 changed files with 681 additions and 183 deletions.
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs
Expand Up @@ -660,5 +660,10 @@ public ArrowStringDataFrameColumn Apply(Func<string, string> func)
}
return ret;
}

/// <inheritdoc/>
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    // Values of this column are looked up using string keys.
    => GetGroupedOccurrences<string>(other, out otherColumnNullIndices);
}
}
336 changes: 179 additions & 157 deletions src/Microsoft.Data.Analysis/DataFrame.Join.cs

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Expand Up @@ -210,6 +210,44 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// <returns>A mapping of value(<typeparamref name="TKey"/>) to the indices containing this value</returns>
public virtual Dictionary<TKey, ICollection<long>> GroupColumnValues<TKey>(out HashSet<long> nullIndices)
{
    // The base class provides no grouping logic: unless a derived column overrides
    // this member, every call fails with NotImplementedException.
    throw new NotImplementedException();
}

/// <summary>
/// Gets the occurrences of each value from this column in the other column, grouped by this value.
/// </summary>
/// <param name="other">The column that is searched for values equal to the values of this column.</param>
/// <param name="otherColumnNullIndices">Receives the set of null indices for <paramref name="other"/>.</param>
/// <returns>A mapping of index from this column to the indices of same value in other column</returns>
public abstract Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices);

/// <summary>
/// Gets the occurrences of each value from this column in the other column, grouped by the row index of this column.
/// </summary>
/// <typeparam name="TKey">The key type used to group the values of <paramref name="other"/>; each non-null value of this column is cast to it for the lookup.</typeparam>
/// <param name="other">The column that is searched for values equal to the values of this column. It must have the same <see cref="DataType"/> as this column.</param>
/// <param name="otherColumnNullIndices">Receives the null indices reported by <paramref name="other"/> when its values are grouped.</param>
/// <returns>
/// A mapping of row index from this column to the indices of the rows holding the same value in the other column.
/// Rows of this column that are null, or whose value does not occur in the other column, have no entry.
/// </returns>
/// <exception cref="ArgumentException">The data type of <paramref name="other"/> differs from the data type of this column.</exception>
protected Dictionary<long, ICollection<long>> GetGroupedOccurrences<TKey>(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
{
    if (this.DataType != other.DataType)
        throw new ArgumentException(String.Format(Strings.MismatchedColumnValueType, this.DataType), nameof(other));

    // First hash other column
    Dictionary<TKey, ICollection<long>> multimap = other.GroupColumnValues<TKey>(out otherColumnNullIndices);

    var ret = new Dictionary<long, ICollection<long>>();

    // For each value in this column find rows from other column with equal value.
    // The index is a long (not an int) because the result is keyed by long row indices
    // and an int counter would wrap to a negative value on a column longer than int.MaxValue.
    for (long i = 0; i < this.Length; i++)
    {
        var value = this[i];
        if (value != null && multimap.TryGetValue((TKey)value, out ICollection<long> otherRowIndices))
        {
            // The collection from the multimap is shared, not copied: rows of this column
            // with equal values map to the same ICollection instance.
            ret.Add(i, otherRowIndices);
        }
    }

    return ret;
}

/// <summary>
/// Returns a DataFrame containing counts of unique values
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Expand Up @@ -810,5 +810,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor
{
return cursor.GetGetter<T>(schemaColumn);
}

/// <inheritdoc/>
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    // Values of this column are looked up using the column's element type T as the key.
    => GetGroupedOccurrences<T>(other, out otherColumnNullIndices);
}
}
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Expand Up @@ -511,5 +511,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor
{
return cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
}

/// <inheritdoc/>
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    // Values of this column are looked up using string keys.
    => GetGroupedOccurrences<string>(other, out otherColumnNullIndices);
}
}
11 changes: 10 additions & 1 deletion src/Microsoft.Data.Analysis/Strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/Microsoft.Data.Analysis/Strings.resx
Expand Up @@ -201,6 +201,9 @@
<data name="MapIndicesExceedsColumnLenth" xml:space="preserve">
<value>MapIndices exceeds column length</value>
</data>
<data name="MismatchedArrayLengths" xml:space="preserve">
<value>Array lengths are mismatched</value>
</data>
<data name="MismatchedColumnLengths" xml:space="preserve">
<value>Column lengths are mismatched</value>
</data>
Expand Down
Expand Up @@ -91,7 +91,6 @@ public void TestGroupingWithTKeyPrimitiveType()
[Fact]
public void TestGroupingWithTKeyOfWrongType()
{

var message = string.Empty;

//Create test dataframe (numbers starting from 0 up to length)
Expand Down

0 comments on commit d13045f

Please sign in to comment.