Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to merge using multiple columns in JOIN condition #5838

Merged
merged 27 commits into from Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c527c21
fix #5767 issue with DataFrame Merge method
Apr 26, 2021
a38fdf5
Merge pull request #1 from dotnet/main
Apr 29, 2021
e5f5eeb
Merge pull request #2 from dotnet/main
May 27, 2021
0580710
#5820 Extend DataFrame GroupBy operations
May 28, 2021
f7658b2
#5820 fix code review findings
May 29, 2021
dc8daf0
Avoid code duplication in Merge DataFrame method (#5657)
Jun 2, 2021
b36fbee
Merge pull request #4 from dotnet/main
Jun 4, 2021
a4735ce
Add non generic DataFrame Merge method (#5657)
Jun 4, 2021
7893bfd
Add support for multi columns join in DataFrame (#5657)
Jun 4, 2021
28fcda3
Fix failing tests for inner, left and right joins with nulls
Jun 5, 2021
5da5a37
#5657 fix DataFrame outer join failing tests
Jun 7, 2021
040f6bb
rebase to the latest main
May 29, 2021
c1b7969
Avoid code duplication in Merge DataFrame method (#5657)
Jun 2, 2021
bd22036
Add non generic DataFrame Merge method (#5657)
Jun 4, 2021
d2d5f36
Add support for multi columns join in DataFrame (#5657)
Jun 4, 2021
0494f7c
Fix failing tests for inner, left and right joins with nulls
Jun 5, 2021
976351c
#5657 fix DataFrame outer join failing tests
Jun 7, 2021
b5bff6c
#5657 fix merge issues
Jun 7, 2021
cbd02dc
Merge branch 'feature/5657_dataframe_merge_multiple_columns' of https…
Jun 7, 2021
1940f3a
Minor changes #5657
Jun 7, 2021
4232fee
Add self explanatory exception text (#5657)
Jun 7, 2021
33a99c3
Add Asserts to new unit tests (#5657)
Jun 7, 2021
12a5833
Minor changes (#5657)
Jun 7, 2021
22fc3e2
Fix right merge by 3-columns test fails
Jun 8, 2021
989711b
Merge pull request #5 from asmirnov82/feature/5657_dataframe_merge_mu…
Jun 8, 2021
2f22cd7
fixed typos (#5657)
Sep 4, 2021
3396554
Merge pull request #6 from dotnet/main
Sep 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs
Expand Up @@ -660,5 +660,10 @@ public ArrowStringDataFrameColumn Apply(Func<string, string> func)
}
return ret;
}

/// <inheritdoc/>
// String columns look their values up using string keys.
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    => GetGroupedOccurrences<string>(other, out otherColumnNullIndices);
}
}
336 changes: 179 additions & 157 deletions src/Microsoft.Data.Analysis/DataFrame.Join.cs

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Expand Up @@ -210,6 +210,44 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// <returns>A mapping of value(<typeparamref name="TKey"/>) to the indices containing this value</returns>
public virtual Dictionary<TKey, ICollection<long>> GroupColumnValues<TKey>(out HashSet<long> nullIndices) => throw new NotImplementedException();

/// <summary>
/// Gets the occurrences of each value from this column in the other column, grouped by the row index of that value in this column
/// </summary>
/// <param name="other">The column in which the values of this column are looked up</param>
/// <param name="otherColumnNullIndices">When the method returns, contains the row indices of the null values in <paramref name="other"/></param>
/// <returns>A mapping of index from this column to the indices of same value in other column</returns>
public abstract Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices);

/// <summary>
/// Gets the occurrences of each value from this column in the other column, grouped by the row index of that value in this column
/// </summary>
/// <typeparam name="TKey">The type the values of this column are cast to when they are used as lookup keys</typeparam>
/// <param name="other">The column in which the values of this column are looked up. It must have the same <see cref="DataType"/> as this column</param>
/// <param name="otherColumnNullIndices">When the method returns, contains the null indices reported by grouping the values of <paramref name="other"/></param>
/// <returns>A mapping of index from this column to the indices of same value in other column. Rows of this column that are null or have no match in <paramref name="other"/> are not present in the mapping</returns>
/// <exception cref="ArgumentException">The data type of <paramref name="other"/> differs from the data type of this column</exception>
protected Dictionary<long, ICollection<long>> GetGroupedOccurrences<TKey>(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
{
    if (this.DataType != other.DataType)
        throw new ArgumentException(String.Format(Strings.MismatchedColumnValueType, this.DataType), nameof(other));

    // First hash other column
    Dictionary<TKey, ICollection<long>> multimap = other.GroupColumnValues<TKey>(out otherColumnNullIndices);

    var ret = new Dictionary<long, ICollection<long>>();

    // For each value in this column find rows from other column with equal value.
    // Row indices are 64-bit throughout this API, so the counter is a long: an int counter
    // would wrap around to a negative index on a column with more than int.MaxValue rows.
    for (long i = 0; i < this.Length; i++)
    {
        var value = this[i];
        if (value != null && multimap.TryGetValue((TKey)value, out ICollection<long> otherRowIndices))
        {
            ret.Add(i, otherRowIndices);
        }
    }

    return ret;
}

/// <summary>
/// Returns a DataFrame containing counts of unique values
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Expand Up @@ -810,5 +810,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor
{
return cursor.GetGetter<T>(schemaColumn);
}

/// <inheritdoc/>
// Primitive columns look their values up using keys of the column's element type T.
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    => GetGroupedOccurrences<T>(other, out otherColumnNullIndices);
}
}
5 changes: 5 additions & 0 deletions src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Expand Up @@ -511,5 +511,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor
{
return cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
}

/// <inheritdoc/>
// String columns look their values up using string keys.
public override Dictionary<long, ICollection<long>> GetGroupedOccurrences(DataFrameColumn other, out HashSet<long> otherColumnNullIndices)
    => GetGroupedOccurrences<string>(other, out otherColumnNullIndices);
}
}
11 changes: 10 additions & 1 deletion src/Microsoft.Data.Analysis/Strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/Microsoft.Data.Analysis/Strings.resx
Expand Up @@ -201,6 +201,9 @@
<data name="MapIndicesExceedsColumnLenth" xml:space="preserve">
<value>MapIndices exceeds column length</value>
</data>
<data name="MismatchedArrayLengths" xml:space="preserve">
<value>Array lengths are mismatched</value>
</data>
<data name="MismatchedColumnLengths" xml:space="preserve">
<value>Column lengths are mismatched</value>
</data>
Expand Down
Expand Up @@ -91,7 +91,6 @@ public void TestGroupingWithTKeyPrimitiveType()
[Fact]
public void TestGroupingWithTKeyOfWrongType()
{

var message = string.Empty;

//Create test dataframe (numbers starting from 0 up to length)
Expand Down