diff --git a/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs index d05912ca6f..5065009422 100644 --- a/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs @@ -660,5 +660,10 @@ public ArrowStringDataFrameColumn Apply(Func func) } return ret; } + + public override Dictionary> GetGroupedOccurrences(DataFrameColumn other, out HashSet otherColumnNullIndices) + { + return GetGroupedOccurrences(other, out otherColumnNullIndices); + } } } diff --git a/src/Microsoft.Data.Analysis/DataFrame.Join.cs b/src/Microsoft.Data.Analysis/DataFrame.Join.cs index da99e6254f..2af42a5566 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Join.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Join.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Linq; namespace Microsoft.Data.Analysis { @@ -141,10 +142,18 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right return ret; } - // TODO: Merge API with an "On" parameter that merges on a column common to 2 dataframes + private static bool IsAnyNullValueInColumns (IReadOnlyCollection columns, long index) + { + foreach (var column in columns) + { + if (column[index] == null) + return true; + } + return false; + } /// - /// Merge DataFrames with a database style join + /// Merge DataFrames with a database style join (for backward compatibility) /// /// /// @@ -155,207 +164,220 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right /// public DataFrame Merge(DataFrame other, string leftJoinColumn, string rightJoinColumn, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left) { - // A simple hash join - DataFrame ret = new DataFrame(); - DataFrame leftDataFrame = this; - DataFrame rightDataFrame = other; + return Merge(other, new[] { leftJoinColumn }, new[] { 
rightJoinColumn }, leftSuffix, rightSuffix, joinAlgorithm); + } - // The final table size is not known until runtime - long rowNumber = 0; - PrimitiveDataFrameColumn leftRowIndices = new PrimitiveDataFrameColumn("LeftIndices"); - PrimitiveDataFrameColumn rightRowIndices = new PrimitiveDataFrameColumn("RightIndices"); - if (joinAlgorithm == JoinAlgorithm.Left) - { - // First hash other dataframe on the rightJoinColumn - DataFrameColumn otherColumn = other.Columns[rightJoinColumn]; - Dictionary> multimap = otherColumn.GroupColumnValues(out HashSet otherColumnNullIndices); + private static HashSet Merge(DataFrame retainedDataFrame, DataFrame supplementaryDataFrame, string[] retainedJoinColumnNames, string[] supplemetaryJoinColumnNames, out PrimitiveDataFrameColumn retainedRowIndices, out PrimitiveDataFrameColumn supplementaryRowIndices, bool isInner = false, bool calculateIntersection = false) + { + if (retainedJoinColumnNames == null) + throw new ArgumentNullException(nameof(retainedJoinColumnNames)); - // Go over the records in this dataframe and match with the dictionary - DataFrameColumn thisColumn = Columns[leftJoinColumn]; + if (supplemetaryJoinColumnNames == null) + throw new ArgumentNullException(nameof(supplemetaryJoinColumnNames)); - for (long i = 0; i < thisColumn.Length; i++) - { - var thisColumnValue = thisColumn[i]; - if (thisColumnValue != null) - { - if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection rowNumbers)) - { - foreach (long row in rowNumbers) - { - leftRowIndices.Append(i); - rightRowIndices.Append(row); - } - } - else - { - leftRowIndices.Append(i); - rightRowIndices.Append(null); - } - } - else - { - foreach (long row in otherColumnNullIndices) - { - leftRowIndices.Append(i); - rightRowIndices.Append(row); - } - } - } - } - else if (joinAlgorithm == JoinAlgorithm.Right) + if (retainedJoinColumnNames.Length != supplemetaryJoinColumnNames.Length) + throw new ArgumentException(Strings.MismatchedArrayLengths, 
nameof(retainedJoinColumnNames)); + + + HashSet intersection = calculateIntersection ? new HashSet() : null; + + // Get occurrences of values in columns used for join in the retained and supplementary dataframes + Dictionary> occurrences = null; + Dictionary retainedIndicesReverseMapping = null; + + HashSet supplementaryJoinColumnsNullIndices = new HashSet(); + + + for (int colNameIndex = 0; colNameIndex < retainedJoinColumnNames.Length; colNameIndex++) { - DataFrameColumn thisColumn = Columns[leftJoinColumn]; - Dictionary> multimap = thisColumn.GroupColumnValues(out HashSet thisColumnNullIndices); + DataFrameColumn shrinkedRetainedColumn = retainedDataFrame.Columns[retainedJoinColumnNames[colNameIndex]]; - DataFrameColumn otherColumn = other.Columns[rightJoinColumn]; - for (long i = 0; i < otherColumn.Length; i++) + //shrink retained column by row occurrences from previous step + if (occurrences != null) { - var otherColumnValue = otherColumn[i]; - if (otherColumnValue != null) - { - if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection rowNumbers)) - { - foreach (long row in rowNumbers) - { - leftRowIndices.Append(row); - rightRowIndices.Append(i); - } - } - else - { - leftRowIndices.Append(null); - rightRowIndices.Append(i); - } - } - else + //only rows with occurrences from the previous step should go on to further processing + var shrinkedRetainedIndices = occurrences.Keys.ToArray(); + + //create reverse mapping of index of the row in the shrinked column to the index of this row in the original dataframe (new index -> original index) + var newRetainedIndicesReverseMapping = new Dictionary(shrinkedRetainedIndices.Length); + + for (int i = 0; i < shrinkedRetainedIndices.Length; i++) { - foreach (long thisColumnNullIndex in thisColumnNullIndices) - { - leftRowIndices.Append(thisColumnNullIndex); - rightRowIndices.Append(i); - } + //store reverse mapping to restore original dataframe indices from indices in shrinked row + var originalIndex = 
shrinkedRetainedIndices[i]; + newRetainedIndicesReverseMapping.Add(i, originalIndex); } + + retainedIndicesReverseMapping = newRetainedIndicesReverseMapping; + shrinkedRetainedColumn = shrinkedRetainedColumn.Clone(new Int64DataFrameColumn("Indices", shrinkedRetainedIndices)); } - } - else if (joinAlgorithm == JoinAlgorithm.Inner) - { - // Hash the column with the smaller RowCount - long leftRowCount = Rows.Count; - long rightRowCount = other.Rows.Count; + + DataFrameColumn supplementaryColumn = supplementaryDataFrame.Columns[supplemetaryJoinColumnNames[colNameIndex]]; + + //Find occurrences on current step (join column) + var newOccurrences = shrinkedRetainedColumn.GetGroupedOccurrences(supplementaryColumn, out HashSet supplementaryColumnNullIndices); - bool leftColumnIsSmaller = leftRowCount <= rightRowCount; - DataFrameColumn hashColumn = leftColumnIsSmaller ? Columns[leftJoinColumn] : other.Columns[rightJoinColumn]; - DataFrameColumn otherColumn = ReferenceEquals(hashColumn, Columns[leftJoinColumn]) ? 
other.Columns[rightJoinColumn] : Columns[leftJoinColumn]; - Dictionary> multimap = hashColumn.GroupColumnValues(out HashSet smallerDataFrameColumnNullIndices); + //Convert the keys from local indices (in the shrinked column) to indices in the original dataframe + if (retainedIndicesReverseMapping != null) + newOccurrences = newOccurrences.ToDictionary(kvp => retainedIndicesReverseMapping[kvp.Key], kvp => kvp.Value); - for (long i = 0; i < otherColumn.Length; i++) + supplementaryJoinColumnsNullIndices.UnionWith(supplementaryColumnNullIndices); + + // shrink join result on current column by previous join columns (if any) + // (we have to remove occurrences that don't exist in previous columns, because JOIN happens only if ALL left and right columns in JOIN are matched) + if (occurrences != null) { - var otherColumnValue = otherColumn[i]; - if (otherColumnValue != null) - { - if (multimap.TryGetValue((TKey)otherColumnValue, out ICollection rowNumbers)) - { - foreach (long row in rowNumbers) - { - leftRowIndices.Append(leftColumnIsSmaller ? row : i); - rightRowIndices.Append(leftColumnIsSmaller ? i : row); - } - } - } - else + var shrinkedOccurences = new Dictionary>(); + + foreach (var kvp in newOccurrences) { - foreach (long nullIndex in smallerDataFrameColumnNullIndices) + var newValue = kvp.Value.Where(i => occurrences[kvp.Key].Contains(i)).ToArray(); + if (newValue.Any()) { - leftRowIndices.Append(leftColumnIsSmaller ? nullIndex : i); - rightRowIndices.Append(leftColumnIsSmaller ? 
i : nullIndex); + shrinkedOccurences.Add(kvp.Key, newValue); } } + newOccurrences = shrinkedOccurences; } - } - else if (joinAlgorithm == JoinAlgorithm.FullOuter) - { - DataFrameColumn otherColumn = other.Columns[rightJoinColumn]; - Dictionary> multimap = otherColumn.GroupColumnValues(out HashSet otherColumnNullIndices); - Dictionary intersection = new Dictionary(EqualityComparer.Default); - // Go over the records in this dataframe and match with the dictionary - DataFrameColumn thisColumn = Columns[leftJoinColumn]; - Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn("ThisColumnNullIndices"); + occurrences = newOccurrences; + } + + retainedRowIndices = new Int64DataFrameColumn("RetainedIndices"); + supplementaryRowIndices = new Int64DataFrameColumn("SupplementaryIndices"); - for (long i = 0; i < thisColumn.Length; i++) + //Perform Merging + var retainJoinColumns = retainedJoinColumnNames.Select(name => retainedDataFrame.Columns[name]).ToArray(); + for (long i = 0; i < retainedDataFrame.Columns.RowCount; i++) + { + if (!IsAnyNullValueInColumns(retainJoinColumns, i)) { - var thisColumnValue = thisColumn[i]; - if (thisColumnValue != null) + //Get all row indexes from supplementary dataframe that satisfy JOIN condition + if (occurrences.TryGetValue(i, out ICollection rowIndices)) { - if (multimap.TryGetValue((TKey)thisColumnValue, out ICollection rowNumbers)) + foreach (long supplementaryRowIndex in rowIndices) { - foreach (long row in rowNumbers) + retainedRowIndices.Append(i); + supplementaryRowIndices.Append(supplementaryRowIndex); + + //store intersection if required + if (calculateIntersection) { - leftRowIndices.Append(i); - rightRowIndices.Append(row); - if (!intersection.ContainsKey((TKey)thisColumnValue)) + if (!intersection.Contains(supplementaryRowIndex)) { - intersection.Add((TKey)thisColumnValue, rowNumber); + intersection.Add(supplementaryRowIndex); } } } - else - { - leftRowIndices.Append(i); - rightRowIndices.Append(null); - } } else 
{ - thisColumnNullIndices.Append(i); + if (isInner) + continue; + + retainedRowIndices.Append(i); + supplementaryRowIndices.Append(null); } } - for (long i = 0; i < otherColumn.Length; i++) - { - var value = otherColumn[i]; - if (value != null) + else + { + foreach (long row in supplementaryJoinColumnsNullIndices) { - if (!intersection.ContainsKey((TKey)value)) - { - leftRowIndices.Append(null); - rightRowIndices.Append(i); - } + retainedRowIndices.Append(i); + supplementaryRowIndices.Append(row); } } + } + + return intersection; + } + + public DataFrame Merge(DataFrame other, string[] leftJoinColumns, string[] rightJoinColumns, string leftSuffix = "_left", string rightSuffix = "_right", JoinAlgorithm joinAlgorithm = JoinAlgorithm.Left) + { + if (other == null) + throw new ArgumentNullException(nameof(other)); - // Now handle the null rows - foreach (long? thisColumnNullIndex in thisColumnNullIndices) - { - foreach (long otherColumnNullIndex in otherColumnNullIndices) - { - leftRowIndices.Append(thisColumnNullIndex.Value); - rightRowIndices.Append(otherColumnNullIndex); - } - if (otherColumnNullIndices.Count == 0) - { - leftRowIndices.Append(thisColumnNullIndex.Value); - rightRowIndices.Append(null); - } - } - if (thisColumnNullIndices.Length == 0) + //In Outer join the joined dataframe retains each row — even if no other matching row exists in supplementary dataframe. + //Outer joins subdivide further into left outer joins (left dataframe is retained), right outer joins (right dataframe is retained), in full outer both are retained + + PrimitiveDataFrameColumn retainedRowIndices; + PrimitiveDataFrameColumn supplementaryRowIndices; + DataFrame supplementaryDataFrame; + DataFrame retainedDataFrame; + bool isLeftDataFrameRetained; + + if (joinAlgorithm == JoinAlgorithm.Left || joinAlgorithm == JoinAlgorithm.Right) + { + isLeftDataFrameRetained = (joinAlgorithm == JoinAlgorithm.Left); + + supplementaryDataFrame = isLeftDataFrameRetained ? 
other : this; + var supplementaryJoinColumns = isLeftDataFrameRetained ? rightJoinColumns : leftJoinColumns; + + retainedDataFrame = isLeftDataFrameRetained ? this : other; + var retainedJoinColumns = isLeftDataFrameRetained ? leftJoinColumns : rightJoinColumns; + + Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices); + + } + else if (joinAlgorithm == JoinAlgorithm.Inner) + { + // use as supplementary (for Hashing) the dataframe with the smaller RowCount + isLeftDataFrameRetained = (Rows.Count > other.Rows.Count); + + supplementaryDataFrame = isLeftDataFrameRetained ? other : this; + var supplementaryJoinColumns = isLeftDataFrameRetained ? rightJoinColumns : leftJoinColumns; + + retainedDataFrame = isLeftDataFrameRetained ? this : other; + var retainedJoinColumns = isLeftDataFrameRetained ? leftJoinColumns : rightJoinColumns; + + Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices, true); + } + else if (joinAlgorithm == JoinAlgorithm.FullOuter) + { + //In a full outer join we want to retain data from both sides, so we do it in 2 steps: first a LEFT JOIN, then we add the rows lost from the RIGHT side + + //Step 1 + //Do LEFT JOIN + isLeftDataFrameRetained = true; + + supplementaryDataFrame = isLeftDataFrameRetained ? other : this; + var supplementaryJoinColumns = isLeftDataFrameRetained ? rightJoinColumns : leftJoinColumns; + + retainedDataFrame = isLeftDataFrameRetained ? this : other; + var retainedJoinColumns = isLeftDataFrameRetained ? 
leftJoinColumns : rightJoinColumns; + + var intersection = Merge(retainedDataFrame, supplementaryDataFrame, retainedJoinColumns, supplementaryJoinColumns, out retainedRowIndices, out supplementaryRowIndices, calculateIntersection: true); + + //Step 2 + //Do RIGHT JOIN to retain all data from supplementary DataFrame too (take into account data intersection from the first step to avoid duplicates); join columns are resolved once, outside the row loop + var columns = supplementaryJoinColumns.Select(name => supplementaryDataFrame.Columns[name]).ToArray(); + for (long i = 0; i < supplementaryDataFrame.Columns.RowCount; i++) { - foreach (long otherColumnNullIndex in otherColumnNullIndices) + if (!IsAnyNullValueInColumns(columns, i)) { - leftRowIndices.Append(null); - rightRowIndices.Append(otherColumnNullIndex); + if (!intersection.Contains(i)) + { + retainedRowIndices.Append(null); + supplementaryRowIndices.Append(i); + } } } } else throw new NotImplementedException(nameof(joinAlgorithm)); - - for (int i = 0; i < leftDataFrame.Columns.Count; i++) + + DataFrame ret = new DataFrame(); + + //insert columns from left dataframe (this) + for (int i = 0; i < this.Columns.Count; i++) { - ret.Columns.Insert(i, leftDataFrame.Columns[i].Clone(leftRowIndices)); + ret.Columns.Insert(i, this.Columns[i].Clone(isLeftDataFrameRetained ? retainedRowIndices : supplementaryRowIndices)); } - for (int i = 0; i < rightDataFrame.Columns.Count; i++) + + //insert columns from right dataframe (other) + for (int i = 0; i < other.Columns.Count; i++) { - DataFrameColumn column = rightDataFrame.Columns[i].Clone(rightRowIndices); + DataFrameColumn column = other.Columns[i].Clone(isLeftDataFrameRetained ? 
supplementaryRowIndices : retainedRowIndices); SetSuffixForDuplicatedColumnNames(ret, column, leftSuffix, rightSuffix); ret.Columns.Insert(ret.Columns.Count, column); } diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index ac959ba290..88415d8bdc 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -210,6 +210,44 @@ public virtual DataFrameColumn Sort(bool ascending = true) /// A mapping of value() to the indices containing this value public virtual Dictionary> GroupColumnValues(out HashSet nullIndices) => throw new NotImplementedException(); + /// + /// Get occurrences of each value from this column in other column, grouped by this value + /// + /// + /// + /// A mapping of index from this column to the indices of same value in other column + public abstract Dictionary> GetGroupedOccurrences(DataFrameColumn other, out HashSet otherColumnNullIndices); + + /// + /// Get occurrences of each value from this column in other column, grouped by this value + /// + /// + /// + /// + /// A mapping of index from this column to the indices of same value in other column + protected Dictionary> GetGroupedOccurrences(DataFrameColumn other, out HashSet otherColumnNullIndices) + { + if (this.DataType != other.DataType) + throw new ArgumentException(String.Format(Strings.MismatchedColumnValueType, this.DataType), nameof(other)); + + // First hash other column + Dictionary> multimap = other.GroupColumnValues(out otherColumnNullIndices); + + var ret = new Dictionary>(); + + //For each value in this column find rows from other column with equal value (row index is long, like the other row loops of the join code) + for (long i = 0; i < this.Length; i++) + { + var value = this[i]; + if (value != null && multimap.TryGetValue((TKey)value, out ICollection otherRowIndices)) + { + ret.Add(i, otherRowIndices); + } + } + + return ret; + } + /// /// Returns a DataFrame containing counts of unique values /// diff --git 
a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 0c586c8cd7..8b44bca18f 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -810,5 +810,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor { return cursor.GetGetter(schemaColumn); } + + public override Dictionary> GetGroupedOccurrences(DataFrameColumn other, out HashSet otherColumnNullIndices) + { + return GetGroupedOccurrences(other, out otherColumnNullIndices); + } } } diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index e249f21557..4250c18785 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -511,5 +511,10 @@ protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor { return cursor.GetGetter>(schemaColumn); } + + public override Dictionary> GetGroupedOccurrences(DataFrameColumn other, out HashSet otherColumnNullIndices) + { + return GetGroupedOccurrences(other, out otherColumnNullIndices); + } } } diff --git a/src/Microsoft.Data.Analysis/Strings.Designer.cs b/src/Microsoft.Data.Analysis/Strings.Designer.cs index 4b24665bf3..9cbf90f38e 100644 --- a/src/Microsoft.Data.Analysis/Strings.Designer.cs +++ b/src/Microsoft.Data.Analysis/Strings.Designer.cs @@ -268,7 +268,7 @@ internal class Strings { } /// - /// Looks up a localized string similar to Column does not exist. + /// Looks up a localized string similar to Column '{0}' does not exist. /// internal static string InvalidColumnName { get { @@ -312,6 +312,15 @@ internal class Strings { } } + /// + /// Looks up a localized string similar to Array lengths are mismatched. 
+ /// + internal static string MismatchedArrayLengths { + get { + return ResourceManager.GetString("MismatchedArrayLengths", resourceCulture); + } + } + /// /// Looks up a localized string similar to Column lengths are mismatched. /// diff --git a/src/Microsoft.Data.Analysis/Strings.resx b/src/Microsoft.Data.Analysis/Strings.resx index 8975448708..79764037cc 100644 --- a/src/Microsoft.Data.Analysis/Strings.resx +++ b/src/Microsoft.Data.Analysis/Strings.resx @@ -201,6 +201,9 @@ MapIndices exceeds column length + + Array lengths are mismatched + Column lengths are mismatched diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameGroupByTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameGroupByTests.cs index fdbc859f7b..2319a9b64f 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameGroupByTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameGroupByTests.cs @@ -91,7 +91,6 @@ public void TestGroupingWithTKeyPrimitiveType() [Fact] public void TestGroupingWithTKeyOfWrongType() { - var message = string.Empty; //Create test dataframe (numbers starting from 0 up to lenght) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 72e9e628da..b19b1ce5d9 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -2007,37 +2007,39 @@ public void TestMergeEdgeCases_Outer() DataFrame left = MakeDataFrameWithAllMutableColumnTypes(5); left["Int"][3] = null; DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); + right["Int"][1] = 5; + right["Int"][3] = null; + right["Int"][4] = 6; + // Creates this case: /* - * Left: Right: - * 0 0 - * 1 5 - * null(2) null(7) - * null(3) null(8) - * 4 6 + * Left: Right: RowIndex: + * 0 0 0 + * 1 5 1 + * null null 2 + * null(3) null(3) 3 + * 4 6 4 */ + /* * Merge will result in a DataFrame like: - * Int_Left Int_Right - * 0 0 - * 1 null - * 4 null - * null 5 - * null 6 - * null(2) null(7) - 
* null(2) null(8) - * null(3) null(7) - * null(3) null(8) + * Int_left: Int_right: Merged: Index: + * 0 0 0 - 0 0 + * 1 null 1 - N 1 + * null null 2 - 2 2 + * null null(3) 2 - 3 3 + * null(3) null 3 - 2 4 + * null(3) null(3) 3 - 3 5 + * 4 null 4 - N 6 + * null 5 N - 1 7 + * null 6 N - 4 8 */ - right["Int"][1] = 5; - right["Int"][3] = null; - right["Int"][4] = 6; DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); Assert.Equal(9, merge.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - int[] mergeRows = new int[] { 0, 5, 6, 7, 8 }; + + int[] mergeRows = new int[] { 0, 2, 3, 4, 5 }; int[] leftRows = new int[] { 0, 2, 2, 3, 3 }; int[] rightRows = new int[] { 0, 2, 3, 2, 3 }; for (long i = 0; i < mergeRows.Length; i++) @@ -2048,7 +2050,7 @@ public void TestMergeEdgeCases_Outer() MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, rightRowIndex); } - mergeRows = new int[] { 1, 2 }; + mergeRows = new int[] { 1, 6 }; leftRows = new int[] { 1, 4 }; for (long i = 0; i < mergeRows.Length; i++) { @@ -2057,7 +2059,7 @@ public void TestMergeEdgeCases_Outer() MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, null); } - mergeRows = new int[] { 3, 4 }; + mergeRows = new int[] { 7, 8 }; rightRows = new int[] { 1, 4 }; for (long i = 0; i < mergeRows.Length; i++) { @@ -2067,6 +2069,416 @@ public void TestMergeEdgeCases_Outer() } } + [Fact] + public void TestMerge_ByTwoColumns_Complex_LeftJoin() + { + //Test left merge by two int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3, 4, 5 })); + left.Columns.Add (new Int32DataFrameColumn("G1", new[] { 0, 1, 1, 2, 2, 3 })); + left.Columns.Add (new Int32DataFrameColumn("G2", new[] { 3, 1, 2, 1, 2, 1})); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3 })); + right.Columns.Add(new 
Int32DataFrameColumn("G1", new[] { 1, 1, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1, 1 })); + + // Creates this case: + /* ------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * ------------------------- + * 0 0 3 | 0 1 1 + * 1 1 1 | 1 1 2 + * 2 1 2 | 2 1 1 + * 3 2 1 | 3 2 1 + * 4 2 2 + * 5 3 1 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 0 3 0 - N + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 2 1 1 1 - 2 + * 2 1 2 1 1 2 2 - 1 + * 3 2 1 3 2 1 3 - 3 + * 4 2 2 4 - N + * 5 3 1 5 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, null), + (1, 0), + (1, 2), + (2, 1), + (3, 3), + (4, null), + (5, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_LeftJoin() + { + //Test left merge by two int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a 
DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 1 1 1 1 - 1 + * 2 3 3 2 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, 0), + (0, 1), + (1, 0), + (1, 1), + (2, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_RightJoin() + { + //Test right merge by two int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 1 1 1 0 1 1 1 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 1 1 1 1 - 1 + * 2 0 0 N - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Right); + + //Assert + var expectedMerged = new (int? Left, int? 
Right)[] { + (0, 0), + (1, 0), + (0, 1), + (1, 1), + (null, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_InnerJoin() + { + //Test inner merge by two int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 1 1 1 0 1 1 1 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 1 1 1 1 - 1 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Inner); + + //Assert + var expectedMerged = new (int? Left, int? 
Right)[] { + (0, 0), + (1, 0), + (0, 1), + (1, 1) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_OuterJoin() + { + //Test full outer merge by two int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 1 1 1 1 - 1 + * 2 3 3 2 - N + * 2 0 0 N - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.FullOuter); + + //Assert + var expectedMerged = new (int? Left, int? 
Right)[] { + (0, 0), + (0, 1), + (1, 0), + (1, 1), + (2, null), + (null, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByThreeColumns_OneToOne_LeftJoin() + { + //Test merge by LEFT join of int and string columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); + left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); + right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); + + // Creates this case: + /* ----------------------------- + * Left | Right + * I G1 G2 G3 | I G1 G2 G3 + * ------------------------------ + * 0 1 1 A | 0 0 1 Z + * 1 1 2 B | 1 1 1 Y + * 2 2 1 C | 2 1 2 B + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 G3 IR Merged: + * ------------------------- + * 0 1 1 A 0 - N + * 1 1 2 B 2 1 2 B 1 - 2 + * 2 2 1 C 2 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }); + + //Assert + var expectedMerged = new (int? Left, int? 
Right)[] { + (0, null), + (1, 2), + (2, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByThreeColumns_OneToOne_RightJoin() + { + //Test merge by RIGHT join of int and string columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); + left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); + right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); + + // Creates this case: + /* ----------------------------- + * Left | Right + * I G1 G2 G3 | I G1 G2 G3 + * ------------------------------ + * 0 1 1 A | 0 0 1 Z + * 1 1 2 B | 1 1 1 Y + * 2 2 1 C | 2 1 2 B + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 G3 IR Merged: + * ------------------------- + * 0 0 1 Z N - 0 + * 1 1 1 Y N - 1 + * 1 1 2 B 2 1 2 B 1 - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }, joinAlgorithm: JoinAlgorithm.Right); + + //Assert + var expectedMerged = new (int? Left, int? 
Right)[] { + (null, 0), + (null, 1), + (1, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + [Fact] public void TestMerge_Issue5778() { @@ -2291,7 +2703,7 @@ public void TestClone(int dfLength, int intDfLength) } } } - + [Fact] public void TestColumnCreationFromExisitingColumn() {