I'm trying to join two DataTables on a dynamic number of columns. I've gotten as far as the code below. The problem is the ON statement of the join. How can I make this dynamic based on how many column names are in the list "joinColumnNames".
I was thinking I will need to build some sort of expression tree, but I can't find any examples of how to do this with multiple join columns and with the DataRow object which doesn't have properties for each column.
private DataTable Join(List<string> joinColumnNames, DataTable pullX, DataTable pullY)
{
DataTable joinedTable = new DataTable();
// Add all the columns from pullX
foreach (string colName in joinColumnNames)
{
joinedTable.Columns.Add(pullX.Columns[colName]);
}
// Add unique columns from PullY
foreach (DataColumn col in pullY.Columns)
{
if (!joinedTable.Columns.Contains((col.ColumnName)))
{
joinedTable.Columns.Add(col);
}
}
var Join = (from PX in pullX.AsEnumerable()
join PY in pullY.AsEnumerable() on
// This must be dynamic and join on every column mentioned in joinColumnNames
new { A = PX[joinColumnNames[0]], B = PX[joinColumnNames[1]] } equals new { A = PY[joinColumnNames[0]], B = PY[joinColumnNames[1]] }
into Outer
from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
select new { PX, PY });
foreach (var item in Join)
{
DataRow newRow = joinedTable.NewRow();
foreach (DataColumn col in joinedTable.Columns)
{
var pullXValue = item.PX.Table.Columns.Contains(col.ColumnName) ? item.PX[col.ColumnName] : string.Empty;
var pullYValue = item.PY.Table.Columns.Contains(col.ColumnName) ? item.PY[col.ColumnName] : string.Empty;
newRow[col.ColumnName] = (pullXValue == null || string.IsNullOrEmpty(pullXValue.ToString())) ? pullYValue : pullXValue;
}
joinedTable.Rows.Add(newRow);
}
return joinedTable;
}
Adding a specific example to show input/output using 3 join columns (Country, Company, and DateId):
Pull X:
Country Company DateId Sales United States Test1 Ltd 20160722 $25 Canada Test3 Ltd 20160723 $30 Italy Test4 Ltd 20160724 $40 India Test2 Ltd 20160725 $35
Pull Y:
Country Company DateId Downloads United States Test1 Ltd 20160722 500 Mexico Test2 Ltd 20160723 300 Italy Test4 Ltd 20160724 900
Result:
Country Company DateId Sales Downloads United States Test1 Ltd 20160722 $25 500 Canada Test3 Ltd 20160723 $30 Mexico Test2 Ltd 20160723 300 Italy Test4 Ltd 20160724 $40 900 India Test2 Ltd 20160725 $35
var Join =
from PX in pullX.AsEnumerable()
join PY in pullY.AsEnumerable()
on string.Join("\0", joinColumnNames.Select(c => PX[c]))
equals string.Join("\0", joinColumnNames.Select(c => PY[c]))
into Outer
from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
select new { PX, PY };
Another way is to have both DataTable
in a DataSet
and use DataRelation
How To: Use DataRelation to perform a join on two DataTables in a DataSet?
Since you are using LINQ to Objects, there is no need to use expression trees. You can solve your problem with a custom equality comparer.
Create an equality comparer that can compare equality between two DataRow
objects based on the values of specific columns. Here is an example:
public class MyEqualityComparer : IEqualityComparer<DataRow>
{
private readonly string[] columnNames;
public MyEqualityComparer(string[] columnNames)
{
this.columnNames = columnNames;
}
public bool Equals(DataRow x, DataRow y)
{
return columnNames.All(cn => x[cn].Equals(y[cn]));
}
public int GetHashCode(DataRow obj)
{
unchecked
{
int hash = 19;
foreach (var value in columnNames.Select(cn => obj[cn]))
{
hash = hash * 31 + value.GetHashCode();
}
return hash;
}
}
}
Then you can use it to make the join like this:
public class TwoRows
{
public DataRow Row1 { get; set; }
public DataRow Row2 { get; set; }
}
private static List<TwoRows> LeftOuterJoin(
List<string> joinColumnNames,
DataTable leftTable,
DataTable rightTable)
{
return leftTable
.AsEnumerable()
.GroupJoin(
rightTable.AsEnumerable(),
l => l,
r => r,
(l, rlist) => new {LeftValue = l, RightValues = rlist},
new MyEqualityComparer(joinColumnNames.ToArray()))
.SelectMany(
x => x.RightValues.DefaultIfEmpty(rightTable.NewRow()),
(x, y) => new TwoRows {Row1 = x.LeftValue, Row2 = y})
.ToList();
}
Please note that I am using method syntax because I don't think that you can use a custom equality comparer otherwise.
Please note that the method does a left outer join, not a full outer join. Based on the example you provided, you seem to want a full outer join. To do this you need to do two left outer joins (see this answer). Here is how the full method would look like:
private static DataTable FullOuterJoin(
List<string> joinColumnNames,
DataTable pullX,
DataTable pullY)
{
var pullYOtherColumns =
pullY.Columns
.Cast<DataColumn>()
.Where(x => !joinColumnNames.Contains(x.ColumnName))
.ToList();
var allColumns =
pullX.Columns
.Cast<DataColumn>()
.Concat(pullYOtherColumns)
.ToArray();
var allColumnsClone =
allColumns
.Select(x => new DataColumn(x.ColumnName, x.DataType))
.ToArray();
DataTable joinedTable = new DataTable();
joinedTable.Columns.AddRange(allColumnsClone);
var first =
LeftOuterJoin(joinColumnNames, pullX, pullY);
var resultRows = new List<DataRow>();
foreach (var item in first)
{
DataRow newRow = joinedTable.NewRow();
foreach (DataColumn col in joinedTable.Columns)
{
var value = pullX.Columns.Contains(col.ColumnName)
? item.Row1[col.ColumnName]
: item.Row2[col.ColumnName];
newRow[col.ColumnName] = value;
}
resultRows.Add(newRow);
}
var second =
LeftOuterJoin(joinColumnNames, pullY, pullX);
foreach (var item in second)
{
DataRow newRow = joinedTable.NewRow();
foreach (DataColumn col in joinedTable.Columns)
{
var value = pullY.Columns.Contains(col.ColumnName)
? item.Row1[col.ColumnName]
: item.Row2[col.ColumnName];
newRow[col.ColumnName] = value;
}
resultRows.Add(newRow);
}
var uniqueRows =
resultRows
.Distinct(
new MyEqualityComparer(
joinedTable.Columns
.Cast<DataColumn>()
.Select(x => x.ColumnName)
.ToArray()));
foreach (var uniqueRow in uniqueRows)
joinedTable.Rows.Add(uniqueRow);
return joinedTable;
}
Please note also how I clone the columns. You cannot use the same column object in two tables.