-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement secure boost scheme - secure evaluation and validation (during training) without local feature leakage #10079
Changes from 38 commits
8570ba5
2d00db6
ab17f5a
fb1787c
7a2a2b8
3ca3142
22dd522
52e8951
e9eef15
70e6ca6
a54ea6a
6fe61dd
1c2b7ed
b36ff2b
0707731
dce7609
6cebc31
1562f52
f31c824
6fcbe02
967e307
04cd1cb
087a8dd
5e85438
e008818
1fd1fb0
72159b9
069f811
4e3c329
d0bee2f
4624c3f
85b215d
cb6af9f
b1ca59a
2ba22dd
7cdde6f
be37fcd
090cb1a
da97000
0c854a4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -303,22 +303,35 @@ class HistEvaluator { | |
// forward enumeration: split at right bound of each bin | ||
loss_chg = | ||
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum}, | ||
GradStats{right_sum}) - | ||
parent.root_gain); | ||
split_pt = cut_val[i]; // not used for partition based | ||
best.Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum); | ||
GradStats{right_sum}) - parent.root_gain); | ||
if (!is_secure_) { | ||
split_pt = cut_val[i]; // not used for partition based | ||
best.Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum); | ||
} else { | ||
// secure mode: record the best split point, rather than the actual value | ||
// since it is not accessible at this point (active party finding best-split) | ||
best.Update(loss_chg, fidx, i, d_step == -1, false, left_sum, right_sum); | ||
} | ||
} else { | ||
Comment on lines
+307
to
315
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think a policy class might help here? Or maybe there are other efficient ways to handle these conditions? I'm losing track of these conditions, considering that we have three enumeration functions:
Then we have three split modes:
So, in combination, 9 potential cases, and we haven't counted vector leaf yet. Need to find a better way to manage these conditions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it can be tricky to consolidate, since the 9 cases have high overlaps (e.g. same enumeration logic for all splits modes except when secure+passive party), some further processing only for col_split (w/ w/o secure), but irrelevant to enumeration. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another thing regarding this mode combinations: potentially with the upcoming processor interface we will be able to enable encrypted horizontal, shall we further add a row + secure mode, adding a 4th one for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my preference, I would have put it in the |
||
// backward enumeration: split at left bound of each bin | ||
loss_chg = | ||
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{right_sum}, | ||
GradStats{left_sum}) - | ||
parent.root_gain); | ||
if (i == imin) { | ||
split_pt = cut.MinValues()[fidx]; | ||
GradStats{left_sum}) - parent.root_gain); | ||
if (!is_secure_) { | ||
if (i == imin) { | ||
split_pt = cut.MinValues()[fidx]; | ||
} else { | ||
split_pt = cut_val[i - 1]; | ||
} | ||
best.Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum); | ||
} else { | ||
split_pt = cut_val[i - 1]; | ||
// secure mode: record the best split point, rather than the actual value | ||
// since it is not accessible at this point (active party finding best-split) | ||
if (i != imin) { | ||
i = i - 1; | ||
} | ||
best.Update(loss_chg, fidx, i, d_step == -1, false, right_sum, left_sum); | ||
} | ||
best.Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum); | ||
} | ||
} | ||
} | ||
|
@@ -352,7 +365,6 @@ class HistEvaluator { | |
} | ||
auto evaluator = tree_evaluator_.GetEvaluator(); | ||
auto const &cut_ptrs = cut.Ptrs(); | ||
|
||
// Under secure vertical setting, only the active party is able to evaluate the split | ||
// based on global histogram. Other parties will receive the final best split information | ||
// Hence the below computation is not performed by the passive parties | ||
|
@@ -417,6 +429,16 @@ class HistEvaluator { | |
all_entries[worker * entries.size() + nidx_in_set].split); | ||
} | ||
} | ||
if (is_secure_) { | ||
// At this point, all the workers have the best splits for all the nodes | ||
// and workers can recover the actual split value with the split index | ||
// Note that after the recovery, different workers will hold different | ||
// split_value: real value for feature owner, NaN for others | ||
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { | ||
auto cut_index = entries[nidx_in_set].split.split_value; | ||
entries[nidx_in_set].split.split_value = cut.Values()[cut_index]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Based on my understanding, categorical features are not yet supported right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
right, will need to find a proper use-case / testing data with categorical features to add the support, it seems the categorical feature is "experimental" according to some of the last year's release notes, is it still the case? maybe we can add the support later when we find it really necessary.