Skip to content

Commit

Permalink
[ML] Make regex more efficient
Browse files Browse the repository at this point in the history
The regex that was used to detect document IDs left over from
version 5.4 had a leading (.*) which can be very inefficient.
It's not hard to refactor the test for version 5.4 doc IDs to
use a more deterministic regex plus a simple scan for a single
character.
  • Loading branch information
droberts195 committed Feb 21, 2024
1 parent 0da5220 commit 4ed38eb
Showing 1 changed file with 7 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public final class ModelState {
*/
public static final String TYPE = "model_state";

private static final Pattern V_5_4_DOC_ID_REGEX = Pattern.compile("(.*)-\\d{10}#\\d+");
private static final Pattern V_5_4_DOC_ID_SUFFIX_REGEX = Pattern.compile("\\d{10}#\\d+");

public static String documentId(String jobId, String snapshotId, int docNum) {
return jobId + "_" + TYPE + "_" + snapshotId + "#" + docNum;
Expand All @@ -43,9 +43,13 @@ public static String extractJobId(String docId) {
* and ended with hash and an integer.
*/
private static String v54ExtractJobId(String docId) {
// NOTE(review): this span is a diff hunk rendered without +/- markers, so the
// removed and added lines of the commit appear together. The labels below follow
// the commit description and its "7 additions and 3 deletions" count.

// Removed line: matched the entire doc ID against the old pattern, whose leading
// (.*) group the commit message describes as potentially very inefficient.
Matcher matcher = V_5_4_DOC_ID_REGEX.matcher(docId);
// Added line: locate the final '-' with a plain character scan. The suffix pattern
// (\d{10}#\d+) cannot itself contain '-', so a matching suffix must start right
// after the last '-' in the ID.
int potentialSuffixIndex = docId.lastIndexOf('-');
// Added lines: give up when there is no '-', when '-' is the first character
// (nothing before it to return), or when '-' is the last character (no suffix).
if (potentialSuffixIndex <= 0 || potentialSuffixIndex >= docId.length() - 1) {
return null;
}
// Added line: only the text after the last '-' is handed to the suffix pattern.
Matcher matcher = V_5_4_DOC_ID_SUFFIX_REGEX.matcher(docId.subSequence(potentialSuffixIndex + 1, docId.length()));
if (matcher.matches()) {
// Removed line: the job ID used to come from capture group 1 of the old pattern.
return matcher.group(1);
// Added line: the job ID is now everything before the last '-'.
return docId.substring(0, potentialSuffixIndex);
}
// The suffix did not match, so no job ID is extracted.
return null;
}
Expand Down

0 comments on commit 4ed38eb

Please sign in to comment.