[ML] Make regex more efficient

The regex that was used to detect document IDs left over from version 5.4 had a leading (.*), which can be very inefficient. It's not hard to refactor the test for version 5.4 doc IDs to use a more deterministic regex for the suffix plus a simple scan for the last '-' character.
droberts195 · Feb 21, 2024 · 4ed38eb · 4ed38eb
1 parent 0da5220
commit 4ed38eb
Showing 1 changed file with 7 additions and 3 deletions.
diff --git a/...rc/main/java/org/elasticsearch/xpack/core/ml/job/process/autodetect/state/ModelState.java b/...rc/main/java/org/elasticsearch/xpack/core/ml/job/process/autodetect/state/ModelState.java
@@ -21,7 +21,7 @@ public final class ModelState {
      */
     public static final String TYPE = "model_state";
 
-    private static final Pattern V_5_4_DOC_ID_REGEX = Pattern.compile("(.*)-\\d{10}#\\d+");
+    private static final Pattern V_5_4_DOC_ID_SUFFIX_REGEX = Pattern.compile("\\d{10}#\\d+");
 
     public static String documentId(String jobId, String snapshotId, int docNum) {
         return jobId + "_" + TYPE + "_" + snapshotId + "#" + docNum;
@@ -43,9 +43,13 @@ public static String extractJobId(String docId) {
      * and ended with hash and an integer.
      */
     private static String v54ExtractJobId(String docId) {
-        Matcher matcher = V_5_4_DOC_ID_REGEX.matcher(docId);
+        int potentialSuffixIndex = docId.lastIndexOf('-');
+        if (potentialSuffixIndex <= 0 || potentialSuffixIndex >= docId.length() - 1) {
+            return null;
+        }
+        Matcher matcher = V_5_4_DOC_ID_SUFFIX_REGEX.matcher(docId.subSequence(potentialSuffixIndex + 1, docId.length()));
         if (matcher.matches()) {
-            return matcher.group(1);
+            return docId.substring(0, potentialSuffixIndex);
         }
         return null;
     }