Resolves #1682: Support auto-complete via querying main Lucene directory (#1683)

* Current State is rough

* Resolves #1682: Support auto-complete via querying main Lucene directory

This modifies the auto-complete cursor so that it reads from the main Lucene index and then reconstitutes the original text by reading from the base record data.

Co-authored-by: john_leach <jleach4@gmail.com>
alecgrieser and jleach4 committed May 25, 2022
1 parent d0e98a7 commit 606bfb8
Showing 13 changed files with 835 additions and 1,112 deletions.
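
The commit message above summarizes the new approach: auto-complete is answered by querying the main full-text index, and the suggestion text is reconstituted from the base record instead of being kept in a separate suggester directory. Below is a rough, plain-Lucene sketch of that general technique (not the Record Layer implementation; the class name, method names, field name, and choice of StandardAnalyzer are illustrative assumptions). An auto-complete request is built as a Boolean query in which every completed token must match and the final, partially typed token is treated as a prefix.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public final class AutoCompleteOnMainIndexSketch {

    // Tokenize the partially typed input with the query analyzer.
    static List<String> tokenize(Analyzer analyzer, String field, String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        }
        return tokens;
    }

    // Every completed token must match; the last token (still being typed) is a prefix.
    static Query buildAutoCompleteQuery(List<String> tokens, String field) {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        for (int i = 0; i < tokens.size() - 1; i++) {
            builder.add(new TermQuery(new Term(field, tokens.get(i))), BooleanClause.Occur.MUST);
        }
        builder.add(new PrefixQuery(new Term(field, tokens.get(tokens.size() - 1))), BooleanClause.Occur.MUST);
        return builder.build();
    }

    static void suggest(Directory mainIndex, String field, String partialInput) throws IOException {
        Analyzer queryAnalyzer = new StandardAnalyzer();
        List<String> tokens = tokenize(queryAnalyzer, field, partialInput);
        if (tokens.isEmpty()) {
            return; // nothing typed yet, nothing to suggest
        }
        try (DirectoryReader reader = DirectoryReader.open(mainIndex)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            for (ScoreDoc scoreDoc : searcher.search(buildAutoCompleteQuery(tokens, field), 10).scoreDocs) {
                // In the Record Layer, the matched document's primary key would then be used to load
                // the base record and reconstitute the original text for display.
                System.out.println("matched doc " + scoreDoc.doc + " with score " + scoreDoc.score);
            }
        }
    }
}

The actual cursor additionally honors index options such as the minimum prefix size and the highlighting flag that appear later in this diff.
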
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
@@ -26,7 +26,7 @@ This release also updates downstream dependency versions. Most notably, the prot
* **Bug fix** Fix 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Bug fix** Failed no-ops no longer log at `ERROR` [(Issue #1692)](https://github.com/FoundationDB/fdb-record-layer/issues/1692)
* **Performance** Improvement 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Lucene auto-complete is now handled by running queries on the main index, avoiding the need for a separate directory [(Issue #1682)](https://github.com/FoundationDB/fdb-record-layer/issues/1682)
* **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)

Large diffs are not rendered by default.

@@ -0,0 +1,79 @@
/*
* LuceneCursorContinuation.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2015-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.apple.foundationdb.record.lucene;

import com.apple.foundationdb.record.RecordCursorContinuation;
import com.google.protobuf.ByteString;
import org.apache.lucene.search.ScoreDoc;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
* Continuation from scanning a Lucene index. This wraps the LuceneIndexContinuation protobuf message,
* which contains enough information to use the Lucene
* {@link org.apache.lucene.search.IndexSearcher#searchAfter(ScoreDoc, org.apache.lucene.search.Query, int) searchAfter}
* feature to resume a query.
*/
class LuceneCursorContinuation implements RecordCursorContinuation {
@Nonnull
private final LuceneContinuationProto.LuceneIndexContinuation protoContinuation;

@SuppressWarnings("squid:S3077") // Byte array is immutable once created, so does not need to use atomic array
private volatile byte[] byteContinuation;

private LuceneCursorContinuation(@Nonnull LuceneContinuationProto.LuceneIndexContinuation protoContinuation) {
this.protoContinuation = protoContinuation;
}

@Nullable
@Override
public byte[] toBytes() {
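// Serialize the protobuf continuation at most once, lazily, using double-checked locking on the volatile field.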
if (byteContinuation == null) {
synchronized (this) {
if (byteContinuation == null) {
byteContinuation = toByteString().toByteArray();
}
}
}
return byteContinuation;
}

@Nonnull
@Override
public ByteString toByteString() {
return protoContinuation.toByteString();
}

@Override
public boolean isEnd() {
return false;
}

public static LuceneCursorContinuation fromScoreDoc(ScoreDoc scoreDoc) {
return new LuceneCursorContinuation(LuceneContinuationProto.LuceneIndexContinuation.newBuilder()
.setDoc(scoreDoc.doc)
.setShard(scoreDoc.shardIndex)
.setScore(scoreDoc.score)
.build()
);
}
}
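
For context on how a continuation like the one above can be consumed, here is a hedged sketch: the serialized message is parsed back into a ScoreDoc and passed to IndexSearcher.searchAfter to resume the query where the previous page stopped. The proto accessor names and numeric casts are assumptions mirroring the setDoc/setShard/setScore calls in fromScoreDoc, and searcher, query, and pageSize are placeholders supplied by the caller.

import com.apple.foundationdb.record.lucene.LuceneContinuationProto;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import java.io.IOException;

final class ResumeFromContinuationSketch {

    // Resume a paginated Lucene search from the bytes produced by LuceneCursorContinuation.toBytes().
    static TopDocs resume(IndexSearcher searcher, Query query, byte[] continuationBytes, int pageSize) throws IOException {
        LuceneContinuationProto.LuceneIndexContinuation proto =
                LuceneContinuationProto.LuceneIndexContinuation.parseFrom(continuationBytes);
        // The casts hedge against the exact numeric types declared in the proto definition.
        ScoreDoc after = new ScoreDoc((int) proto.getDoc(), (float) proto.getScore(), (int) proto.getShard());
        return searcher.searchAfter(after, query, pageSize);
    }
}
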
@@ -32,7 +32,6 @@
import com.apple.foundationdb.record.RecordCursor;
import com.apple.foundationdb.record.ScanProperties;
import com.apple.foundationdb.record.TupleRange;
import com.apple.foundationdb.record.logging.KeyValueLogMessage;
import com.apple.foundationdb.record.logging.LogMessageKeys;
import com.apple.foundationdb.record.lucene.directory.FDBDirectoryManager;
import com.apple.foundationdb.record.metadata.IndexAggregateFunction;
@@ -47,10 +46,10 @@
import com.apple.foundationdb.record.provider.foundationdb.IndexScanBounds;
import com.apple.foundationdb.record.provider.foundationdb.indexes.InvalidIndexEntry;
import com.apple.foundationdb.record.provider.foundationdb.indexes.StandardIndexMaintainer;
import com.apple.foundationdb.record.provider.foundationdb.properties.RecordLayerPropertyKey;
import com.apple.foundationdb.record.query.QueryToKeyMatcher;
import com.apple.foundationdb.tuple.Tuple;
import com.google.protobuf.Message;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
@@ -64,7 +63,6 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.slf4j.Logger;
@@ -73,10 +71,8 @@
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -96,7 +92,6 @@ public class LuceneIndexMaintainer extends StandardIndexMaintainer {
private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexMaintainer.class);
private final FDBDirectoryManager directoryManager;
private final AnalyzerChooser indexAnalyzerChooser;
private final AnalyzerChooser autoCompleteIndexAnalyzerChooser;
private final AnalyzerChooser autoCompleteQueryAnalyzerChooser;
protected static final String PRIMARY_KEY_FIELD_NAME = "p"; // TODO: Need to find reserved names..
protected static final String PRIMARY_KEY_SEARCH_NAME = "s"; // TODO: Need to find reserved names..
@@ -109,7 +104,6 @@ public LuceneIndexMaintainer(@Nonnull final IndexMaintainerState state, @Nonnull
this.executor = executor;
this.directoryManager = FDBDirectoryManager.getManager(state);
this.indexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.FULL_TEXT).getLeft();
this.autoCompleteIndexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getLeft();
this.autoCompleteQueryAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getRight();
this.autoCompleteEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_ENABLED, false);
this.highlightForAutoCompleteIfEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_HIGHLIGHT, false);
@@ -152,14 +146,9 @@ public RecordCursor<IndexEntry> scan(@Nonnull final IndexScanBounds scanBounds,
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
LuceneScanAutoComplete scanAutoComplete = (LuceneScanAutoComplete)scanBounds;
try {
return new LuceneAutoCompleteResultCursor(getSuggester(scanAutoComplete.getGroupKey(),
Collections.singletonList(scanAutoComplete.getKeyToComplete()), null), scanAutoComplete.getKeyToComplete(),
executor, scanProperties, state, scanAutoComplete.getGroupKey(), highlightForAutoCompleteIfEnabled);
} catch (IOException ex) {
throw new RecordCoreException("Exception to get suggester for auto-complete search", ex)
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
Analyzer analyzer = autoCompleteQueryAnalyzerChooser.chooseAnalyzer(scanAutoComplete.getKeyToComplete()).getAnalyzer();
return new LuceneAutoCompleteResultCursor(scanAutoComplete.getKeyToComplete(),
executor, scanProperties, analyzer, state, scanAutoComplete.getGroupKey(), highlightForAutoCompleteIfEnabled);
}

if (scanType.equals(LuceneScanTypes.BY_LUCENE_SPELL_CHECK)) {
@@ -174,62 +163,21 @@ public RecordCursor<IndexEntry> scan(@Nonnull final IndexScanBounds scanBounds,
throw new RecordCoreException("unsupported scan type for Lucene index: " + scanType);
}

private boolean addTermToSuggesterIfNeeded(@Nonnull String value, @Nonnull String fieldName, @Nullable AnalyzingInfixSuggester suggester) {
if (suggester == null) {
return false;
}

final byte[] valueBytes = value.getBytes(StandardCharsets.UTF_8);
final RecordLayerPropertyKey<Integer> sizeLimitProp = LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_TEXT_SIZE_UPPER_LIMIT;
final int sizeLimit = Objects.requireNonNullElse(state.context.getPropertyStorage().getPropertyValue(sizeLimitProp), sizeLimitProp.getDefaultValue()).intValue();
// Ignore this text if its size exceeds the limitation
if (valueBytes.length > sizeLimit) {
if (LOG.isTraceEnabled()) {
LOG.trace(KeyValueLogMessage.of("Skip auto-complete indexing due to exceeding size limitation",
LuceneLogMessageKeys.DATA_SIZE, valueBytes.length,
LuceneLogMessageKeys.DATA_VALUE, value.substring(0, Math.min(value.length(), 100)),
LogMessageKeys.FIELD_NAME, fieldName));
}
return false;
}

try {
suggester.add(new BytesRef(valueBytes),
Set.of(new BytesRef(fieldName.getBytes(StandardCharsets.UTF_8))),
state.context.getPropertyStorage().getPropertyValue(LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_DEFAULT_WEIGHT),
new BytesRef(Tuple.from(fieldName).pack()));
if (LOG.isTraceEnabled()) {
LOG.trace(KeyValueLogMessage.of("Added auto-complete suggestion to suggester",
LuceneLogMessageKeys.DATA_SIZE, valueBytes.length,
LuceneLogMessageKeys.DATA_VALUE, value.substring(0, Math.min(value.length(), 100)),
LogMessageKeys.FIELD_NAME, fieldName));
}
return true;
} catch (IOException ex) {
throw new RecordCoreException("Exception to add term into suggester", ex)
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
}

/**
* Insert a field into the document and add a suggestion into the suggester if needed.
* @return whether a suggestion has been added to the suggester
*/
@SuppressWarnings("java:S3776")
private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final Document document,
@Nullable AnalyzingInfixSuggester suggester) {
private void insertField(LuceneDocumentFromRecord.DocumentField field, final Document document) {
final String fieldName = field.getFieldName();
final Object value = field.getValue();
final Field luceneField;
final Field sortedField;
final StoredField storedField;
boolean suggestionAdded = false;
switch (field.getType()) {
case TEXT:
luceneField = new Field(fieldName, (String) value, getTextFieldType(field));
sortedField = null;
storedField = null;
suggestionAdded = addTermToSuggesterIfNeeded((String) value, fieldName, suggester);
break;
case STRING:
luceneField = new StringField(fieldName, (String)value, field.isStored() ? Field.Store.YES : Field.Store.NO);
@@ -266,38 +214,32 @@ private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final
if (storedField != null) {
document.add(storedField);
}
return suggestionAdded;
}

private void writeDocument(@Nonnull List<LuceneDocumentFromRecord.DocumentField> fields, Tuple groupingKey,
byte[] primaryKey) throws IOException {
final List<String> texts = fields.stream()
.filter(f -> f.getType().equals(LuceneIndexExpressions.DocumentFieldType.TEXT))
.map(f -> (String) f.getValue()).collect(Collectors.toList());
Document document = new Document();
final IndexWriter newWriter = directoryManager.getIndexWriter(groupingKey,
indexAnalyzerChooser.chooseAnalyzer(texts));
BytesRef ref = new BytesRef(primaryKey);
Document document = new Document();
document.add(new StoredField(PRIMARY_KEY_FIELD_NAME, ref));
document.add(new SortedDocValuesField(PRIMARY_KEY_SEARCH_NAME, ref));

Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> indexOptionsToFieldsMap = getIndexOptionsToFieldsMap(fields);
for (Map.Entry<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> entry : indexOptionsToFieldsMap.entrySet()) {
final AnalyzingInfixSuggester suggester = autoCompleteEnabled ? getSuggester(groupingKey, texts, entry.getKey()) : null;
boolean suggestionAdded = false;
for (LuceneDocumentFromRecord.DocumentField field : entry.getValue()) {
suggestionAdded = insertField(field, document, suggester) || suggestionAdded;
}
if (suggestionAdded) {
suggester.refresh();
insertField(field, document);
}
}
newWriter.addDocument(document);
}

@Nonnull
private Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> getIndexOptionsToFieldsMap(@Nonnull List<LuceneDocumentFromRecord.DocumentField> fields) {
final Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> map = new HashMap<>();
final Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> map = new EnumMap<>(IndexOptions.class);
fields.stream().forEach(f -> {
final IndexOptions indexOptions = getIndexOptions((String) Objects.requireNonNullElse(f.getConfig(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS),
LuceneFunctionNames.LuceneFieldIndexOptions.DOCS_AND_FREQS_AND_POSITIONS.name()));
Expand Down Expand Up @@ -364,16 +306,6 @@ public <M extends Message> CompletableFuture<Void> update(@Nullable FDBIndexable
return AsyncUtil.DONE;
}

/**
* Get the {@link AnalyzingInfixSuggester} for indexing or query, from the session of the context if there exists a corresponding one, or by creating a new one.
* @param indexOptions the {@link IndexOptions} for suggester's {@link FieldType}. This only matters for when the suggester is for indexing.
* The one for query can just use an arbitrary one, so just pass in a NULL when getting a suggester for query, so the existing one from session of context can be reused.
*/
private AnalyzingInfixSuggester getSuggester(@Nullable Tuple groupingKey, @Nonnull List<String> texts, @Nullable IndexOptions indexOptions) throws IOException {
return directoryManager.getAutocompleteSuggester(groupingKey, autoCompleteIndexAnalyzerChooser.chooseAnalyzer(texts),
autoCompleteQueryAnalyzerChooser.chooseAnalyzer(texts), highlightForAutoCompleteIfEnabled, indexOptions);
}

private FieldType getTextFieldType(LuceneDocumentFromRecord.DocumentField field) {
FieldType ft = new FieldType();

@@ -33,19 +33,25 @@ public class LuceneIndexOptions {
public static final String AUTO_COMPLETE_ENABLED = "autoCompleteEnabled";
/**
* The type of auto-complete blender used to transform the weight after search, taking into account the position of the searched term in the indexed text.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_TYPE = "autoCompleteBlenderType";
/**
* The factor by which to multiply the number of searched elements for the auto-complete blender.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_NUM_FACTOR = "autoCompleteBlenderNumFactor";
/**
* The minimum number of leading characters before a prefix query is used for auto-complete.
*/
public static final String AUTO_COMPLETE_MIN_PREFIX_SIZE = "autoCompleteMinPrefixSize";
/**
* The exponent to use for auto complete when the blender type is POSITION_EXPONENTIAL_RECIPROCAL.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_EXPONENT = "autoCompleteBlenderExponent";
/**
* Whether to highlight the suggest query in suggestions.
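
For reference, these option keys are supplied as plain strings in the index's option map when the Lucene index is defined. A minimal sketch, assuming LuceneIndexOptions lives in the same com.apple.foundationdb.record.lucene package as the other classes in this diff and using illustrative values:

import com.apple.foundationdb.record.lucene.LuceneIndexOptions;

import java.util.Map;

final class AutoCompleteOptionsSketch {

    // Illustrative only: enable auto-complete and require at least three typed characters
    // before a prefix query is used; index options are plain string-to-string pairs.
    static final Map<String, String> AUTO_COMPLETE_OPTIONS = Map.of(
            LuceneIndexOptions.AUTO_COMPLETE_ENABLED, "true",
            LuceneIndexOptions.AUTO_COMPLETE_MIN_PREFIX_SIZE, "3");
}
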
@@ -90,19 +90,23 @@ public final class LuceneRecordContextProperties {

/**
* Maximum segment size to produce during normal merging for auto-complete search with Lucene.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Double> LUCENE_AUTO_COMPLETE_MERGE_MAX_SIZE = RecordLayerPropertyKey.doublePropertyKey("com.apple.foundationdb.record.lucene.autoCompleteMergeMaxSize", 5.0);

/**
* Maximum number of segments to be merged at a time for auto-complete search with Lucene, during forceMerge for forceMergeDeletes.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Integer> LUCENE_AUTO_COMPLETE_MERGE_MAX_NUMBER = RecordLayerPropertyKey.integerPropertyKey("com.apple.foundationdb.record.lucene.autoCompleteMergeMaxNum", 2);

/**
* This controls the suggester's base class to use for Lucene auto-complete search.
* True to use a {@link org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester}, which sorts matches based on positions stored in term vectors.
* False to use a {@link com.apple.foundationdb.record.lucene.codec.LuceneOptimizedBlendedInfixSuggesterWithoutTermVectors}, which does not store term vectors and sorts matches based on position detection in memory.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Boolean> LUCENE_AUTO_COMPLETE_WITH_TERM_VECTORS = RecordLayerPropertyKey.booleanPropertyKey("com.apple.foundationdb.record.lucene.autoComplete.withTermVectors", true);

/**
