Resolves #1682: Support auto-complete via querying main Lucene directory (#1683)

* Current State is rough

* Resolves #1682: Support auto-complete via querying main Lucene directory

This modifies the auto-complete cursor so that it reads from the main Lucene index and then reconstitutes the original text by reading from the base record data.

Co-authored-by: john_leach <jleach4@gmail.com>
alecgrieser and jleach4 committed May 25, 2022
1 parent d0e98a7 commit 606bfb8
Showing 13 changed files with 835 additions and 1,112 deletions.
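
The commit message above summarizes the new approach: auto-complete is answered by querying the main full-text index, and the suggestion text is reconstituted from the base record instead of being kept in a separate suggester directory. Below is a rough, plain-Lucene sketch of that general technique (not the Record Layer implementation; the class name, method names, field name, and choice of StandardAnalyzer are illustrative assumptions). An auto-complete request is built as a Boolean query in which every completed token must match and the final, partially typed token is treated as a prefix.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public final class AutoCompleteOnMainIndexSketch {

    // Tokenize the partially typed input with the query analyzer.
    static List<String> tokenize(Analyzer analyzer, String field, String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        }
        return tokens;
    }

    // Every completed token must match; the last token (still being typed) is a prefix.
    static Query buildAutoCompleteQuery(List<String> tokens, String field) {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        for (int i = 0; i < tokens.size() - 1; i++) {
            builder.add(new TermQuery(new Term(field, tokens.get(i))), BooleanClause.Occur.MUST);
        }
        builder.add(new PrefixQuery(new Term(field, tokens.get(tokens.size() - 1))), BooleanClause.Occur.MUST);
        return builder.build();
    }

    static void suggest(Directory mainIndex, String field, String partialInput) throws IOException {
        Analyzer queryAnalyzer = new StandardAnalyzer();
        List<String> tokens = tokenize(queryAnalyzer, field, partialInput);
        if (tokens.isEmpty()) {
            return; // nothing typed yet, nothing to suggest
        }
        try (DirectoryReader reader = DirectoryReader.open(mainIndex)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            for (ScoreDoc scoreDoc : searcher.search(buildAutoCompleteQuery(tokens, field), 10).scoreDocs) {
                // In the Record Layer, the matched document's primary key would then be used to load
                // the base record and reconstitute the original text for display.
                System.out.println("matched doc " + scoreDoc.doc + " with score " + scoreDoc.score);
            }
        }
    }
}

The actual cursor additionally honors index options such as the minimum prefix size and the highlighting flag that appear later in this diff.
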
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
@@ -26,7 +26,7 @@ This release also updates downstream dependency versions. Most notably, the prot
* **Bug fix** Fix 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Bug fix** Failed no-ops no longer log at `ERROR` [(Issue #1692)](https://github.com/FoundationDB/fdb-record-layer/issues/1692)
* **Performance** Improvement 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Lucene auto-complete is now handled by running queries on the main index, avoiding the need for a separate directory [(Issue #1682)](https://github.com/FoundationDB/fdb-record-layer/issues/1682)
* **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)

Large diffs are not rendered by default.

@@ -0,0 +1,79 @@
/*
* LuceneCursorContinuation.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2015-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.apple.foundationdb.record.lucene;

import com.apple.foundationdb.record.RecordCursorContinuation;
import com.google.protobuf.ByteString;
import org.apache.lucene.search.ScoreDoc;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
* Continuation from scanning a Lucene index. This wraps the LuceneIndexContinuation protobuf message,
* which contains enough information to use the Lucene
* {@link org.apache.lucene.search.IndexSearcher#searchAfter(ScoreDoc, org.apache.lucene.search.Query, int) searchAfter}
* feature to resume a query.
*/
class LuceneCursorContinuation implements RecordCursorContinuation {
@Nonnull
private final LuceneContinuationProto.LuceneIndexContinuation protoContinuation;

@SuppressWarnings("squid:S3077") // Byte array is immutable once created, so does not need to use atomic array
private volatile byte[] byteContinuation;

private LuceneCursorContinuation(@Nonnull LuceneContinuationProto.LuceneIndexContinuation protoContinuation) {
this.protoContinuation = protoContinuation;
}

@Nullable
@Override
public byte[] toBytes() {
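// Serialize the protobuf continuation at most once, lazily, using double-checked locking on the volatile field.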
if (byteContinuation == null) {
synchronized (this) {
if (byteContinuation == null) {
byteContinuation = toByteString().toByteArray();
}
}
}
return byteContinuation;
}

@Nonnull
@Override
public ByteString toByteString() {
return protoContinuation.toByteString();
}

@Override
public boolean isEnd() {
return false;
}

public static LuceneCursorContinuation fromScoreDoc(ScoreDoc scoreDoc) {
return new LuceneCursorContinuation(LuceneContinuationProto.LuceneIndexContinuation.newBuilder()
.setDoc(scoreDoc.doc)
.setShard(scoreDoc.shardIndex)
.setScore(scoreDoc.score)
.build()
);
}
}
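
For context on how a continuation like the one above can be consumed, here is a hedged sketch: the serialized message is parsed back into a ScoreDoc and passed to IndexSearcher.searchAfter to resume the query where the previous page stopped. The proto accessor names and numeric casts are assumptions mirroring the setDoc/setShard/setScore calls in fromScoreDoc, and searcher, query, and pageSize are placeholders supplied by the caller.

import com.apple.foundationdb.record.lucene.LuceneContinuationProto;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import java.io.IOException;

final class ResumeFromContinuationSketch {

    // Resume a paginated Lucene search from the bytes produced by LuceneCursorContinuation.toBytes().
    static TopDocs resume(IndexSearcher searcher, Query query, byte[] continuationBytes, int pageSize) throws IOException {
        LuceneContinuationProto.LuceneIndexContinuation proto =
                LuceneContinuationProto.LuceneIndexContinuation.parseFrom(continuationBytes);
        // The casts hedge against the exact numeric types declared in the proto definition.
        ScoreDoc after = new ScoreDoc((int) proto.getDoc(), (float) proto.getScore(), (int) proto.getShard());
        return searcher.searchAfter(after, query, pageSize);
    }
}
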
@@ -32,7 +32,6 @@
import com.apple.foundationdb.record.RecordCursor;
import com.apple.foundationdb.record.ScanProperties;
import com.apple.foundationdb.record.TupleRange;
import com.apple.foundationdb.record.logging.KeyValueLogMessage;
import com.apple.foundationdb.record.logging.LogMessageKeys;
import com.apple.foundationdb.record.lucene.directory.FDBDirectoryManager;
import com.apple.foundationdb.record.metadata.IndexAggregateFunction;
@@ -47,10 +46,10 @@
import com.apple.foundationdb.record.provider.foundationdb.IndexScanBounds;
import com.apple.foundationdb.record.provider.foundationdb.indexes.InvalidIndexEntry;
import com.apple.foundationdb.record.provider.foundationdb.indexes.StandardIndexMaintainer;
import com.apple.foundationdb.record.provider.foundationdb.properties.RecordLayerPropertyKey;
import com.apple.foundationdb.record.query.QueryToKeyMatcher;
import com.apple.foundationdb.tuple.Tuple;
import com.google.protobuf.Message;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
@@ -64,7 +63,6 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.slf4j.Logger;
@@ -73,10 +71,8 @@
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -96,7 +92,6 @@ public class LuceneIndexMaintainer extends StandardIndexMaintainer {
private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexMaintainer.class);
private final FDBDirectoryManager directoryManager;
private final AnalyzerChooser indexAnalyzerChooser;
private final AnalyzerChooser autoCompleteIndexAnalyzerChooser;
private final AnalyzerChooser autoCompleteQueryAnalyzerChooser;
protected static final String PRIMARY_KEY_FIELD_NAME = "p"; // TODO: Need to find reserved names..
protected static final String PRIMARY_KEY_SEARCH_NAME = "s"; // TODO: Need to find reserved names..
@@ -109,7 +104,6 @@ public LuceneIndexMaintainer(@Nonnull final IndexMaintainerState state, @Nonnull
this.executor = executor;
this.directoryManager = FDBDirectoryManager.getManager(state);
this.indexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.FULL_TEXT).getLeft();
this.autoCompleteIndexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getLeft();
this.autoCompleteQueryAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getRight();
this.autoCompleteEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_ENABLED, false);
this.highlightForAutoCompleteIfEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_HIGHLIGHT, false);
@@ -152,14 +146,9 @@ public RecordCursor<IndexEntry> scan(@Nonnull final IndexScanBounds scanBounds,
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
LuceneScanAutoComplete scanAutoComplete = (LuceneScanAutoComplete)scanBounds;
try {
return new LuceneAutoCompleteResultCursor(getSuggester(scanAutoComplete.getGroupKey(),
Collections.singletonList(scanAutoComplete.getKeyToComplete()), null), scanAutoComplete.getKeyToComplete(),
executor, scanProperties, state, scanAutoComplete.getGroupKey(), highlightForAutoCompleteIfEnabled);
} catch (IOException ex) {
throw new RecordCoreException("Exception to get suggester for auto-complete search", ex)
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
Analyzer analyzer = autoCompleteQueryAnalyzerChooser.chooseAnalyzer(scanAutoComplete.getKeyToComplete()).getAnalyzer();
return new LuceneAutoCompleteResultCursor(scanAutoComplete.getKeyToComplete(),
executor, scanProperties, analyzer, state, scanAutoComplete.getGroupKey(), highlightForAutoCompleteIfEnabled);
}

if (scanType.equals(LuceneScanTypes.BY_LUCENE_SPELL_CHECK)) {
@@ -174,62 +163,21 @@ public RecordCursor<IndexEntry> scan(@Nonnull final IndexScanBounds scanBounds,
throw new RecordCoreException("unsupported scan type for Lucene index: " + scanType);
}

private boolean addTermToSuggesterIfNeeded(@Nonnull String value, @Nonnull String fieldName, @Nullable AnalyzingInfixSuggester suggester) {
if (suggester == null) {
return false;
}

final byte[] valueBytes = value.getBytes(StandardCharsets.UTF_8);
final RecordLayerPropertyKey<Integer> sizeLimitProp = LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_TEXT_SIZE_UPPER_LIMIT;
final int sizeLimit = Objects.requireNonNullElse(state.context.getPropertyStorage().getPropertyValue(sizeLimitProp), sizeLimitProp.getDefaultValue()).intValue();
// Ignore this text if its size exceeds the limitation
if (valueBytes.length > sizeLimit) {
if (LOG.isTraceEnabled()) {
LOG.trace(KeyValueLogMessage.of("Skip auto-complete indexing due to exceeding size limitation",
LuceneLogMessageKeys.DATA_SIZE, valueBytes.length,
LuceneLogMessageKeys.DATA_VALUE, value.substring(0, Math.min(value.length(), 100)),
LogMessageKeys.FIELD_NAME, fieldName));
}
return false;
}

try {
suggester.add(new BytesRef(valueBytes),
Set.of(new BytesRef(fieldName.getBytes(StandardCharsets.UTF_8))),
state.context.getPropertyStorage().getPropertyValue(LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_DEFAULT_WEIGHT),
new BytesRef(Tuple.from(fieldName).pack()));
if (LOG.isTraceEnabled()) {
LOG.trace(KeyValueLogMessage.of("Added auto-complete suggestion to suggester",
LuceneLogMessageKeys.DATA_SIZE, valueBytes.length,
LuceneLogMessageKeys.DATA_VALUE, value.substring(0, Math.min(value.length(), 100)),
LogMessageKeys.FIELD_NAME, fieldName));
}
return true;
} catch (IOException ex) {
throw new RecordCoreException("Exception to add term into suggester", ex)
.addLogInfo(LogMessageKeys.INDEX_NAME, state.index.getName());
}
}

/**
* Insert a field into the document and add a suggestion into the suggester if needed.
* @return whether a suggestion has been added to the suggester
*/
@SuppressWarnings("java:S3776")
private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final Document document,
@Nullable AnalyzingInfixSuggester suggester) {
private void insertField(LuceneDocumentFromRecord.DocumentField field, final Document document) {
final String fieldName = field.getFieldName();
final Object value = field.getValue();
final Field luceneField;
final Field sortedField;
final StoredField storedField;
boolean suggestionAdded = false;
switch (field.getType()) {
case TEXT:
luceneField = new Field(fieldName, (String) value, getTextFieldType(field));
sortedField = null;
storedField = null;
suggestionAdded = addTermToSuggesterIfNeeded((String) value, fieldName, suggester);
break;
case STRING:
luceneField = new StringField(fieldName, (String)value, field.isStored() ? Field.Store.YES : Field.Store.NO);
@@ -266,38 +214,32 @@ private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final
if (storedField != null) {
document.add(storedField);
}
return suggestionAdded;
}

private void writeDocument(@Nonnull List<LuceneDocumentFromRecord.DocumentField> fields, Tuple groupingKey,
byte[] primaryKey) throws IOException {
final List<String> texts = fields.stream()
.filter(f -> f.getType().equals(LuceneIndexExpressions.DocumentFieldType.TEXT))
.map(f -> (String) f.getValue()).collect(Collectors.toList());
Document document = new Document();
final IndexWriter newWriter = directoryManager.getIndexWriter(groupingKey,
indexAnalyzerChooser.chooseAnalyzer(texts));
BytesRef ref = new BytesRef(primaryKey);
Document document = new Document();
document.add(new StoredField(PRIMARY_KEY_FIELD_NAME, ref));
document.add(new SortedDocValuesField(PRIMARY_KEY_SEARCH_NAME, ref));

Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> indexOptionsToFieldsMap = getIndexOptionsToFieldsMap(fields);
for (Map.Entry<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> entry : indexOptionsToFieldsMap.entrySet()) {
final AnalyzingInfixSuggester suggester = autoCompleteEnabled ? getSuggester(groupingKey, texts, entry.getKey()) : null;
boolean suggestionAdded = false;
for (LuceneDocumentFromRecord.DocumentField field : entry.getValue()) {
suggestionAdded = insertField(field, document, suggester) || suggestionAdded;
}
if (suggestionAdded) {
suggester.refresh();
insertField(field, document);
}
}
newWriter.addDocument(document);
}

@Nonnull
private Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> getIndexOptionsToFieldsMap(@Nonnull List<LuceneDocumentFromRecord.DocumentField> fields) {
final Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> map = new HashMap<>();
final Map<IndexOptions, List<LuceneDocumentFromRecord.DocumentField>> map = new EnumMap<>(IndexOptions.class);
fields.stream().forEach(f -> {
final IndexOptions indexOptions = getIndexOptions((String) Objects.requireNonNullElse(f.getConfig(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS),
LuceneFunctionNames.LuceneFieldIndexOptions.DOCS_AND_FREQS_AND_POSITIONS.name()));
Expand Down Expand Up @@ -364,16 +306,6 @@ public <M extends Message> CompletableFuture<Void> update(@Nullable FDBIndexable
return AsyncUtil.DONE;
}

/**
* Get the {@link AnalyzingInfixSuggester} for indexing or query, from the session of the context if there exists a corresponding one, or by creating a new one.
* @param indexOptions the {@link IndexOptions} for suggester's {@link FieldType}. This only matters for when the suggester is for indexing.
* The one for query can just use an arbitrary one, so just pass in a NULL when getting a suggester for query, so the existing one from session of context can be reused.
*/
private AnalyzingInfixSuggester getSuggester(@Nullable Tuple groupingKey, @Nonnull List<String> texts, @Nullable IndexOptions indexOptions) throws IOException {
return directoryManager.getAutocompleteSuggester(groupingKey, autoCompleteIndexAnalyzerChooser.chooseAnalyzer(texts),
autoCompleteQueryAnalyzerChooser.chooseAnalyzer(texts), highlightForAutoCompleteIfEnabled, indexOptions);
}

private FieldType getTextFieldType(LuceneDocumentFromRecord.DocumentField field) {
FieldType ft = new FieldType();

@@ -33,19 +33,25 @@ public class LuceneIndexOptions {
public static final String AUTO_COMPLETE_ENABLED = "autoCompleteEnabled";
/**
* The type of auto-complete blender used to transform the weight after search, taking into account the position of the searched term in the indexed text.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_TYPE = "autoCompleteBlenderType";
/**
* The factor by which to multiply the number of searched elements for the auto-complete blender.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_NUM_FACTOR = "autoCompleteBlenderNumFactor";
/**
* The minimum number of leading characters before a prefix query is used for auto-complete.
*/
public static final String AUTO_COMPLETE_MIN_PREFIX_SIZE = "autoCompleteMinPrefixSize";
/**
* The exponent to use for auto complete when the blender type is POSITION_EXPONENTIAL_RECIPROCAL.
* @deprecated this option is ignored as the blender suggester is no longer used by auto-complete queries
*/
@Deprecated
public static final String AUTO_COMPLETE_BLENDER_EXPONENT = "autoCompleteBlenderExponent";
/**
* Whether to highlight the suggest query in suggestions.
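
For reference, these option keys are supplied as plain strings in the index's option map when the Lucene index is defined. A minimal sketch, assuming LuceneIndexOptions lives in the same com.apple.foundationdb.record.lucene package as the other classes in this diff and using illustrative values:

import com.apple.foundationdb.record.lucene.LuceneIndexOptions;

import java.util.Map;

final class AutoCompleteOptionsSketch {

    // Illustrative only: enable auto-complete and require at least three typed characters
    // before a prefix query is used; index options are plain string-to-string pairs.
    static final Map<String, String> AUTO_COMPLETE_OPTIONS = Map.of(
            LuceneIndexOptions.AUTO_COMPLETE_ENABLED, "true",
            LuceneIndexOptions.AUTO_COMPLETE_MIN_PREFIX_SIZE, "3");
}
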
@@ -90,19 +90,23 @@ public final class LuceneRecordContextProperties {

/**
* Maximum segment size to produce during normal merging for auto-complete search with Lucene.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Double> LUCENE_AUTO_COMPLETE_MERGE_MAX_SIZE = RecordLayerPropertyKey.doublePropertyKey("com.apple.foundationdb.record.lucene.autoCompleteMergeMaxSize", 5.0);

/**
* Maximum number of segments to be merged at a time for auto-complete search with Lucene, during forceMerge for forceMergeDeletes.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Integer> LUCENE_AUTO_COMPLETE_MERGE_MAX_NUMBER = RecordLayerPropertyKey.integerPropertyKey("com.apple.foundationdb.record.lucene.autoCompleteMergeMaxNum", 2);

/**
* This controls the suggester's base class to use for Lucene auto-complete search.
* True to use a {@link org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester}, which sorts matches based on positions stored in term vectors.
* False to use a {@link com.apple.foundationdb.record.lucene.codec.LuceneOptimizedBlendedInfixSuggesterWithoutTermVectors}, which does not store term vectors and sorts matches based on position detection in memory.
* @deprecated No longer in use as auto-complete no longer has its own directory
*/
@Deprecated
public static final RecordLayerPropertyKey<Boolean> LUCENE_AUTO_COMPLETE_WITH_TERM_VECTORS = RecordLayerPropertyKey.booleanPropertyKey("com.apple.foundationdb.record.lucene.autoComplete.withTermVectors", true);

/**
