From 50a4cfc085bc01dc2d334c047d34d8e07a7d46b0 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Wed, 18 May 2022 17:57:46 -0700 Subject: [PATCH] Resolves #1682: Support auto-complete via querying main Lucene directory This modifies the auto-complete cursor so that it reads from the main Lucene index and then reconstitutes the original text by reading from the base record data. --- docs/ReleaseNotes.md | 2 +- .../LuceneAutoCompleteResultCursor.java | 415 ++++++++---------- .../lucene/LuceneCursorContinuation.java | 78 ++++ .../record/lucene/LuceneIndexMaintainer.java | 11 +- .../record/lucene/LuceneRecordCursor.java | 11 +- .../record/lucene/LuceneIndexTest.java | 45 +- 6 files changed, 293 insertions(+), 269 deletions(-) create mode 100644 fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneCursorContinuation.java diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index f9db896276..5b6064bee3 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -26,7 +26,7 @@ This release also updates downstream dependency versions. Most notably, the prot * **Bug fix** Fix 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) * **Bug fix** Fix 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) * **Performance** Improvement 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) -* **Performance** Improvement 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) +* **Performance** Lucene auto-complete is now handled by running queries on the main index to allow it to avoid needing a separate directory [(Issue #1682)](https://github.com/FoundationDB/fdb-record-layer/issues/1682) * **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) * **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) * **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN) diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java index 3694f46fd7..9a81c59d54 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java @@ -20,7 +20,6 @@ package com.apple.foundationdb.record.lucene; -import com.apple.foundationdb.record.ByteArrayContinuation; import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.PipelineOperation; import com.apple.foundationdb.record.RecordCoreArgumentException; @@ -29,52 +28,36 @@ import com.apple.foundationdb.record.RecordCursorContinuation; import com.apple.foundationdb.record.RecordCursorResult; import com.apple.foundationdb.record.RecordCursorVisitor; -import com.apple.foundationdb.record.RecordMetaDataProto; import com.apple.foundationdb.record.ScanProperties; import com.apple.foundationdb.record.cursors.BaseCursor; import com.apple.foundationdb.record.logging.LogMessageKeys; import com.apple.foundationdb.record.lucene.directory.FDBDirectoryManager; -import com.apple.foundationdb.record.metadata.Key; -import com.apple.foundationdb.record.metadata.expressions.KeyExpression; -import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; import com.apple.foundationdb.record.provider.foundationdb.FDBStoreTimer; import com.apple.foundationdb.record.provider.foundationdb.IndexMaintainerState; -import com.apple.foundationdb.record.query.plan.plans.QueryResult; import com.apple.foundationdb.tuple.Tuple; -import com.google.protobuf.ByteString; +import com.apple.foundationdb.tuple.TupleHelpers; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.IndexNotFoundException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.MultiDocValues; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.TopFieldCollector; -import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.search.spans.FieldMaskingSpanQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; import org.apache.lucene.util.BytesRef; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -83,20 +66,16 @@ import javax.annotation.Nullable; import java.io.IOException; import java.io.StringReader; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; +import java.util.Collection; import java.util.HashSet; -import java.util.Iterator; import java.util.List; -import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; -import java.util.function.Supplier; -import java.util.stream.Collectors; +import java.util.function.Function; /** * This class is a Record Cursor implementation for Lucene auto complete suggestion lookup. @@ -113,20 +92,19 @@ public class LuceneAutoCompleteResultCursor implements BaseCursor { private final String query; @Nullable private final FDBStoreTimer timer; - private int limit; + private final int limit; + private final int skip; @Nullable - private RecordCursor> lookupResults = null; - private int currentPosition; + private RecordCursor lookupResults = null; @Nullable private final Tuple groupingKey; private final boolean highlight; private final Analyzer queryAnalyzer; - private final List fieldNames; public LuceneAutoCompleteResultCursor(@Nonnull String query, @Nonnull Executor executor, @Nonnull ScanProperties scanProperties, @Nonnull Analyzer queryAnalyzer, @Nonnull IndexMaintainerState state, - @Nullable Tuple groupingKey, @Nonnull List fieldNames, boolean highlight) { + @Nullable Tuple groupingKey, boolean highlight) { if (query.isEmpty()) { throw new RecordCoreArgumentException("Invalid query for auto-complete search") .addLogInfo(LogMessageKeys.QUERY, query) @@ -137,49 +115,37 @@ public LuceneAutoCompleteResultCursor(@Nonnull String query, this.executor = executor; this.limit = Math.min(scanProperties.getExecuteProperties().getReturnedRowLimitOrMax(), state.context.getPropertyStorage().getPropertyValue(LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_SEARCH_LIMITATION)); + this.skip = scanProperties.getExecuteProperties().getSkip(); this.timer = state.context.getTimer(); - this.currentPosition = 0; - if (scanProperties.getExecuteProperties().getSkip() > 0) { - this.currentPosition += scanProperties.getExecuteProperties().getSkip(); - } this.highlight = highlight; this.state = state; this.groupingKey = groupingKey; this.queryAnalyzer = queryAnalyzer; - this.fieldNames = fieldNames; } private synchronized IndexReader getIndexReader() throws IOException { return FDBDirectoryManager.getManager(state).getIndexReader(groupingKey); } - @SuppressWarnings("cast") @Nonnull @Override public CompletableFuture> onNext() { - return CompletableFuture.supplyAsync(() -> { - if (lookupResults == null) { + if (lookupResults == null) { + return CompletableFuture.>>supplyAsync(() -> { try { performLookup(); + } catch (IndexNotFoundException indexNotFoundException) { + // Trying to open an empty directory results in an IndexNotFoundException, + // but this should be interpreted as there not being any data to read + return CompletableFuture.completedFuture(RecordCursorResult.exhausted()); } catch (IOException ioException) { throw new RecordCoreException("Exception to lookup the auto complete suggestions", ioException) .addLogInfo(LogMessageKeys.QUERY, query); } - } - RecordCursorResult> next = lookupResults.getNext(); - if (next.hasNext()) { - return next.get(); - } - return RecordCursorResult.exhausted(); - }, executor); - } - - @Nonnull - private static RecordCursorContinuation continuationHelper(String key, long value, byte[] payload) { - LuceneContinuationProto.LuceneAutoCompleteIndexContinuation.Builder continuationBuilder = LuceneContinuationProto.LuceneAutoCompleteIndexContinuation.newBuilder().setKey(key); - continuationBuilder.setValue(value); - continuationBuilder.setPayload(ByteString.copyFrom(payload)); - return ByteArrayContinuation.fromNullable(continuationBuilder.build().toByteArray()); + return lookupResults.onNext(); + }).thenCompose(Function.identity()); + } + return lookupResults.onNext(); } @Override @@ -205,33 +171,30 @@ private void performLookup() throws IOException { } long startTime = System.nanoTime(); - IndexReader indexReader = getIndexReader(); - IndexSearcher searcher = new IndexSearcher(indexReader, executor); - - lookupResults = lookup(query, null, limit, true, highlight); + lookupResults = lookup().skip(skip); if (timer != null) { timer.recordSinceNanoTime(LuceneEvents.Events.LUCENE_AUTO_COMPLETE_SUGGESTIONS_SCAN, startTime); - timer.increment(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS, 6); // TODO JL } } - /** Override this method to customize the Object - * representing a single highlighted suggestions; the - * result is set on each {@link - * org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */ - protected String highlight(String text, Set matchedTokens, String prefixToken) throws IOException { + @Nullable + private String searchAllMaybeHighlight(String text, Set matchedTokens, @Nullable String prefixToken, boolean highlight) { try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); ts.reset(); - StringBuilder sb = new StringBuilder(); + StringBuilder sb = highlight ? new StringBuilder() : null; int upto = 0; + Set matchedInText = new HashSet<>(); + boolean matchedPrefix = false; while (ts.incrementToken()) { String token = termAtt.toString(); int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); if (upto < startOffset) { - addNonMatch(sb, text.substring(upto, startOffset)); + if (highlight) { + addNonMatch(sb, text.substring(upto, startOffset)); + } upto = startOffset; } else if (upto > startOffset) { continue; @@ -239,19 +202,39 @@ protected String highlight(String text, Set matchedTokens, String prefix if (matchedTokens.contains(token)) { // Token matches. - addWholeMatch(sb, text.substring(startOffset, endOffset), token); + if (highlight) { + addWholeMatch(sb, text.substring(startOffset, endOffset), token); + } upto = endOffset; + matchedInText.add(token); } else if (prefixToken != null && token.startsWith(prefixToken)) { - addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken); + if (highlight) { + addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken); + } upto = endOffset; + matchedPrefix = true; } } ts.end(); - int endOffset = offsetAtt.endOffset(); - if (upto < endOffset) { - addNonMatch(sb, text.substring(upto)); + + if ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size())) { + // Query text not actually found in document text. Return null + return null; + } + + // Text was found. Return text (highlighted or not) + if (highlight) { + int endOffset = offsetAtt.endOffset(); + if (upto < endOffset) { + addNonMatch(sb, text.substring(upto)); + } + return sb.toString(); + } else { + return text; } - return sb.toString(); + + } catch (IOException e) { + return null; } } @@ -299,81 +282,66 @@ protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, sb.append(surface.substring(prefixToken.length())); } - public RecordCursor> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { + public RecordCursor lookup() throws IOException { + // Determine the tokens from the query key + final boolean phraseQueryNeeded = query.startsWith("\"") && query.endsWith("\""); + final String searchKey = phraseQueryNeeded ? query.substring(1, query.length() - 1) : query; + List tokens = new ArrayList<>(); + final String prefixToken = getQueryTokens(searchKey, tokens); - final BooleanClause.Occur occur; - if (allTermsRequired) { - occur = BooleanClause.Occur.MUST; - } else { - occur = BooleanClause.Occur.SHOULD; - } - - BooleanQuery.Builder query = new BooleanQuery.Builder(); - Set matchedTokens = new HashSet<>(); + IndexReader indexReader = getIndexReader(); + Set fieldNames = new HashSet<>(); + indexReader.leaves().forEach(leaf -> leaf.reader().getFieldInfos().forEach(fieldInfo -> fieldNames.add(fieldInfo.name))); + fieldNames.remove(LuceneIndexMaintainer.PRIMARY_KEY_FIELD_NAME); + fieldNames.remove(LuceneIndexMaintainer.PRIMARY_KEY_SEARCH_NAME); - final boolean phraseQueryNeeded = key.toString().startsWith("\"") && key.toString().endsWith("\""); - final String searchKey = phraseQueryNeeded ? key.toString().substring(1, key.toString().length() - 1) : key.toString(); + final Set tokenSet = new HashSet<>(tokens); + Query finalQuery = phraseQueryNeeded + ? buildQueryForPhraseMatching(fieldNames, tokens, prefixToken) + : buildQueryForTermsMatching(fieldNames, tokenSet, prefixToken); - String prefixToken = phraseQueryNeeded - ? buildQueryForPhraseMatching(fieldNames, query, matchedTokens, searchKey, occur) - : buildQueryForTermsMatching(fieldNames, query, matchedTokens, searchKey, occur); + IndexSearcher searcher = new IndexSearcher(indexReader, executor); + TopDocs topDocs = searcher.search(finalQuery, limit); + if (timer != null) { + timer.increment(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS, topDocs.scoreDocs.length); + } + return createResults(searcher, topDocs, tokenSet, prefixToken); + } - if (contextQuery != null) { - boolean allMustNot = true; - for (BooleanClause clause : contextQuery.clauses()) { - if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) { - allMustNot = false; - break; - } + @Nullable + private Query buildQueryForPhraseMatching(@Nonnull Collection fieldNames, + @Nonnull List matchedTokens, + @Nullable String prefixToken) { + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + for (String field : fieldNames) { + PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder(); + for (String token : matchedTokens) { + phraseQueryBuilder.add(new Term(field, token)); } - - if (allMustNot) { - // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query) - for (BooleanClause clause : contextQuery.clauses()) { - query.add(clause); - } - } else if (allTermsRequired == false) { - // We must carefully upgrade the query clauses to MUST: - BooleanQuery.Builder newQuery = new BooleanQuery.Builder(); - newQuery.add(query.build(), BooleanClause.Occur.MUST); - newQuery.add(contextQuery, BooleanClause.Occur.MUST); - query = newQuery; + Query fieldQuery; + if (prefixToken == null) { + fieldQuery = phraseQueryBuilder.build(); } else { - // Add contextQuery as sub-query - query.add(contextQuery, BooleanClause.Occur.MUST); + fieldQuery = getPhrasePrefixQuery(field, phraseQueryBuilder.build(), prefixToken); } + queryBuilder.add(fieldQuery, BooleanClause.Occur.SHOULD); } - - Query finalQuery = finishQuery(query, allTermsRequired); - - try { - IndexReader indexReader = getIndexReader(); - IndexSearcher searcher = new IndexSearcher(indexReader, executor); - TopDocs topDocs = searcher.search(finalQuery, limit); - return createResults(searcher, topDocs, limit, key, doHighlight, matchedTokens, prefixToken); - } finally { - // searcher not closed todo - } + queryBuilder.setMinimumNumberShouldMatch(1); + return queryBuilder.build(); } - @Nullable - private String buildQueryForPhraseMatching(@Nonnull List fieldNames, @Nonnull BooleanQuery.Builder query, @Nonnull Set matchedTokens, - @Nonnull String searchKey, @Nonnull BooleanClause.Occur occur) throws IOException { + private String getQueryTokens(String searchKey, @Nonnull List tokens) throws IOException { String prefixToken = null; try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(searchKey))) { - //long t0 = System.currentTimeMillis(); ts.reset(); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); String lastToken = null; - PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder(); int maxEndOffset = -1; while (ts.incrementToken()) { if (lastToken != null) { - matchedTokens.add(lastToken); - final String probeToken = lastToken; - fieldNames.forEach(f -> phraseQueryBuilder.add(new Term(f, probeToken))); + tokens.add(lastToken); } lastToken = termAtt.toString(); if (lastToken != null) { @@ -389,18 +357,13 @@ private String buildQueryForPhraseMatching(@Nonnull List fieldNames, @No // string (e.g. whitespace), so that if query does // not end with a space we show prefix matches for // that token: - final String probeToken = lastToken; - fieldNames.forEach(f -> query.add(getPhrasePrefixQuery(f, phraseQueryBuilder.build(), probeToken), occur)); prefixToken = lastToken; } else { // Use TermQuery for an exact match if there were // trailing discarded chars (e.g. whitespace), so // that if query ends with a space we only show // exact matches for that term: - matchedTokens.add(lastToken); - final String probeToken = lastToken; - fieldNames.forEach(f -> phraseQueryBuilder.add(new Term(f, probeToken))); - query.add(phraseQueryBuilder.build(), occur); + tokens.add(lastToken); } } } @@ -408,55 +371,26 @@ private String buildQueryForPhraseMatching(@Nonnull List fieldNames, @No } @Nullable - private String buildQueryForTermsMatching(@Nonnull List fieldNames, @Nonnull BooleanQuery.Builder query, @Nonnull Set matchedTokens, - @Nonnull String searchKey, @Nonnull BooleanClause.Occur occur) throws IOException { - String prefixToken = null; - try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(searchKey))) { - //long t0 = System.currentTimeMillis(); - ts.reset(); - final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); - String lastToken = null; - int maxEndOffset = -1; - while (ts.incrementToken()) { - if (lastToken != null) { - matchedTokens.add(lastToken); - final String probeToken = lastToken; - fieldNames.forEach(f -> query.add(new TermQuery(new Term(f, probeToken)), occur)); - } - lastToken = termAtt.toString(); - if (lastToken != null) { - maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset()); - } + private Query buildQueryForTermsMatching(@Nonnull Collection fieldNames, + @Nonnull Set tokenSet, + @Nullable String prefixToken) { + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + + // Construct a query that is essentially: + // - in any field, + // - all of the tokens must occur (with the last one as a prefix) + for (String field : fieldNames) { + BooleanQuery.Builder fieldQuery = new BooleanQuery.Builder(); + for (String token : tokenSet) { + fieldQuery.add(new TermQuery(new Term(field, token)), BooleanClause.Occur.MUST); } - ts.end(); - - if (lastToken != null) { - List lastQuery; - if (maxEndOffset == offsetAtt.endOffset()) { - // Use PrefixQuery (or the ngram equivalent) when - // there was no trailing discarded chars in the - // string (e.g. whitespace), so that if query does - // not end with a space we show prefix matches for - // that token: - lastQuery = getLastTokenQuery(fieldNames, lastToken); - prefixToken = lastToken; - } else { - // Use TermQuery for an exact match if there were - // trailing discarded chars (e.g. whitespace), so - // that if query ends with a space we only show - // exact matches for that term: - final String probeToken = lastToken; - matchedTokens.add(lastToken); - lastQuery = fieldNames.stream().map(f -> new TermQuery(new Term(f, probeToken))).collect(Collectors.toList()); - } - - if (lastQuery != null) { - lastQuery.forEach(q -> query.add(q, occur)); - } + if (prefixToken != null) { + fieldQuery.add(new PrefixQuery(new Term(field, prefixToken)), BooleanClause.Occur.MUST); } + queryBuilder.add(fieldQuery.build(), BooleanClause.Occur.SHOULD); } - return prefixToken; + queryBuilder.setMinimumNumberShouldMatch(1); + return queryBuilder.build(); } private Query getPhrasePrefixQuery(@Nonnull String fieldName, @Nonnull PhraseQuery phraseQuery, @Nonnull String lastToken) { @@ -472,57 +406,88 @@ private Query getPhrasePrefixQuery(@Nonnull String fieldName, @Nonnull PhraseQue return spanQuery.build(); } - /** Subclass can override this to tweak the Query before - * searching. */ - protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) { - return in.build(); - } - - /** This is called if the last token isn't ended - * (e.g. user did not type a space after it). Return an - * appropriate Query clause to add to the BooleanQuery. */ - protected List getLastTokenQuery(List fieldNames, final String token) throws IOException { - return fieldNames.stream().map(f -> new PrefixQuery(new Term(f, token))).collect(Collectors.toList()); - } - - protected RecordCursor> createResults(IndexSearcher searcher, TopDocs topDocs, int num, - CharSequence charSequence, - boolean doHighlight, Set matchedTokens, String prefixToken) + protected RecordCursor createResults(IndexSearcher searcher, + TopDocs topDocs, + Set matchedTokens, + String prefixToken) throws IOException { - return RecordCursor.fromIterator(Arrays.stream(topDocs.scoreDocs).iterator()) - .mapPipelined(scoreDoc -> { - try { - IndexableField primaryKey = searcher.doc(scoreDoc.doc).getField(LuceneIndexMaintainer.PRIMARY_KEY_FIELD_NAME); - BytesRef pk = primaryKey.binaryValue(); - return state.store.loadRecordAsync(Tuple.fromBytes(pk.bytes)); - } catch (IOException e) { - throw new RuntimeException(e); - } - }, state.store.getPipelineSize(PipelineOperation.KEY_TO_RECORD)) + return RecordCursor.fromIterator(executor, Arrays.stream(topDocs.scoreDocs).iterator()) + .mapPipelined(scoreDoc -> constructIndexEntryFromScoreDoc(searcher, scoreDoc, matchedTokens, prefixToken), state.store.getPipelineSize(PipelineOperation.KEY_TO_RECORD)) .filter(Objects::nonNull) - .map(state.store::queriedRecord) - .map(result -> { - - final KeyExpression rootExpression = state.index.getRootExpression(); - final List indexKeys = rootExpression.evaluate(result); - String value = (String) indexKeys.get(0).values().get(0); // TODO - try { - if (highlight) { - value = highlight(value, matchedTokens, prefixToken); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - Tuple key = result.getPrimaryKey().add("text").add(value); // TODO - if (groupingKey != null) { - key = groupingKey.addAll(key); + .mapResult(wrappingResult -> { + if (wrappingResult.hasNext()) { + return wrappingResult.get(); + } else { + // TODO: Handle the underlying cursor terminating early + // This will result in the query ending whenever the underlying cursor terminates, + // which mainly is a problem in that it doesn't return the right NoNextReason if we + // hit some limit. This is mostly not a problem until this cursor can accept + // continuations. + return RecordCursorResult.exhausted(); } - IndexEntry indexEntry = new IndexEntry(state.index, key, Tuple.from(100)); // TODO - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Suggestion read as an index entry={}", indexEntry); - } - return RecordCursorResult.withNextValue(indexEntry, continuationHelper("", 100, "sdfds".getBytes())); }); } + private CompletableFuture> constructIndexEntryFromScoreDoc(IndexSearcher searcher, ScoreDoc scoreDoc, Set queryTokens, @Nullable String prefixToken) { + try { + IndexableField primaryKey = searcher.doc(scoreDoc.doc).getField(LuceneIndexMaintainer.PRIMARY_KEY_FIELD_NAME); + BytesRef pk = primaryKey.binaryValue(); + return state.store.loadRecordAsync(Tuple.fromBytes(pk.bytes)).thenApply(record -> { + if (record == null) { + // No document found. Return original record. + return null; + } + // Extract the indexed fields from the document again + final List documentFields = LuceneDocumentFromRecord.getRecordFields(state.index.getRootExpression(), record) + .get(groupingKey == null ? TupleHelpers.EMPTY : groupingKey); + + // Search each field to find the first match. + final int maxTextLength = Objects.requireNonNull(state.context.getPropertyStorage() + .getPropertyValue(LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_TEXT_SIZE_UPPER_LIMIT)); + @Nullable LuceneDocumentFromRecord.DocumentField matchedField = null; + @Nullable String matchedText = null; + for (LuceneDocumentFromRecord.DocumentField documentField : documentFields) { + Object fieldValue = documentField.getValue(); + if (fieldValue instanceof String) { + String text = (String) fieldValue; + if (text.length() > maxTextLength) { + // Apply the text length filter before searching through the text for the + // matched terms + continue; + } + String match = searchAllMaybeHighlight(text, queryTokens, prefixToken, highlight); + if (match != null) { + matchedField = documentField; + matchedText = match; + break; + } + } + } + + if (matchedField == null) { + return null; + } + Tuple key = Tuple.from(matchedField.getFieldName(), matchedText); + if (groupingKey != null) { + key = groupingKey.addAll(key); + } + // TODO: Add the primary key to the index entry + // Not having the primary key is fine for auto-complete queries that just want the + // text, but queries wanting to do something with both the auto-completed text and the + // original record need to do something else + IndexEntry indexEntry = new IndexEntry(state.index, key, Tuple.from(scoreDoc.score)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Suggestion read as an index entry={}", indexEntry); + } + + // TODO: this cursor does not support real continuations (yet) + // However, if we want to use the "searchAfter" to resume this scan, this is the + // continuation we'd need for it + RecordCursorContinuation continuation = LuceneCursorContinuation.fromScoreDoc(scoreDoc); + return RecordCursorResult.withNextValue(indexEntry, continuation); + }); + } catch (IOException e) { + return CompletableFuture.failedFuture(new RecordCoreException("unable to read document from Lucene", e)); + } + } } diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneCursorContinuation.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneCursorContinuation.java new file mode 100644 index 0000000000..5ff5c3356b --- /dev/null +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneCursorContinuation.java @@ -0,0 +1,78 @@ +/* + * LuceneCursorContinuation.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2015-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb.record.lucene; + +import com.apple.foundationdb.record.RecordCursorContinuation; +import com.google.protobuf.ByteString; +import org.apache.lucene.search.ScoreDoc; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +/** + * Continuation from scanning a Lucene index. This wraps the LuceneIndexContinuation protobuf message, + * which contains enough information to use the Lucene + * {@link org.apache.lucene.search.IndexSearcher#searchAfter(ScoreDoc, org.apache.lucene.search.Query, int) searchAfter} + * feature to resume a query. + */ +class LuceneCursorContinuation implements RecordCursorContinuation { + @Nonnull + private final LuceneContinuationProto.LuceneIndexContinuation protoContinuation; + + private volatile byte[] byteContinuation; + + private LuceneCursorContinuation(@Nonnull LuceneContinuationProto.LuceneIndexContinuation protoContinuation) { + this.protoContinuation = protoContinuation; + } + + @Nullable + @Override + public byte[] toBytes() { + if (byteContinuation == null) { + synchronized (this) { + if (byteContinuation == null) { + byteContinuation = toByteString().toByteArray(); + } + } + } + return byteContinuation; + } + + @Nonnull + @Override + public ByteString toByteString() { + return protoContinuation.toByteString(); + } + + @Override + public boolean isEnd() { + return false; + } + + public static LuceneCursorContinuation fromScoreDoc(ScoreDoc scoreDoc) { + return new LuceneCursorContinuation(LuceneContinuationProto.LuceneIndexContinuation.newBuilder() + .setDoc(scoreDoc.doc) + .setShard(scoreDoc.shardIndex) + .setScore(scoreDoc.score) + .build() + ); + } +} diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexMaintainer.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexMaintainer.java index 5cf59f31ca..70a72a89c4 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexMaintainer.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexMaintainer.java @@ -64,7 +64,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Query; -import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.slf4j.Logger; @@ -74,7 +73,6 @@ import javax.annotation.Nullable; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -95,7 +93,6 @@ public class LuceneIndexMaintainer extends StandardIndexMaintainer { private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexMaintainer.class); private final FDBDirectoryManager directoryManager; private final AnalyzerChooser indexAnalyzerChooser; - private final AnalyzerChooser autoCompleteIndexAnalyzerChooser; private final AnalyzerChooser autoCompleteQueryAnalyzerChooser; protected static final String PRIMARY_KEY_FIELD_NAME = "p"; // TODO: Need to find reserved names.. protected static final String PRIMARY_KEY_SEARCH_NAME = "s"; // TODO: Need to find reserved names.. @@ -108,7 +105,6 @@ public LuceneIndexMaintainer(@Nonnull final IndexMaintainerState state, @Nonnull this.executor = executor; this.directoryManager = FDBDirectoryManager.getManager(state); this.indexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.FULL_TEXT).getLeft(); - this.autoCompleteIndexAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getLeft(); this.autoCompleteQueryAnalyzerChooser = LuceneAnalyzerRegistryImpl.instance().getLuceneAnalyzerChooserPair(state.index, LuceneAnalyzerType.AUTO_COMPLETE).getRight(); this.autoCompleteEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_ENABLED, false); this.highlightForAutoCompleteIfEnabled = state.index.getBooleanOption(LuceneIndexOptions.AUTO_COMPLETE_HIGHLIGHT, false); @@ -152,7 +148,7 @@ public RecordCursor scan(@Nonnull final IndexScanBounds scanBounds, } LuceneScanAutoComplete scanAutoComplete = (LuceneScanAutoComplete)scanBounds; return new LuceneAutoCompleteResultCursor(scanAutoComplete.getKeyToComplete(), - executor, scanProperties, getAutocompleteQueryAnalyzer(List.of(scanAutoComplete.getKeyToComplete())), state, scanAutoComplete.getGroupKey(), List.of("text"), highlightForAutoCompleteIfEnabled); + executor, scanProperties, getAutocompleteQueryAnalyzer(List.of(scanAutoComplete.getKeyToComplete())), state, scanAutoComplete.getGroupKey(), highlightForAutoCompleteIfEnabled); } if (scanType.equals(LuceneScanTypes.BY_LUCENE_SPELL_CHECK)) { @@ -169,16 +165,14 @@ public RecordCursor scan(@Nonnull final IndexScanBounds scanBounds, /** * Insert a field into the document and add a suggestion into the suggester if needed. - * @return whether a suggestion has been added to the suggester */ @SuppressWarnings("java:S3776") - private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final Document document) { + private void insertField(LuceneDocumentFromRecord.DocumentField field, final Document document) { final String fieldName = field.getFieldName(); final Object value = field.getValue(); final Field luceneField; final Field sortedField; final StoredField storedField; - boolean suggestionAdded = false; switch (field.getType()) { case TEXT: luceneField = new Field(fieldName, (String) value, getTextFieldType(field)); @@ -220,7 +214,6 @@ private boolean insertField(LuceneDocumentFromRecord.DocumentField field, final if (storedField != null) { document.add(storedField); } - return suggestionAdded; } private void writeDocument(@Nonnull List fields, Tuple groupingKey, diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneRecordCursor.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneRecordCursor.java index 210e8bba0f..46c0123997 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneRecordCursor.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneRecordCursor.java @@ -21,10 +21,10 @@ package com.apple.foundationdb.record.lucene; import com.apple.foundationdb.annotation.API; -import com.apple.foundationdb.record.ByteArrayContinuation; import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.RecordCoreException; import com.apple.foundationdb.record.RecordCursorContinuation; +import com.apple.foundationdb.record.RecordCursorEndContinuation; import com.apple.foundationdb.record.RecordCursorResult; import com.apple.foundationdb.record.RecordCursorVisitor; import com.apple.foundationdb.record.ScanProperties; @@ -258,14 +258,9 @@ public CompletableFuture> onNext() { @Nonnull private RecordCursorContinuation continuationHelper() { if (currentPosition >= topDocs.scoreDocs.length && limitRemaining > 0) { - return ByteArrayContinuation.fromNullable(null); + return RecordCursorEndContinuation.END; } else { - LuceneContinuationProto.LuceneIndexContinuation continuation = LuceneContinuationProto.LuceneIndexContinuation.newBuilder() - .setDoc(searchAfter.doc) - .setScore(searchAfter.score) - .setShard(searchAfter.shardIndex) - .build(); - return ByteArrayContinuation.fromNullable(continuation.toByteArray()); + return LuceneCursorContinuation.fromScoreDoc(searchAfter); } } diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java index 87074049da..89b1be7650 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java @@ -80,6 +80,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; import org.apache.lucene.util.BytesRef; +import org.hamcrest.Matchers; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; @@ -90,7 +91,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.SequenceInputStream; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -117,6 +117,7 @@ import static com.apple.foundationdb.record.query.plan.match.PlanMatchers.indexScan; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.equalTo; @@ -145,7 +146,8 @@ public class LuceneIndexTest extends FDBRecordStoreTestBase { private static final Index SIMPLE_TEXT_WITH_AUTO_COMPLETE_NO_FREQS_POSITIONS = new Index("Simple_with_auto_complete", function(LuceneFunctionNames.LUCENE_TEXT, concat(field("text"), - function(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS, value(LuceneFunctionNames.LuceneFieldIndexOptions.DOCS.name())))), + function(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS, value(LuceneFunctionNames.LuceneFieldIndexOptions.DOCS.name())), + function(LuceneFunctionNames.LUCENE_FULL_TEXT_FIELD_INDEX_OPTIONS, value(LuceneFunctionNames.LuceneFieldIndexOptions.DOCS.name())))), LuceneIndexTypes.LUCENE, ImmutableMap.of(LuceneIndexOptions.AUTO_COMPLETE_ENABLED, "true", LuceneIndexOptions.AUTO_COMPLETE_MIN_PREFIX_SIZE, "3")); @@ -1009,7 +1011,7 @@ void searchForAutoCompleteWithLoadingNoRecords(boolean withTermVectors) throws E null, ScanProperties.FORWARD_SCAN).asList().get(); assertTrue(results.isEmpty()); - assertEquals(0, context.getTimer().getCounter(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS).getCount()); + assertEquals(0, context.getTimer().getCount(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS)); } } @@ -1052,22 +1054,11 @@ void searchForAutoCompleteCrossingMultipleFields(boolean withTermVectors) throws // Assert the suggestions' keys List suggestions = results.stream().map(i -> (String) i.getKey().get(i.getKeySize() - 1)).collect(Collectors.toList()); - assertEquals(ImmutableList.of("good evening", "Good night", "Good morning", "Good afternoon", "I'm good", "That's really good!"), suggestions); + assertThat(suggestions, containsInAnyOrder("good evening", "Good night", "Good morning", "Good afternoon", "I'm good", "That's really good!")); // Assert the corresponding field for the suggestions List fields = results.stream().map(i -> (String) i.getKey().get(i.getKeySize() - 2)).collect(Collectors.toList()); assertEquals(ImmutableList.of("text", "text", "text", "text", "text2", "text2"), fields); - - // Assert the suggestions are sorted according to their values, which are determined by the position of the term into the indexed text - List values = results.stream().map(i -> (Long) i.getValue().get(0)).collect(Collectors.toList()); - List valuesSorted = new ArrayList<>(values); - Collections.sort(valuesSorted, Collections.reverseOrder()); - assertEquals(valuesSorted, values); - assertEquals(values.get(0), values.get(1)); - assertEquals(values.get(1), values.get(2)); - assertEquals(values.get(2), values.get(3)); - assertTrue(values.get(3) > values.get(4)); - assertTrue(values.get(4) > values.get(5)); } } @@ -1153,7 +1144,6 @@ void testAutoCompleteSearchForPhrase() throws Exception { "states united as a country", "states have been united as a country", "all the states united as a country", - "all the states have been united as a country", "welcome to the united states of america", "The countries are united kingdom, france, the states")); @@ -1211,7 +1201,6 @@ void testAutoCompleteSearchWithHighlightForPhrase() throws Exception { "states united as a country", "states have been united as a country", "all the states united as a country", - "all the states have been united as a country", "welcome to the united states of america", "The countries are united kingdom, france, the states")); @@ -1558,7 +1547,7 @@ void testDeleteWhereAutoComplete(boolean withTermVectors) { } commit(context); } - // Re-initialized the builder so the LUCENE_INDEX_COMPRESSION_ENABLED prop is not added twice + // Re-initialize the builder so the LUCENE_INDEX_COMPRESSION_ENABLED prop is not added twice storageBuilder = RecordLayerPropertyStorage.newBuilder() .addProp(LuceneRecordContextProperties.LUCENE_AUTO_COMPLETE_WITH_TERM_VECTORS, withTermVectors); try (FDBRecordContext context = openContext(storageBuilder)) { @@ -1567,9 +1556,11 @@ void testDeleteWhereAutoComplete(boolean withTermVectors) { List autoCompleted = recordStore.scanIndex(COMPLEX_MULTI_GROUPED_WITH_AUTO_COMPLETE, groupedAutoComplete(COMPLEX_MULTI_GROUPED_WITH_AUTO_COMPLETE, "hello", group), null, ScanProperties.FORWARD_SCAN) .asList() .join(); - assertThat(autoCompleted, hasSize(1)); - Tuple key = autoCompleted.get(0).getKey(); - assertEquals(Tuple.from("text", String.format("hello there %d", group)), TupleHelpers.subTuple(key, key.size() - 2, key.size())); + assertThat(autoCompleted, hasSize(10)); + for (IndexEntry entry : autoCompleted) { + Tuple key = entry.getKey(); + assertEquals(Tuple.from("text", String.format("hello there %d", group)), TupleHelpers.subTuple(key, key.size() - 2, key.size())); + } } final int groupToDelete = maxGroup / 2; @@ -1582,9 +1573,11 @@ void testDeleteWhereAutoComplete(boolean withTermVectors) { if (group == groupToDelete) { assertThat(autoCompleted, empty()); } else { - assertThat(autoCompleted, hasSize(1)); - Tuple key = autoCompleted.get(0).getKey(); - assertEquals(Tuple.from("text", String.format("hello there %d", group)), TupleHelpers.subTuple(key, key.size() - 2, key.size())); + assertThat(autoCompleted, hasSize(10)); + for (IndexEntry entry : autoCompleted) { + Tuple key = entry.getKey(); + assertEquals(Tuple.from("text", String.format("hello there %d", group)), TupleHelpers.subTuple(key, key.size() - 2, key.size())); + } } } } @@ -1722,7 +1715,7 @@ private void searchForAutoCompleteAndAssert(String query, boolean matches, boole (String) i.getKey().get(i.getKeySize() - 1), (String) i.getKey().get(i.getKeySize() - 2), LuceneScanTypes.BY_LUCENE_AUTO_COMPLETE)); - assertEquals(6, context.getTimer().getCounter(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS).getCount()); + assertEquals(6, context.getTimer().getCount(LuceneEvents.Counts.LUCENE_SCAN_MATCHED_AUTO_COMPLETE_SUGGESTIONS)); assertAutoCompleteEntriesAndSegmentInfoStoredInCompoundFile(recordStore.indexSubspace(SIMPLE_TEXT_WITH_AUTO_COMPLETE), context, "_0.cfs", true); } @@ -1809,7 +1802,7 @@ private void queryAndAssertAutoCompleteSuggestionsReturned(@Nonnull Index index, assertEquals(expectedSuggestions.size(), results.size()); List suggestions = results.stream().map(i -> i.getKey().getString(i.getKeySize() - 1)).collect(Collectors.toList()); - assertEquals(expectedSuggestions, suggestions); + assertThat(suggestions, containsInAnyOrder(expectedSuggestions.stream().map(Matchers::equalTo).collect(Collectors.toList()))); } private void assertDocumentPartialRecordFromIndexEntry(@Nonnull RecordType recordType, @Nonnull IndexEntry indexEntry,