From 5a9765ec25091f2e6b7647c185e06b52e5be3041 Mon Sep 17 00:00:00 2001 From: Mike McMahon Date: Tue, 13 Dec 2022 16:07:26 -0800 Subject: [PATCH 1/3] Move some highlighting-related methods into a separate class. --- .../LuceneAutoCompleteResultCursor.java | 186 +------- .../lucene/LuceneDocumentFromRecord.java | 39 -- .../lucene/LuceneHighlightTermsPlan.java | 28 +- .../record/lucene/LuceneHighlighting.java | 430 ++++++++++++++++++ ...ceneIndexKeyValueToPartialRecordUtils.java | 110 ----- .../LuceneAutoCompleteResultCursorTest.java | 2 +- .../lucene/LuceneDocumentFromRecordTest.java | 20 +- .../record/lucene/LuceneIndexTest.java | 2 +- 8 files changed, 444 insertions(+), 373 deletions(-) create mode 100644 fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java index ee12b7b564..549cd27f13 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java @@ -72,14 +72,12 @@ import javax.annotation.Nullable; import java.io.IOException; import java.io.StringReader; -import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Objects; import java.util.Set; import java.util.concurrent.CompletableFuture; @@ -95,10 +93,6 @@ public class LuceneAutoCompleteResultCursor implements BaseCursor { private static final Logger LOGGER = LoggerFactory.getLogger(LuceneAutoCompleteResultCursor.class); - private static final int tokenCountBeforeHighlighted = 3; - 
private static final int tokenCountAfterHighlighted = 3; - private static final String highlightedTextConnector = "... "; - @Nonnull private final Executor executor; @Nonnull @@ -196,184 +190,6 @@ private void performLookup() throws IOException { } } - @SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring - @Nullable - @VisibleForTesting - static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text, - @Nonnull Set matchedTokens, @Nullable String prefixToken, - boolean allMatchingRequired, - @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { - try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) { - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); - ts.reset(); - StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? 
new StringBuilder() : null; - int upto = 0; - Set matchedInText = new HashSet<>(); - boolean matchedPrefix = false; - ArrayDeque pres = new ArrayDeque<>(); - ArrayDeque ends = new ArrayDeque<>(); - int lastMatchPos = -tokenCountAfterHighlighted - 1; - int currentPos = 0; - while (ts.incrementToken()) { - String token = termAtt.toString(); - int startOffset = offsetAtt.startOffset(); - int endOffset = offsetAtt.endOffset(); - if (upto < startOffset) { - if (luceneQueryHighlightParameters.isHighlight()) { - if (luceneQueryHighlightParameters.isCutSnippets()) { - if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) { - addNonMatch(sb, text.substring(upto, startOffset)); - } else { - pres.add(text.substring(upto, startOffset)); - if (pres.size() > tokenCountBeforeHighlighted) { - pres.poll(); - } - if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) { - ends.add(text.substring(upto, startOffset)); - } - } - } else { - addNonMatch(sb, text.substring(upto, startOffset)); - } - } - upto = startOffset; - } else if (upto > startOffset) { - continue; - } - - if (matchedTokens.contains(token)) { - // Token matches. 
- if (luceneQueryHighlightParameters.isHighlight()) { - if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) { - addNonMatch(sb, highlightedTextConnector); - } - while (!pres.isEmpty()) { - addNonMatch(sb, pres.poll()); - } - ends.clear(); - int start = startOffset; - while (start < endOffset) { - int index = text.toLowerCase(Locale.ROOT).indexOf(token, start); - if (index < 0 || index >= endOffset) { - addNonMatch(sb, text.substring(start, endOffset)); - break; - } - int actualStartOffset = index; - int actualEndOffset = index + token.length(); - addNonMatch(sb, text.substring(start, index)); - String substring = text.substring(actualStartOffset, actualEndOffset); - if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { - addWholeMatch(sb, substring, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); - } else { - addNonMatch(sb, substring); - } - start = actualEndOffset; - } - } - upto = endOffset; - matchedInText.add(token); - lastMatchPos = currentPos; - } else if (prefixToken != null && token.startsWith(prefixToken)) { - if (luceneQueryHighlightParameters.isHighlight()) { - if (!tokenAlreadyHighlighted(text, startOffset, endOffset, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { - addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); - } else { - addNonMatch(sb, text.substring(startOffset, endOffset)); - } - } - upto = endOffset; - matchedPrefix = true; - } - currentPos++; - } - ts.end(); - - if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) { - // Query text 
not actually found in document text. Return null - return null; - } - - // Text was found. Return text (highlighted or not) - if (luceneQueryHighlightParameters.isHighlight()) { - int endOffset = offsetAtt.endOffset(); - if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) { - addNonMatch(sb, text.substring(upto)); - } else if (luceneQueryHighlightParameters.isCutSnippets()) { - while (!ends.isEmpty()) { - addNonMatch(sb, ends.poll()); - } - addNonMatch(sb, highlightedTextConnector); - } - return sb.toString(); - } else { - return text; - } - - } catch (IOException e) { - return null; - } - } - - // Check this before highlighting tokens, so the highlighting is idempotent - private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset, - @Nonnull String leftTag, @Nonnull String rightTag) { - return startOffset - leftTag.length() >= 0 - && endOffset + rightTag.length() > text.length() - && text.startsWith(leftTag, startOffset - 3) - && text.startsWith(rightTag, endOffset); - } - - /** Called while highlighting a single result, to append a - * non-matching chunk of text from the suggestion to the - * provided fragments list. - * @param sb The {@code StringBuilder} to append to - * @param text The text chunk to add - */ - private static void addNonMatch(StringBuilder sb, String text) { - sb.append(text); - } - - /** Called while highlighting a single result, to append - * the whole matched token to the provided fragments list. 
- * @param sb The {@code StringBuilder} to append to - * @param surface The surface form (original) text - * @param leftTag the tag to add left to the surface - * @param rightTag the tag to add right to the surface - */ - private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) { - sb.append(leftTag); - sb.append(surface); - sb.append(rightTag); - } - - /** Called while highlighting a single result, to append a - * matched prefix token, to the provided fragments list. - * @param sb The {@code StringBuilder} to append to - * @param surface The fragment of the surface form - * (indexed during build, corresponding to - * this match - * @param prefixToken The prefix of the token that matched - * @param leftTag the tag to add left to the surface - * @param rightTag the tag to add right to the surface - */ - private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) { - // TODO: apps can try to invert their analysis logic - // here, e.g. 
downcase the two before checking prefix: - if (prefixToken.length() >= surface.length()) { - addWholeMatch(sb, surface, leftTag, rightTag); - return; - } - sb.append(leftTag); - sb.append(surface.substring(0, prefixToken.length())); - sb.append(rightTag); - sb.append(surface.substring(prefixToken.length())); - } - @SuppressWarnings("PMD.CloseResource") public RecordCursor lookup() throws IOException { // Determine the tokens from the query key @@ -612,7 +428,7 @@ private RecordCursor findIndexEntriesInRecord(ScoreDocAndRecord scor // matched terms return null; } - String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true, + String match = LuceneHighlighting.searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); if (match == null) { // Text not found in this field diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java index 1014412d2e..97b71da39b 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java @@ -38,12 +38,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Objects; -import java.util.Set; /** * Helper class for converting {@link FDBRecord}s to Lucene documents. 
@@ -134,42 +131,6 @@ public static List getFields(@Nonnull KeyExpr return fields.getFields(); } - // Modify the Lucene fields of a record message with highlighting the terms from the given termMap - @Nonnull - public static void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map> termMap, - @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, - @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { - LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build()); - - LuceneIndexExpressions.getFields(expression, recordRebuildSource, - (source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> { - Set terms = new HashSet<>(); - terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet())); - terms.addAll(termMap.getOrDefault("", Collections.emptySet())); - if (terms.isEmpty()) { - return; - } - for (Map.Entry entry : source.message.getAllFields().entrySet()) { - Object entryValue = entry.getValue(); - if (entryValue instanceof String && entryValue.equals(value) - && terms.stream().filter(t -> ((String) entryValue).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))).findAny().isPresent()) { - String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValue).getAnalyzer(), (String) entryValue, termMap.get(fieldName), null, false, luceneQueryHighlightParameters); - source.buildMessage(highlightedText, entry.getKey(), null, null, true, 0); - } else if (entryValue instanceof List) { - int index = 0; - for (Object entryValueElement : ((List) entryValue)) { - if (entryValueElement instanceof String && entryValueElement.equals(value) - && 
terms.stream().filter(t -> ((String) entryValueElement).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))).findAny().isPresent()) { - String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValueElement).getAnalyzer(), (String) entryValueElement, termMap.get(fieldName), null, false, luceneQueryHighlightParameters); - source.buildMessage(highlightedText, entry.getKey(), null, null, true, index); - } - index++; - } - } - } - }, null); - } - protected static class FDBRecordSource implements LuceneIndexExpressions.RecordSource> { @Nonnull private final FDBRecord rec; diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java index 4e34090813..f68c1d207b 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java @@ -22,15 +22,11 @@ import com.apple.foundationdb.record.EvaluationContext; import com.apple.foundationdb.record.ExecuteProperties; -import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.ObjectPlanHash; import com.apple.foundationdb.record.PlanHashable; import com.apple.foundationdb.record.RecordCursor; import com.apple.foundationdb.record.provider.common.StoreTimer; -import com.apple.foundationdb.record.provider.foundationdb.FDBIndexedRecord; -import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; import com.apple.foundationdb.record.provider.foundationdb.FDBRecordStoreBase; -import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord; import com.apple.foundationdb.record.query.plan.cascades.AliasMap; import 
com.apple.foundationdb.record.query.plan.cascades.CorrelationIdentifier; import com.apple.foundationdb.record.query.plan.cascades.GroupExpressionRef; @@ -83,29 +79,7 @@ public RecordCursor executePlan(@Nonnull final @Nonnull final ExecuteProperties executeProperties) { final RecordCursor results = getInnerPlan().executePlan(store, context, continuation, executeProperties); - return results .map(result -> QueryResult.fromQueriedRecord(highlightTermsInRecord(result.getQueriedRecord()))); - } - - @Nullable - @SuppressWarnings("unchecked") - private FDBQueriedRecord highlightTermsInRecord(@Nullable FDBQueriedRecord queriedRecord) { - if (queriedRecord == null) { - return queriedRecord; - } - IndexEntry indexEntry = queriedRecord.getIndexEntry(); - if (!(indexEntry instanceof LuceneRecordCursor.ScoreDocIndexEntry)) { - return queriedRecord; - } - LuceneRecordCursor.ScoreDocIndexEntry docIndexEntry = (LuceneRecordCursor.ScoreDocIndexEntry)indexEntry; - if (!docIndexEntry.getLuceneQueryHighlightParameters().isHighlight()) { - return queriedRecord; - } - M message = queriedRecord.getRecord(); - M.Builder builder = message.toBuilder(); - LuceneDocumentFromRecord.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, - docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); - FDBStoredRecord storedRecord = queriedRecord.getStoredRecord().asBuilder().setRecord((M) builder.build()).build(); - return FDBQueriedRecord.indexed(new FDBIndexedRecord<>(indexEntry, storedRecord)); + return results .map(result -> QueryResult.fromQueriedRecord(LuceneHighlighting.highlightTermsInRecord(result.getQueriedRecord()))); } @Override diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java new file mode 100644 index 0000000000..b3f4600da2 --- /dev/null +++ 
b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java @@ -0,0 +1,430 @@ +/* + * LuceneHighlighting.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb.record.lucene; + +import com.apple.foundationdb.record.IndexEntry; +import com.apple.foundationdb.record.metadata.Key; +import com.apple.foundationdb.record.metadata.expressions.FieldKeyExpression; +import com.apple.foundationdb.record.metadata.expressions.KeyExpression; +import com.apple.foundationdb.record.provider.foundationdb.FDBIndexedRecord; +import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; +import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord; +import com.google.protobuf.Descriptors; +import com.google.protobuf.Message; +import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; 
+import java.util.Locale; +import java.util.Map; +import java.util.Set; + +/** + * Helper class for highlighting search matches. + */ +public class LuceneHighlighting { + private static final int tokenCountBeforeHighlighted = 3; + private static final int tokenCountAfterHighlighted = 3; + private static final String highlightedTextConnector = "... "; + + private LuceneHighlighting() { + } + + @SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring + @Nullable + static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text, + @Nonnull Set matchedTokens, @Nullable String prefixToken, + boolean allMatchingRequired, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) { + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); + ts.reset(); + StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? 
new StringBuilder() : null; + int upto = 0; + Set matchedInText = new HashSet<>(); + boolean matchedPrefix = false; + ArrayDeque pres = new ArrayDeque<>(); + ArrayDeque ends = new ArrayDeque<>(); + int lastMatchPos = -tokenCountAfterHighlighted - 1; + int currentPos = 0; + while (ts.incrementToken()) { + String token = termAtt.toString(); + int startOffset = offsetAtt.startOffset(); + int endOffset = offsetAtt.endOffset(); + if (upto < startOffset) { + if (luceneQueryHighlightParameters.isHighlight()) { + if (luceneQueryHighlightParameters.isCutSnippets()) { + if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) { + addNonMatch(sb, text.substring(upto, startOffset)); + } else { + pres.add(text.substring(upto, startOffset)); + if (pres.size() > tokenCountBeforeHighlighted) { + pres.poll(); + } + if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) { + ends.add(text.substring(upto, startOffset)); + } + } + } else { + addNonMatch(sb, text.substring(upto, startOffset)); + } + } + upto = startOffset; + } else if (upto > startOffset) { + continue; + } + + if (matchedTokens.contains(token)) { + // Token matches. 
+ if (luceneQueryHighlightParameters.isHighlight()) { + if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) { + addNonMatch(sb, highlightedTextConnector); + } + while (!pres.isEmpty()) { + addNonMatch(sb, pres.poll()); + } + ends.clear(); + int start = startOffset; + while (start < endOffset) { + int index = text.toLowerCase(Locale.ROOT).indexOf(token, start); + if (index < 0 || index >= endOffset) { + addNonMatch(sb, text.substring(start, endOffset)); + break; + } + int actualStartOffset = index; + int actualEndOffset = index + token.length(); + addNonMatch(sb, text.substring(start, index)); + String substring = text.substring(actualStartOffset, actualEndOffset); + if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { + addWholeMatch(sb, substring, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + } else { + addNonMatch(sb, substring); + } + start = actualEndOffset; + } + } + upto = endOffset; + matchedInText.add(token); + lastMatchPos = currentPos; + } else if (prefixToken != null && token.startsWith(prefixToken)) { + if (luceneQueryHighlightParameters.isHighlight()) { + if (!tokenAlreadyHighlighted(text, startOffset, endOffset, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { + addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + } else { + addNonMatch(sb, text.substring(startOffset, endOffset)); + } + } + upto = endOffset; + matchedPrefix = true; + } + currentPos++; + } + ts.end(); + + if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) { + // Query text 
not actually found in document text. Return null + return null; + } + + // Text was found. Return text (highlighted or not) + if (luceneQueryHighlightParameters.isHighlight()) { + int endOffset = offsetAtt.endOffset(); + if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) { + addNonMatch(sb, text.substring(upto)); + } else if (luceneQueryHighlightParameters.isCutSnippets()) { + while (!ends.isEmpty()) { + addNonMatch(sb, ends.poll()); + } + addNonMatch(sb, highlightedTextConnector); + } + return sb.toString(); + } else { + return text; + } + + } catch (IOException e) { + return null; + } + } + + // Check this before highlighting tokens, so the highlighting is idempotent + private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset, + @Nonnull String leftTag, @Nonnull String rightTag) { + return startOffset - leftTag.length() >= 0 + && endOffset + rightTag.length() <= text.length() + && text.startsWith(leftTag, startOffset - leftTag.length()) + && text.startsWith(rightTag, endOffset); + } + + /** Called while highlighting a single result, to append a + * non-matching chunk of text from the suggestion to the + * provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param text The text chunk to add + */ + private static void addNonMatch(StringBuilder sb, String text) { + sb.append(text); + } + + /** Called while highlighting a single result, to append + * the whole matched token to the provided fragments list. 
+ * @param sb The {@code StringBuilder} to append to + * @param surface The surface form (original) text + * @param leftTag the tag to add left to the surface + * @param rightTag the tag to add right to the surface + */ + private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) { + sb.append(leftTag); + sb.append(surface); + sb.append(rightTag); + } + + /** Called while highlighting a single result, to append a + * matched prefix token, to the provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param surface The fragment of the surface form + * (indexed during build, corresponding to + * this match + * @param prefixToken The prefix of the token that matched + * @param leftTag the tag to add left to the surface + * @param rightTag the tag to add right to the surface + */ + private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) { + // TODO: apps can try to invert their analysis logic + // here, e.g. 
downcase the two before checking prefix: + if (prefixToken.length() >= surface.length()) { + addWholeMatch(sb, surface, leftTag, rightTag); + return; + } + sb.append(leftTag); + sb.append(surface.substring(0, prefixToken.length())); + sb.append(rightTag); + sb.append(surface.substring(prefixToken.length())); + } + + @Nullable + @SuppressWarnings("unchecked") + public static FDBQueriedRecord highlightTermsInRecord(@Nullable FDBQueriedRecord queriedRecord) { + if (queriedRecord == null) { + return queriedRecord; + } + IndexEntry indexEntry = queriedRecord.getIndexEntry(); + if (!(indexEntry instanceof LuceneRecordCursor.ScoreDocIndexEntry)) { + return queriedRecord; + } + LuceneRecordCursor.ScoreDocIndexEntry docIndexEntry = (LuceneRecordCursor.ScoreDocIndexEntry)indexEntry; + if (!docIndexEntry.getLuceneQueryHighlightParameters().isHighlight()) { + return queriedRecord; + } + M message = queriedRecord.getRecord(); + M.Builder builder = message.toBuilder(); + highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, + docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); + FDBStoredRecord storedRecord = queriedRecord.getStoredRecord().asBuilder().setRecord((M) builder.build()).build(); + return FDBQueriedRecord.indexed(new FDBIndexedRecord<>(indexEntry, storedRecord)); + } + + // Modify the Lucene fields of a record message with highlighting the terms from the given termMap + @Nonnull + public static void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map> termMap, + @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + RecordRebuildSource recordRebuildSource = new RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build()); + + LuceneIndexExpressions.getFields(expression, recordRebuildSource, + (source, 
fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> { + if (type != LuceneIndexExpressions.DocumentFieldType.TEXT) { + return; + } + Set terms = getFieldTerms(termMap, fieldName); + if (terms.isEmpty()) { + return; + } + for (Map.Entry entry : source.message.getAllFields().entrySet()) { + final Descriptors.FieldDescriptor entryDescriptor = entry.getKey(); + final Object entryValue = entry.getValue(); + if (entryValue instanceof String) { + buildIfMatch(source, fieldName, value, + entryDescriptor, entryValue, 0, + terms, analyzerSelector, luceneQueryHighlightParameters); + } else if (entryValue instanceof List) { + int index = 0; + for (Object entryValueElement : ((List) entryValue)) { + buildIfMatch(source, fieldName, value, + entryDescriptor, entryValueElement, index, + terms, analyzerSelector, luceneQueryHighlightParameters); + index++; + } + } + } + }, null); + } + + private static void buildIfMatch(RecordRebuildSource source, String fieldName, Object fieldValue, + Descriptors.FieldDescriptor entryDescriptor, Object entryValue, int index, + @Nonnull Set terms, + @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + if (entryValue.equals(fieldValue) && terms.stream().anyMatch(t -> StringUtils.containsIgnoreCase((String)entryValue, t))) { + String highlightedText = searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String)entryValue).getAnalyzer(), (String)entryValue, terms, null, false, luceneQueryHighlightParameters); + source.buildMessage(highlightedText, entryDescriptor, null, null, true, index); + } + } + + static class RecordRebuildSource implements LuceneIndexExpressions.RecordSource> { + @Nullable + public final RecordRebuildSource parent; + @Nonnull + public final Descriptors.Descriptor descriptor; + @Nullable + public final Descriptors.FieldDescriptor 
fieldDescriptor; + @Nonnull + public final Message.Builder builder; + public final Message message; + public final int indexIfRepeated; + + RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.Descriptor descriptor, @Nonnull Message.Builder builder, @Nonnull Message message) { + //this.rec = rec; + this.parent = parent; + this.descriptor = descriptor; + this.fieldDescriptor = null; + this.builder = builder; + this.message = message; + this.indexIfRepeated = 0; + } + + RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.FieldDescriptor fieldDescriptor, @Nonnull Message.Builder builder, @Nonnull Message message, int indexIfRepeated) { + //this.rec = rec; + this.parent = parent; + this.descriptor = fieldDescriptor.getMessageType(); + this.fieldDescriptor = fieldDescriptor; + this.builder = builder; + this.message = message; + this.indexIfRepeated = indexIfRepeated; + } + + @Override + public Descriptors.Descriptor getDescriptor() { + return descriptor; + } + + @Override + public Iterable> getChildren(@Nonnull FieldKeyExpression parentExpression) { + final String parentField = parentExpression.getFieldName(); + final Descriptors.FieldDescriptor parentFieldDescriptor = descriptor.findFieldByName(parentField); + + final List> children = new ArrayList<>(); + int index = 0; + for (Key.Evaluated evaluated : parentExpression.evaluateMessage(null, message)) { + final Message submessage = (Message)evaluated.toList().get(0); + if (submessage != null) { + if (parentFieldDescriptor.isRepeated()) { + children.add(new RecordRebuildSource(this, parentFieldDescriptor, + builder.newBuilderForField(parentFieldDescriptor), + submessage, index++)); + } else { + children.add(new RecordRebuildSource(this, parentFieldDescriptor, + builder.getFieldBuilder(parentFieldDescriptor), + submessage, index)); + } + } + } + return children; + } + + @Override + public Iterable getValues(@Nonnull FieldKeyExpression fieldExpression) { + final List 
values = new ArrayList<>(); + for (Key.Evaluated evaluated : fieldExpression.evaluateMessage(null, message)) { + Object value = evaluated.getObject(0); + if (value != null) { + values.add(value); + } + } + return values; + } + + @SuppressWarnings("java:S3776") + public void buildMessage(@Nullable Object value, Descriptors.FieldDescriptor subFieldDescriptor, @Nullable String customizedKey, @Nullable String mappedKeyField, boolean forLuceneField, int index) { + final Descriptors.FieldDescriptor mappedKeyFieldDescriptor = mappedKeyField == null ? null : descriptor.findFieldByName(mappedKeyField); + if (mappedKeyFieldDescriptor != null) { + if (customizedKey == null) { + return; + } + builder.setField(mappedKeyFieldDescriptor, customizedKey); + } + + if (value == null) { + return; + } + if (subFieldDescriptor.isRepeated()) { + if (subFieldDescriptor.getJavaType().equals(Descriptors.FieldDescriptor.JavaType.MESSAGE)) { + Message.Builder subBuilder = builder.newBuilderForField(subFieldDescriptor); + subBuilder.mergeFrom((Message) builder.getRepeatedField(subFieldDescriptor, index)).mergeFrom((Message) value); + builder.setRepeatedField(subFieldDescriptor, index, subBuilder.build()); + } else { + builder.setRepeatedField(subFieldDescriptor, index, value); + } + + } else { + int count = builder.getAllFields().size(); + if (message != null && count == 0) { + builder.mergeFrom(message); + } + builder.setField(subFieldDescriptor, value); + } + + if (parent != null) { + parent.buildMessage(builder.build(), this.fieldDescriptor, mappedKeyFieldDescriptor == null ? customizedKey : null, mappedKeyFieldDescriptor == null ? 
mappedKeyField : null, forLuceneField, indexIfRepeated); + } + } + } + + @Nonnull + private static Set getFieldTerms(@Nonnull Map> termMap, @Nonnull String fieldName) { + final Set terms = new HashSet<>(); + final Set forField = termMap.get(fieldName); + if (forField != null) { + terms.addAll(forField); + } + final Set forAll = termMap.get(""); + if (forAll != null) { + terms.addAll(forAll); + } + return terms; + } + +} diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java index c9d54dbc4c..f818e38b65 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java @@ -23,7 +23,6 @@ import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.RecordCoreException; import com.apple.foundationdb.record.logging.LogMessageKeys; -import com.apple.foundationdb.record.metadata.Key; import com.apple.foundationdb.record.metadata.expressions.FieldKeyExpression; import com.apple.foundationdb.record.metadata.expressions.GroupingKeyExpression; import com.apple.foundationdb.record.metadata.expressions.KeyExpression; @@ -201,115 +200,6 @@ private static Pair, List> getOriginalAndMappedFieldElement return Pair.of(fixedFieldNames, dynamicFieldNames); } - static class RecordRebuildSource implements LuceneIndexExpressions.RecordSource> { - @Nullable - public final RecordRebuildSource parent; - @Nonnull - public final Descriptors.Descriptor descriptor; - @Nullable - public final Descriptors.FieldDescriptor fieldDescriptor; - @Nonnull - public final Message.Builder builder; - public final Message message; - public final int indexIfRepeated; - - RecordRebuildSource(@Nullable 
RecordRebuildSource parent, @Nonnull Descriptors.Descriptor descriptor, @Nonnull Message.Builder builder, @Nonnull Message message) { - //this.rec = rec; - this.parent = parent; - this.descriptor = descriptor; - this.fieldDescriptor = null; - this.builder = builder; - this.message = message; - this.indexIfRepeated = 0; - } - - RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.FieldDescriptor fieldDescriptor, @Nonnull Message.Builder builder, @Nonnull Message message, int indexIfRepeated) { - //this.rec = rec; - this.parent = parent; - this.descriptor = fieldDescriptor.getMessageType(); - this.fieldDescriptor = fieldDescriptor; - this.builder = builder; - this.message = message; - this.indexIfRepeated = indexIfRepeated; - } - - @Override - public Descriptors.Descriptor getDescriptor() { - return descriptor; - } - - @Override - public Iterable> getChildren(@Nonnull FieldKeyExpression parentExpression) { - final String parentField = parentExpression.getFieldName(); - final Descriptors.FieldDescriptor parentFieldDescriptor = descriptor.findFieldByName(parentField); - - final List> children = new ArrayList<>(); - int index = 0; - for (Key.Evaluated evaluated : parentExpression.evaluateMessage(null, message)) { - final Message submessage = (Message)evaluated.toList().get(0); - if (submessage != null) { - if (parentFieldDescriptor.isRepeated()) { - children.add(new RecordRebuildSource(this, parentFieldDescriptor, - builder.newBuilderForField(parentFieldDescriptor), - submessage, index++)); - } else { - children.add(new RecordRebuildSource(this, parentFieldDescriptor, - builder.getFieldBuilder(parentFieldDescriptor), - submessage, index)); - } - } - } - return children; - } - - @Override - public Iterable getValues(@Nonnull FieldKeyExpression fieldExpression) { - final List values = new ArrayList<>(); - for (Key.Evaluated evaluated : fieldExpression.evaluateMessage(null, message)) { - Object value = evaluated.getObject(0); - if (value != null) 
{ - values.add(value); - } - } - return values; - } - - @SuppressWarnings("java:S3776") - public void buildMessage(@Nullable Object value, Descriptors.FieldDescriptor subFieldDescriptor, @Nullable String customizedKey, @Nullable String mappedKeyField, boolean forLuceneField, int index) { - final Descriptors.FieldDescriptor mappedKeyFieldDescriptor = mappedKeyField == null ? null : descriptor.findFieldByName(mappedKeyField); - if (mappedKeyFieldDescriptor != null) { - if (customizedKey == null) { - return; - } - builder.setField(mappedKeyFieldDescriptor, customizedKey); - } - - if (value == null) { - return; - } - if (subFieldDescriptor.isRepeated()) { - if (subFieldDescriptor.getJavaType().equals(Descriptors.FieldDescriptor.JavaType.MESSAGE)) { - Message.Builder subBuilder = builder.newBuilderForField(subFieldDescriptor); - subBuilder.mergeFrom((Message) builder.getRepeatedField(subFieldDescriptor, index)).mergeFrom((Message) value); - builder.setRepeatedField(subFieldDescriptor, index, subBuilder.build()); - } else { - builder.setRepeatedField(subFieldDescriptor, index, value); - } - - } else { - int count = builder.getAllFields().size(); - if (message != null && count == 0) { - builder.mergeFrom(message); - } - builder.setField(subFieldDescriptor, value); - } - - if (parent != null) { - parent.buildMessage(builder.build(), this.fieldDescriptor, mappedKeyFieldDescriptor == null ? customizedKey : null, mappedKeyFieldDescriptor == null ? mappedKeyField : null, forLuceneField, indexIfRepeated); - } - } - } - /** * A {@link com.apple.foundationdb.record.lucene.LuceneIndexExpressions.RecordSource} implementation to build the partial record message. 
*/ diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java index 0907090d7f..f498879fb8 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java @@ -126,7 +126,7 @@ private static void assertSearchMatches(String queryString, List expecte assertEquals(expectedPrefixToken, prefixToken); Set queryTokenSet = new HashSet<>(tokens); - @Nullable String match = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight("text", analyzer, text, queryTokenSet, prefixToken, true, + @Nullable String match = LuceneHighlighting.searchAllMaybeHighlight("text", analyzer, text, queryTokenSet, prefixToken, true, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); assertEquals(expectedMatch, match); } diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java index 05f98316e7..03994d0a8b 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java @@ -66,7 +66,7 @@ void simple() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "some" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("some")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("some")), analyzerProvider, new 
LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("some text", builder.build().getText()); @@ -99,7 +99,7 @@ void group() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("more text", builder.build().getText()); @@ -137,7 +137,7 @@ void multi() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("some text", builder.build().getText(0)); assertEquals("other text", builder.build().getText(1)); @@ -183,7 +183,7 @@ void biGroup() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text2", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text2", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("first text", builder.build().getText()); assertEquals("second text", builder.build().getText2()); @@ -229,7 +229,7 @@ void uncorrelatedMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v2" for entry_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_value", Set.of("v2")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, 
Map.of("entry_value", Set.of("v2")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v1", builder.build().getEntry(0).getValue()); assertEquals("v2", builder.build().getEntry(1).getValue()); @@ -271,7 +271,7 @@ void map() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v2" for k2 field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v2")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v2")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v1", builder.build().getEntry(0).getValue()); assertEquals("v2", builder.build().getEntry(1).getValue()); @@ -316,7 +316,7 @@ void groupedMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v20" for k2 field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v20")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v20")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v10", builder.build().getEntry(0).getValue()); assertEquals("v20", builder.build().getEntry(1).getValue()); @@ -366,7 +366,7 @@ void groupingMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "2val" for entry_second_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("2val")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("2val")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("val", builder.build().getEntry(0).getValue()); assertEquals("2val", builder.build().getEntry(0).getSecondValue()); @@ -419,7 +419,7 @@ void groupingMapWithExtra() { 
LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "second" for entry_second_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("second")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("second")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("first", builder.build().getEntry(0).getValue()); assertEquals("second", builder.build().getEntry(0).getSecondValue()); @@ -470,7 +470,7 @@ void mapWithSubMessage() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "testValue" for entry_k1_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_k1_value", Set.of("testvalue")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_k1_value", Set.of("testvalue")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("testValue", builder.build().getEntry(0).getSubEntry().getValue()); diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java index d00dfe7441..fbe12aa417 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java @@ -2327,7 +2327,7 @@ private FDBStoredRecord possiblyHighlightedStoredRecord(F } M message = indexedRecord.getRecord(); M.Builder builder = message.toBuilder(); - LuceneDocumentFromRecord.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, + LuceneHighlighting.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), 
docIndexEntry.getLuceneQueryHighlightParameters()); return storedRecord.asBuilder().setRecord((M) builder.build()).build(); } From 2caab0d056e2d321854ef9777e59f96e22311654 Mon Sep 17 00:00:00 2001 From: Mike McMahon Date: Wed, 14 Dec 2022 10:16:35 -0800 Subject: [PATCH 2/3] Give highlighting control of rewriting. --- .../foundationdb/record/lucene/LucenePlanner.java | 6 +++--- .../record/lucene/LuceneScanQueryParameters.java | 10 ++++++++-- .../foundationdb/record/lucene/FDBLuceneQueryTest.java | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LucenePlanner.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LucenePlanner.java index 1790861a4a..78888cfe6b 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LucenePlanner.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LucenePlanner.java @@ -123,7 +123,7 @@ private ScoredPlan planLucene(@Nonnull CandidateScan candidateScan, QueryComponent queryComponent = state.groupingComparisons.isEmpty() ? state.filter : filterMask.getUnsatisfiedFilter(); // Special scans like auto-complete cannot be combined with regular queries. LuceneScanParameters scanParameters = getSpecialScan(state, filterMask, queryComponent); - boolean hasHighlight = false; + boolean hasRewrite = false; if (scanParameters == null) { // Scan by means of normal Lucene search API. 
LuceneQueryClause query = getQueryForFilter(state, filter, new ArrayList<>(), filterMask); @@ -137,7 +137,7 @@ private ScoredPlan planLucene(@Nonnull CandidateScan candidateScan, LuceneScanQueryParameters.LuceneQueryHighlightParameters highlightParameters = getHighlightParameters(queryComponent); scanParameters = new LuceneScanQueryParameters(groupingComparisons, query, state.sort, state.storedFields, state.storedFieldTypes, highlightParameters); - hasHighlight = highlightParameters.isHighlight(); + hasRewrite = highlightParameters.isHighlight() && highlightParameters.isRewriteRecords(); } // Wrap in plan. @@ -148,7 +148,7 @@ private ScoredPlan planLucene(@Nonnull CandidateScan candidateScan, if (filterMask.allSatisfied()) { filterMask.setSatisfied(true); } - if (hasHighlight) { + if (hasRewrite) { plan = new LuceneHighlightTermsPlan(plan); } return new ScoredPlan(plan, filterMask.getUnsatisfiedFilters(), Collections.emptyList(), 11 - filterMask.getUnsatisfiedFilters().size(), diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneScanQueryParameters.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneScanQueryParameters.java index 018f277ad6..f7efbbe8fe 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneScanQueryParameters.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneScanQueryParameters.java @@ -213,22 +213,24 @@ public static class LuceneQueryHighlightParameters { private final boolean cutSnippets; private final int snippedSize; + private final boolean rewriteRecords; public LuceneQueryHighlightParameters(boolean highlight) { this(highlight, false); } public LuceneQueryHighlightParameters(boolean highlight, boolean cutSnippets) { - this(highlight, DEFAULT_LEFT_TAG, DEFAULT_RIGHT_TAG, cutSnippets, DEFAULT_SNIPPETS_SIZE); + this(highlight, DEFAULT_LEFT_TAG, DEFAULT_RIGHT_TAG, cutSnippets, 
DEFAULT_SNIPPETS_SIZE, true); } public LuceneQueryHighlightParameters(boolean highlight, @Nonnull String leftTag, @Nonnull String rightTag, - boolean cutSnippets, int snippedSize) { + boolean cutSnippets, int snippedSize, boolean rewriteRecords) { this.highlight = highlight; this.leftTag = leftTag; this.rightTag = rightTag; this.cutSnippets = cutSnippets; this.snippedSize = snippedSize; + this.rewriteRecords = rewriteRecords; } public boolean isHighlight() { @@ -253,5 +255,9 @@ public boolean isCutSnippets() { public int getSnippedSize() { return snippedSize; } + + public boolean isRewriteRecords() { + return rewriteRecords; + } } } diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java index 4fa8b34fa3..1714cb5bc9 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java @@ -369,7 +369,7 @@ void luceneQueryCustomizedHighlighting() throws Exception { final QueryComponent filter = new LuceneQueryComponent(LuceneQueryComponent.Type.QUERY_HIGHLIGHT, "layer", false, Lists.newArrayList(), true, - new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true, "", "", true, 6)); + new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true, "", "", true, 6, true)); RecordQuery query = RecordQuery.newBuilder() .setRecordType(TextIndexTestUtils.SIMPLE_DOC) .setFilter(filter) From b9cf0905a01994cf69ef8136cb1f3a0886c2f505 Mon Sep 17 00:00:00 2001 From: Mike McMahon Date: Wed, 14 Dec 2022 11:28:46 -0800 Subject: [PATCH 3/3] Add another method to get highlighted positions instead of rewriting. 
--- .../LuceneAutoCompleteResultCursor.java | 2 +- .../lucene/LuceneDocumentFromRecord.java | 5 + .../record/lucene/LuceneHighlighting.java | 122 +++++++++++++++--- .../record/lucene/FDBLuceneQueryTest.java | 40 ++++++ .../LuceneAutoCompleteResultCursorTest.java | 2 +- 5 files changed, 153 insertions(+), 18 deletions(-) diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java index 549cd27f13..59abf52858 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java @@ -429,7 +429,7 @@ private RecordCursor findIndexEntriesInRecord(ScoreDocAndRecord scor return null; } String match = LuceneHighlighting.searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true, - new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); + new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight), null); if (match == null) { // Text not found in this field return null; diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java index 97b71da39b..f0900d6a22 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java @@ -142,6 +142,11 @@ public FDBRecordSource(@Nonnull final FDBRecord rec, @Nonnull final Message m this.message = message; } + @Nonnull + public Message getMessage() { + return message; + } + @Override public 
Descriptors.Descriptor getDescriptor() { return message.getDescriptorForType(); diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java index b3f4600da2..651d692bfb 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java @@ -26,10 +26,12 @@ import com.apple.foundationdb.record.metadata.expressions.KeyExpression; import com.apple.foundationdb.record.provider.foundationdb.FDBIndexedRecord; import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; +import com.apple.foundationdb.record.provider.foundationdb.FDBRecord; import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord; import com.google.protobuf.Descriptors; import com.google.protobuf.Message; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -41,6 +43,7 @@ import java.io.StringReader; import java.util.ArrayDeque; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Locale; @@ -63,7 +66,8 @@ private LuceneHighlighting() { static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text, @Nonnull Set matchedTokens, @Nullable String prefixToken, boolean allMatchingRequired, - @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters, + @Nullable List> highlightedPositions) { try (TokenStream ts = 
queryAnalyzer.tokenStream(fieldName, new StringReader(text))) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); @@ -127,7 +131,8 @@ static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyz if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset, luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { addWholeMatch(sb, substring, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag(), + highlightedPositions); } else { addNonMatch(sb, substring); } @@ -142,7 +147,8 @@ static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyz if (!tokenAlreadyHighlighted(text, startOffset, endOffset, luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag(), + highlightedPositions); } else { addNonMatch(sb, text.substring(startOffset, endOffset)); } @@ -183,7 +189,8 @@ static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyz // Check this before highlighting tokens, so the highlighting is idempotent private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset, @Nonnull String leftTag, @Nonnull String rightTag) { - return startOffset - leftTag.length() >= 0 + return (leftTag.length() > 0 || rightTag.length() > 0) + && startOffset - leftTag.length() >= 0 && endOffset + rightTag.length() <= text.length() && text.startsWith(leftTag, startOffset - leftTag.length()) && 
text.startsWith(rightTag, endOffset); @@ -199,39 +206,53 @@ private static void addNonMatch(StringBuilder sb, String text) { sb.append(text); } - /** Called while highlighting a single result, to append - * the whole matched token to the provided fragments list. + /** + * Called while highlighting a single result, to append + * the whole matched token to the provided {@code StringBuilder}. + * * @param sb The {@code StringBuilder} to append to - * @param surface The surface form (original) text + * @param surface The surface form (original) text * @param leftTag the tag to add left to the surface * @param rightTag the tag to add right to the surface */ - private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) { + private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag, + @Nullable List> highlightedPositions) { + int start = sb.length(); sb.append(leftTag); sb.append(surface); sb.append(rightTag); + if (highlightedPositions != null) { + highlightedPositions.add(Pair.of(start, sb.length())); + } } - /** Called while highlighting a single result, to append a - * matched prefix token, to the provided fragments list. + /** + * Called while highlighting a single result, to append a + * matched prefix token, to the provided {@code StringBuilder}. 
+ * * @param sb The {@code StringBuilder} to append to - * @param surface The fragment of the surface form - * (indexed during build, corresponding to - * this match + * @param surface The fragment of the surface form + * (indexed during build, corresponding to + * this match * @param prefixToken The prefix of the token that matched * @param leftTag the tag to add left to the surface * @param rightTag the tag to add right to the surface */ - private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) { + private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag, + @Nullable List> highlightedPositions) { // TODO: apps can try to invert their analysis logic // here, e.g. downcase the two before checking prefix: if (prefixToken.length() >= surface.length()) { - addWholeMatch(sb, surface, leftTag, rightTag); + addWholeMatch(sb, surface, leftTag, rightTag, highlightedPositions); return; } + int start = sb.length(); sb.append(leftTag); sb.append(surface.substring(0, prefixToken.length())); sb.append(rightTag); + if (highlightedPositions != null) { + highlightedPositions.add(Pair.of(start, sb.length())); + } sb.append(surface.substring(prefixToken.length())); } @@ -299,7 +320,7 @@ private static void buildIfMatch(RecordRebuildSource sour @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { if (entryValue.equals(fieldValue) && terms.stream().anyMatch(t -> StringUtils.containsIgnoreCase((String)entryValue, t))) { - String highlightedText = searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String)entryValue).getAnalyzer(), (String)entryValue, terms, null, false, luceneQueryHighlightParameters); + String highlightedText = searchAllMaybeHighlight(fieldName, 
analyzerSelector.provideIndexAnalyzer((String)entryValue).getAnalyzer(), (String)entryValue, terms, null, false, luceneQueryHighlightParameters, null); source.buildMessage(highlightedText, entryDescriptor, null, null, true, index); } } @@ -427,4 +448,73 @@ private static Set getFieldTerms(@Nonnull Map> termM return terms; } + /** + * Result of {@link #highlightedTermsForMessage}. + */ + public static class HighlightedTerm { + private final String fieldName; + private final String snippet; + private final List> highlightedPositions; + + public HighlightedTerm(final String fieldName, final String snippet, final List> highlightedPositions) { + this.fieldName = fieldName; + this.snippet = snippet; + this.highlightedPositions = highlightedPositions; + } + + public String getFieldName() { + return fieldName; + } + + public String getSnippet() { + return snippet; + } + + public List> getHighlightedPositions() { + return highlightedPositions; + } + } + + @Nonnull + public static List highlightedTermsForMessage(@Nullable FDBQueriedRecord queriedRecord) { + if (queriedRecord == null) { + return Collections.emptyList(); + } + IndexEntry indexEntry = queriedRecord.getIndexEntry(); + if (!(indexEntry instanceof LuceneRecordCursor.ScoreDocIndexEntry)) { + return Collections.emptyList(); + } + LuceneRecordCursor.ScoreDocIndexEntry docIndexEntry = (LuceneRecordCursor.ScoreDocIndexEntry)indexEntry; + if (!docIndexEntry.getLuceneQueryHighlightParameters().isHighlight() || + docIndexEntry.getLuceneQueryHighlightParameters().isRewriteRecords()) { + return Collections.emptyList(); + } + return highlightedTermsForMessage(queriedRecord, queriedRecord.getRecord(), + docIndexEntry.getIndexKey(), docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); + } + + // Collect, for each Lucene text field of a record message that matches terms from the given termMap, the highlighted snippet and the positions of its highlights; the message itself is not modified + @Nonnull + public static List highlightedTermsForMessage(@Nonnull 
FDBRecord rec, M message, + @Nonnull KeyExpression expression, @Nonnull Map> termMap, @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + List result = new ArrayList<>(); + LuceneIndexExpressions.getFields(expression, new LuceneDocumentFromRecord.FDBRecordSource<>(rec, message), + (source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> { + if (type != LuceneIndexExpressions.DocumentFieldType.TEXT) { + return; + } + Set terms = getFieldTerms(termMap, fieldName); + if (terms.isEmpty()) { + return; + } + if (value instanceof String && terms.stream().anyMatch(t -> StringUtils.containsIgnoreCase((String)value, t))) { + List> highlightedPositions = new ArrayList<>(); + String highlightedText = searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String)value).getAnalyzer(), (String)value, terms, null, false, luceneQueryHighlightParameters, highlightedPositions); + result.add(new HighlightedTerm(fieldName, highlightedText, highlightedPositions)); + } + }, null); + return result; + } + } diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java index 1714cb5bc9..23755d4c1a 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneQueryTest.java @@ -386,6 +386,46 @@ void luceneQueryCustomizedHighlighting() throws Exception { } } + /* Saves one SimpleDocument whose text holds three occurrences of the token layer among runs of the token record, runs a QUERY_HIGHLIGHT query for layer, and asserts the snippet and the highlighted positions reported for the text field. */ @Test + void luceneQueryHighlightingPositions() throws Exception { + try (FDBRecordContext context = openContext()) { + openRecordStore(context); + final String text = "record record record record record record " + + "layer " + + "record record record record 
record record record record record record " + + "layer " + + "record record " + + "layer " + + "record record record record record record record record"; + TestRecordsTextProto.SimpleDocument simpleDocument = TestRecordsTextProto.SimpleDocument.newBuilder().setDocId(0).setGroup(0).setText(text).build(); + recordStore.saveRecord(simpleDocument); + + /* NOTE(review): the two empty strings are presumably the left/right highlight tags, which would explain why the expected snippet carries no markup -- confirm against the LuceneQueryHighlightParameters constructor. */ final QueryComponent filter = new LuceneQueryComponent(LuceneQueryComponent.Type.QUERY_HIGHLIGHT, + "layer", false, Lists.newArrayList(), true, + new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true, "", "", true, 6, false)); + RecordQuery query = RecordQuery.newBuilder() + .setRecordType(TextIndexTestUtils.SIMPLE_DOC) + .setFilter(filter) + .build(); + + RecordQueryPlan plan = planner.plan(query); + + List> queriedRecordList = recordStore.executeQuery(plan).asList().get(); + assertEquals(1, queriedRecordList.size()); + FDBQueriedRecord queriedRecord = queriedRecordList.get(0); + + List highlightedTerms = LuceneHighlighting.highlightedTermsForMessage(queriedRecord); + assertEquals(1, highlightedTerms.size()); + LuceneHighlighting.HighlightedTerm highlightedTerm = highlightedTerms.get(0); + assertEquals("text", highlightedTerm.getFieldName()); + assertEquals("... record record record layer record record record ... record record record layer record record layer record record record record record record ... 
", highlightedTerm.getSnippet()); + /* Each pair is a start-inclusive, end-exclusive character range into the snippet; the loop below checks that every range spells layer. */ assertEquals(List.of(Pair.of(25, 30), Pair.of(77, 82), Pair.of(97, 102)), highlightedTerm.getHighlightedPositions()); + for (Pair pos : highlightedTerm.getHighlightedPositions()) { + assertEquals("layer", highlightedTerm.getSnippet().substring(pos.getLeft(), pos.getRight())); + } + } + } + @ParameterizedTest(name = "testSynonym[shouldDeferFetch={0}]") @BooleanSource void testSynonym(boolean shouldDeferFetch) throws Exception { diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java index f498879fb8..8b4415f484 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java @@ -127,7 +127,7 @@ private static void assertSearchMatches(String queryString, List expecte Set queryTokenSet = new HashSet<>(tokens); @Nullable String match = LuceneHighlighting.searchAllMaybeHighlight("text", analyzer, text, queryTokenSet, prefixToken, true, - new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); + new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight), null); assertEquals(expectedMatch, match); } }