Merge pull request #1911 from MMcM/lucene_highlight
Resolves #1862: Lucene search with highlighting of matched terms
MMcM committed Nov 15, 2022
2 parents 167a0c1 + d4532d1 commit 37c92de
Showing 21 changed files with 999 additions and 144 deletions.
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
@@ -25,7 +25,7 @@ The Guava dependency version has been updated to 31.1. Projects may need to chec
* **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Lucene search with highlighting of matched terms [(Issue #1862)](https://github.com/FoundationDB/fdb-record-layer/issues/1862)
* **Feature** Feature 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
@@ -150,6 +150,14 @@ public static <M extends Message> FDBStoredRecordBuilder<M> newBuilder(@Nonnull
return new FDBStoredRecordBuilder<>(protoRecord);
}

/**
* Get a builder with the parameters of this stored record.
* @return a builder initialized from this record
*/
public FDBStoredRecordBuilder<M> asBuilder() {
return new FDBStoredRecordBuilder<>(this);
}
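
// A hedged usage sketch (the storedRecord variable and the setSplit call are
// assumptions based on this builder's API, not part of this commit): copy an
// existing stored record, changing one parameter without re-specifying the rest.
//
//     FDBStoredRecord<Message> modified = storedRecord.asBuilder()
//             .setSplit(true)
//             .build();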

/**
* Copy this record with a different version.
* @param recordVersion new version
@@ -59,6 +59,18 @@ public FDBStoredRecordBuilder(@Nonnull M protoRecord) {
this.protoRecord = protoRecord;
}

public FDBStoredRecordBuilder(@Nonnull FDBStoredRecord<M> record) {
this.protoRecord = record.getRecord();
this.primaryKey = record.getPrimaryKey();
this.recordType = record.getRecordType();
this.keyCount = record.getKeyCount();
this.keySize = record.getKeySize();
this.valueSize = record.getValueSize();
this.split = record.isSplit();
this.recordVersion = record.getVersion();
this.versionedInline = record.isVersionedInline();
}

@Override
@Nonnull
public Tuple getPrimaryKey() {
@@ -72,12 +72,14 @@
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
@@ -92,6 +94,11 @@
*/
public class LuceneAutoCompleteResultCursor implements BaseCursor<IndexEntry> {
private static final Logger LOGGER = LoggerFactory.getLogger(LuceneAutoCompleteResultCursor.class);

private static final int tokenCountBeforeHighlighted = 3;
private static final int tokenCountAfterHighlighted = 3;
private static final String highlightedTextConnector = "... ";
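
// For illustration only (an assumed example, not from this commit): with three
// context tokens kept on each side and the "... " connector, cutting snippets
// around a match on "fox" in "the quick brown red fox jumps over the lazy dog"
// yields something like "... quick brown red <b>fox</b> jumps over the ...",
// where <b> and </b> stand in for the configurable left and right tags.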

@Nonnull
private final Executor executor;
@Nonnull
@@ -192,22 +199,43 @@ private void performLookup() throws IOException {
@SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring
@Nullable
@VisibleForTesting
static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<String> matchedTokens, @Nullable String prefixToken, boolean highlight) {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text,
@Nonnull Set<String> matchedTokens, @Nullable String prefixToken,
boolean allMatchingRequired,
@Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) {
try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
StringBuilder sb = highlight ? new StringBuilder() : null;
StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? new StringBuilder() : null;
int upto = 0;
Set<String> matchedInText = new HashSet<>();
boolean matchedPrefix = false;
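// Sliding context for snippet cutting (a summary of the logic below, assumed
// intent): "pres" buffers up to tokenCountBeforeHighlighted text chunks that
// precede the next match, while "ends" buffers trailing chunks after the last
// match until the snippet budget from getSnippedSize() is reached.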
ArrayDeque<String> pres = new ArrayDeque<>();
ArrayDeque<String> ends = new ArrayDeque<>();
int lastMatchPos = -tokenCountAfterHighlighted - 1;
int currentPos = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
if (highlight) {
addNonMatch(sb, text.substring(upto, startOffset));
if (luceneQueryHighlightParameters.isHighlight()) {
if (luceneQueryHighlightParameters.isCutSnippets()) {
if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) {
addNonMatch(sb, text.substring(upto, startOffset));
} else {
pres.add(text.substring(upto, startOffset));
if (pres.size() > tokenCountBeforeHighlighted) {
pres.poll();
}
if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) {
ends.add(text.substring(upto, startOffset));
}
}
} else {
addNonMatch(sb, text.substring(upto, startOffset));
}
}
upto = startOffset;
} else if (upto > startOffset) {
@@ -216,31 +244,66 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S

if (matchedTokens.contains(token)) {
// Token matches.
if (highlight) {
addWholeMatch(sb, text.substring(startOffset, endOffset));
if (luceneQueryHighlightParameters.isHighlight()) {
if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) {
addNonMatch(sb, highlightedTextConnector);
}
while (!pres.isEmpty()) {
addNonMatch(sb, pres.poll());
}
ends.clear();
int start = startOffset;
while (start < endOffset) {
int index = text.toLowerCase(Locale.ROOT).indexOf(token, start);
int actualStartOffset = index;
int actualEndOffset = index + token.length();
addNonMatch(sb, text.substring(start, index));
String substring = text.substring(actualStartOffset, actualEndOffset);
if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) {
addWholeMatch(sb, substring,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag());
} else {
addNonMatch(sb, substring);
}
start = actualEndOffset;
}
}
upto = endOffset;
matchedInText.add(token);
lastMatchPos = currentPos;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
if (highlight) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
if (luceneQueryHighlightParameters.isHighlight()) {
if (!tokenAlreadyHighlighted(text, startOffset, endOffset,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag());
} else {
addNonMatch(sb, text.substring(startOffset, endOffset));
}
}
upto = endOffset;
matchedPrefix = true;
}
currentPos++;
}
ts.end();

if ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size())) {
if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) {
// Query text not actually found in document text. Return null
return null;
}

// Text was found. Return text (highlighted or not)
if (highlight) {
if (luceneQueryHighlightParameters.isHighlight()) {
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) {
addNonMatch(sb, text.substring(upto));
} else if (luceneQueryHighlightParameters.isCutSnippets()) {
while (!ends.isEmpty()) {
addNonMatch(sb, ends.poll());
}
addNonMatch(sb, highlightedTextConnector);
}
return sb.toString();
} else {
@@ -252,6 +315,15 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
}
}

// Check this before highlighting tokens, so the highlighting is idempotent
private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset,
@Nonnull String leftTag, @Nonnull String rightTag) {
return startOffset - leftTag.length() >= 0
&& endOffset + rightTag.length() <= text.length()
&& text.startsWith(leftTag, startOffset - leftTag.length())
&& text.startsWith(rightTag, endOffset);
}
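
// Example (assumed values, not from this commit): with leftTag "<b>" and
// rightTag "</b>", a second highlighting pass over "a <b>fox</b> ran" detects
// that the token "fox" is already wrapped in the tags and appends it as a
// non-match, so re-running the highlighter never nests tags.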

/** Called while highlighting a single result, to append a
* non-matching chunk of text from the suggestion to the
* provided fragments list.
@@ -266,11 +338,13 @@ private static void addNonMatch(StringBuilder sb, String text) {
* the whole matched token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The surface form (original) text
* @param leftTag the tag to prepend to the surface
* @param rightTag the tag to append to the surface
*/
private static void addWholeMatch(StringBuilder sb, String surface) {
sb.append("<b>");
private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) {
sb.append(leftTag);
sb.append(surface);
sb.append("</b>");
sb.append(rightTag);
}

/** Called while highlighting a single result, to append a
@@ -280,17 +354,19 @@ private static void addWholeMatch(StringBuilder sb, String surface) {
* (indexed during build, corresponding to
* this match
* @param prefixToken The prefix of the token that matched
* @param leftTag the tag to prepend to the surface
* @param rightTag the tag to append to the surface
*/
private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken) {
private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) {
// TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix:
if (prefixToken.length() >= surface.length()) {
addWholeMatch(sb, surface);
addWholeMatch(sb, surface, leftTag, rightTag);
return;
}
sb.append("<b>");
sb.append(leftTag);
sb.append(surface.substring(0, prefixToken.length()));
sb.append("</b>");
sb.append(rightTag);
sb.append(surface.substring(prefixToken.length()));
}
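
// Example (assumed call, not from this commit): addPrefixMatch(sb, "foxes",
// "fox", "<b>", "</b>") appends "<b>fox</b>es", wrapping only the matched
// prefix of the surface form in the tags.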

@@ -532,7 +608,8 @@ private RecordCursor<IndexEntry> findIndexEntriesInRecord(ScoreDocAndRecord scor
// matched terms
return null;
}
String match = searchAllMaybeHighlight(queryAnalyzer, text, queryTokens, prefixToken, highlight);
String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true,
new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight));
if (match == null) {
// Text not found in this field
return null;
@@ -38,9 +38,12 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
* Helper class for converting {@link FDBRecord}s to Lucene documents.
@@ -131,6 +134,42 @@ public static <M extends Message> List<DocumentField> getFields(@Nonnull KeyExpr
return fields.getFields();
}

// Rewrite the Lucene fields of a record message, highlighting the terms from the given termMap
public static <M extends Message> void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map<String, Set<String>> termMap,
@Nonnull LuceneAnalyzerCombinationProvider analyzerSelector,
@Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) {
LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<M> recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build());

LuceneIndexExpressions.getFields(expression, recordRebuildSource,
(source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> {
Set<String> terms = new HashSet<>();
terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet()));
terms.addAll(termMap.getOrDefault("", Collections.emptySet()));
if (terms.isEmpty()) {
return;
}
for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : source.message.getAllFields().entrySet()) {
Object entryValue = entry.getValue();
if (entryValue instanceof String && entryValue.equals(value)
&& terms.stream().anyMatch(t -> ((String) entryValue).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValue).getAnalyzer(), (String) entryValue, termMap.get(fieldName), null, false, luceneQueryHighlightParameters);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, 0);
} else if (entryValue instanceof List) {
int index = 0;
for (Object entryValueElement : ((List<?>) entryValue)) {
if (entryValueElement instanceof String && entryValueElement.equals(value)
&& terms.stream().anyMatch(t -> ((String) entryValueElement).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValueElement).getAnalyzer(), (String) entryValueElement, termMap.get(fieldName), null, false, luceneQueryHighlightParameters);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, index);
}
index++;
}
}
}
}, null);
}
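
// A hedged usage sketch (variable names and setup are assumptions, not from
// this commit): rebuild a record's message with matched terms wrapped in tags.
//
//     Message.Builder builder = storedRecord.getRecord().toBuilder();
//     Map<String, Set<String>> termMap =
//             Collections.singletonMap("text", Collections.singleton("fox"));
//     highlightTermsInMessage(index.getRootExpression(), builder, termMap,
//             analyzerSelector, highlightParameters);
//     Message highlighted = builder.build();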

protected static class FDBRecordSource<M extends Message> implements LuceneIndexExpressions.RecordSource<FDBRecordSource<M>> {
@Nonnull
private final FDBRecord<M> rec;
@@ -186,6 +186,8 @@ public enum Counts implements StoreTimer.Count {
LUCENE_SHARED_CACHE_HITS("lucene shared cache hits", false),
/** Block to read was not in the shared cache. **/
LUCENE_SHARED_CACHE_MISSES("lucene shared cache misses", false),
/** Plan contains highlight operator. **/
PLAN_HIGHLIGHT_TERMS("lucene highlight plans", false),
;

private final String title;
@@ -48,7 +48,8 @@ public List<FunctionKeyExpression.Builder> getBuilders() {
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_FULL_TEXT_FIELD_WITH_TERM_VECTOR_POSITIONS, LuceneFunctionKeyExpression.LuceneFieldConfig::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS, LuceneFunctionKeyExpression.LuceneFieldConfig::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_RELEVANCE, LuceneFunctionKeyExpression.LuceneSortBy::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_DOCUMENT_NUMBER, LuceneFunctionKeyExpression.LuceneSortBy::new)
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_DOCUMENT_NUMBER, LuceneFunctionKeyExpression.LuceneSortBy::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_HIGHLIGHT_TAG, LuceneFunctionKeyExpression.LuceneFieldConfig::new)
);
}
}
@@ -38,6 +38,8 @@ public class LuceneFunctionNames {
public static final String LUCENE_SORT_BY_RELEVANCE = "lucene_sort_by_relevance";
public static final String LUCENE_SORT_BY_DOCUMENT_NUMBER = "lucene_sort_by_document_number";

public static final String LUCENE_HIGHLIGHT_TAG = "lucene_highlight_tag";

private LuceneFunctionNames() {
}
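
// A hypothetical sketch (the argument shape is an assumption; this commit only
// registers the name): like the other Lucene function names above, the new tag
// function would be attached to an index expression via a FunctionKeyExpression,
// e.g. function(LuceneFunctionNames.LUCENE_HIGHLIGHT_TAG, field("text")).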

