Merge pull request #1911 from MMcM/lucene_highlight
Resolves #1862: Lucene search with highlighting of matched terms
MMcM committed Nov 15, 2022
2 parents 167a0c1 + d4532d1 commit 37c92de
Showing 21 changed files with 999 additions and 144 deletions.
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
@@ -25,7 +25,7 @@ The Guava dependency version has been updated to 31.1. Projects may need to chec
* **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Lucene search with highlighting of matched terms [(Issue #1862)](https://github.com/FoundationDB/fdb-record-layer/issues/1862)
* **Feature** Feature 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
@@ -150,6 +150,14 @@ public static <M extends Message> FDBStoredRecordBuilder<M> newBuilder(@Nonnull
return new FDBStoredRecordBuilder<>(protoRecord);
}

/**
* Get a builder with the parameters of this stored record.
* @return a builder initialized from this record
*/
public FDBStoredRecordBuilder<M> asBuilder() {
return new FDBStoredRecordBuilder<>(this);
}
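
// A hedged usage sketch (the storedRecord variable and the setSplit call are
// assumptions based on this builder's API, not part of this commit): copy an
// existing stored record, changing one parameter without re-specifying the rest.
//
//     FDBStoredRecord<Message> modified = storedRecord.asBuilder()
//             .setSplit(true)
//             .build();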

/**
* Copy this record with a different version.
* @param recordVersion new version
@@ -59,6 +59,18 @@ public FDBStoredRecordBuilder(@Nonnull M protoRecord) {
this.protoRecord = protoRecord;
}

public FDBStoredRecordBuilder(@Nonnull FDBStoredRecord<M> record) {
this.protoRecord = record.getRecord();
this.primaryKey = record.getPrimaryKey();
this.recordType = record.getRecordType();
this.keyCount = record.getKeyCount();
this.keySize = record.getKeySize();
this.valueSize = record.getValueSize();
this.split = record.isSplit();
this.recordVersion = record.getVersion();
this.versionedInline = record.isVersionedInline();
}

@Override
@Nonnull
public Tuple getPrimaryKey() {
@@ -72,12 +72,14 @@
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
@@ -92,6 +94,11 @@
*/
public class LuceneAutoCompleteResultCursor implements BaseCursor<IndexEntry> {
private static final Logger LOGGER = LoggerFactory.getLogger(LuceneAutoCompleteResultCursor.class);

private static final int tokenCountBeforeHighlighted = 3;
private static final int tokenCountAfterHighlighted = 3;
private static final String highlightedTextConnector = "... ";
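
// For illustration only (an assumed example, not from this commit): with three
// context tokens kept on each side and the "... " connector, cutting snippets
// around a match on "fox" in "the quick brown red fox jumps over the lazy dog"
// yields something like "... quick brown red <b>fox</b> jumps over the ...",
// where <b> and </b> stand in for the configurable left and right tags.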

@Nonnull
private final Executor executor;
@Nonnull
@@ -192,22 +199,43 @@ private void performLookup() throws IOException {
@SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring
@Nullable
@VisibleForTesting
static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<String> matchedTokens, @Nullable String prefixToken, boolean highlight) {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text,
@Nonnull Set<String> matchedTokens, @Nullable String prefixToken,
boolean allMatchingRequired,
@Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) {
try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
StringBuilder sb = highlight ? new StringBuilder() : null;
StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? new StringBuilder() : null;
int upto = 0;
Set<String> matchedInText = new HashSet<>();
boolean matchedPrefix = false;
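// Sliding context for snippet cutting (a summary of the logic below, assumed
// intent): "pres" buffers up to tokenCountBeforeHighlighted text chunks that
// precede the next match, while "ends" buffers trailing chunks after the last
// match until the snippet budget from getSnippedSize() is reached.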
ArrayDeque<String> pres = new ArrayDeque<>();
ArrayDeque<String> ends = new ArrayDeque<>();
int lastMatchPos = -tokenCountAfterHighlighted - 1;
int currentPos = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
if (highlight) {
addNonMatch(sb, text.substring(upto, startOffset));
if (luceneQueryHighlightParameters.isHighlight()) {
if (luceneQueryHighlightParameters.isCutSnippets()) {
if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) {
addNonMatch(sb, text.substring(upto, startOffset));
} else {
pres.add(text.substring(upto, startOffset));
if (pres.size() > tokenCountBeforeHighlighted) {
pres.poll();
}
if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) {
ends.add(text.substring(upto, startOffset));
}
}
} else {
addNonMatch(sb, text.substring(upto, startOffset));
}
}
upto = startOffset;
} else if (upto > startOffset) {
@@ -216,31 +244,66 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S

if (matchedTokens.contains(token)) {
// Token matches.
if (highlight) {
addWholeMatch(sb, text.substring(startOffset, endOffset));
if (luceneQueryHighlightParameters.isHighlight()) {
if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) {
addNonMatch(sb, highlightedTextConnector);
}
while (!pres.isEmpty()) {
addNonMatch(sb, pres.poll());
}
ends.clear();
int start = startOffset;
while (start < endOffset) {
int index = text.toLowerCase(Locale.ROOT).indexOf(token, start);
int actualStartOffset = index;
int actualEndOffset = index + token.length();
addNonMatch(sb, text.substring(start, index));
String substring = text.substring(actualStartOffset, actualEndOffset);
if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) {
addWholeMatch(sb, substring,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag());
} else {
addNonMatch(sb, substring);
}
start = actualEndOffset;
}
}
upto = endOffset;
matchedInText.add(token);
lastMatchPos = currentPos;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
if (highlight) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
if (luceneQueryHighlightParameters.isHighlight()) {
if (!tokenAlreadyHighlighted(text, startOffset, endOffset,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken,
luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag());
} else {
addNonMatch(sb, text.substring(startOffset, endOffset));
}
}
upto = endOffset;
matchedPrefix = true;
}
currentPos++;
}
ts.end();

if ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size())) {
if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) {
// Query text not actually found in document text. Return null
return null;
}

// Text was found. Return text (highlighted or not)
if (highlight) {
if (luceneQueryHighlightParameters.isHighlight()) {
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) {
addNonMatch(sb, text.substring(upto));
} else if (luceneQueryHighlightParameters.isCutSnippets()) {
while (!ends.isEmpty()) {
addNonMatch(sb, ends.poll());
}
addNonMatch(sb, highlightedTextConnector);
}
return sb.toString();
} else {
@@ -252,6 +315,15 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
}
}

// Check this before highlighting tokens, so the highlighting is idempotent
private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset,
@Nonnull String leftTag, @Nonnull String rightTag) {
return startOffset - leftTag.length() >= 0
&& endOffset + rightTag.length() <= text.length()
&& text.startsWith(leftTag, startOffset - leftTag.length())
&& text.startsWith(rightTag, endOffset);
}
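
// Example (assumed values, not from this commit): with leftTag "<b>" and
// rightTag "</b>", a second highlighting pass over "a <b>fox</b> ran" detects
// that the token "fox" is already wrapped in the tags and appends it as a
// non-match, so re-running the highlighter never nests tags.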

/** Called while highlighting a single result, to append a
* non-matching chunk of text from the suggestion to the
* provided fragments list.
@@ -266,11 +338,13 @@ private static void addNonMatch(StringBuilder sb, String text) {
* the whole matched token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The surface form (original) text
* @param leftTag the tag to prepend to the surface
* @param rightTag the tag to append to the surface
*/
private static void addWholeMatch(StringBuilder sb, String surface) {
sb.append("<b>");
private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) {
sb.append(leftTag);
sb.append(surface);
sb.append("</b>");
sb.append(rightTag);
}

/** Called while highlighting a single result, to append a
@@ -280,17 +354,19 @@ private static void addWholeMatch(StringBuilder sb, String surface) {
* (indexed during build, corresponding to
* this match
* @param prefixToken The prefix of the token that matched
* @param leftTag the tag to prepend to the surface
* @param rightTag the tag to append to the surface
*/
private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken) {
private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) {
// TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix:
if (prefixToken.length() >= surface.length()) {
addWholeMatch(sb, surface);
addWholeMatch(sb, surface, leftTag, rightTag);
return;
}
sb.append("<b>");
sb.append(leftTag);
sb.append(surface.substring(0, prefixToken.length()));
sb.append("</b>");
sb.append(rightTag);
sb.append(surface.substring(prefixToken.length()));
}
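
// Example (assumed call, not from this commit): addPrefixMatch(sb, "foxes",
// "fox", "<b>", "</b>") appends "<b>fox</b>es", wrapping only the matched
// prefix of the surface form in the tags.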

@@ -532,7 +608,8 @@ private RecordCursor<IndexEntry> findIndexEntriesInRecord(ScoreDocAndRecord scor
// matched terms
return null;
}
String match = searchAllMaybeHighlight(queryAnalyzer, text, queryTokens, prefixToken, highlight);
String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true,
new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight));
if (match == null) {
// Text not found in this field
return null;
@@ -38,9 +38,12 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
* Helper class for converting {@link FDBRecord}s to Lucene documents.
@@ -131,6 +134,42 @@ public static <M extends Message> List<DocumentField> getFields(@Nonnull KeyExpr
return fields.getFields();
}

// Rewrite the Lucene fields of a record message, highlighting the terms from the given termMap
public static <M extends Message> void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map<String, Set<String>> termMap,
@Nonnull LuceneAnalyzerCombinationProvider analyzerSelector,
@Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) {
LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<M> recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build());

LuceneIndexExpressions.getFields(expression, recordRebuildSource,
(source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> {
Set<String> terms = new HashSet<>();
terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet()));
terms.addAll(termMap.getOrDefault("", Collections.emptySet()));
if (terms.isEmpty()) {
return;
}
for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : source.message.getAllFields().entrySet()) {
Object entryValue = entry.getValue();
if (entryValue instanceof String && entryValue.equals(value)
&& terms.stream().anyMatch(t -> ((String) entryValue).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValue).getAnalyzer(), (String) entryValue, termMap.get(fieldName), null, false, luceneQueryHighlightParameters);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, 0);
} else if (entryValue instanceof List) {
int index = 0;
for (Object entryValueElement : ((List<?>) entryValue)) {
if (entryValueElement instanceof String && entryValueElement.equals(value)
&& terms.stream().anyMatch(t -> ((String) entryValueElement).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValueElement).getAnalyzer(), (String) entryValueElement, termMap.get(fieldName), null, false, luceneQueryHighlightParameters);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, index);
}
index++;
}
}
}
}, null);
}
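
// A hedged usage sketch (variable names and setup are assumptions, not from
// this commit): rebuild a record's message with matched terms wrapped in tags.
//
//     Message.Builder builder = storedRecord.getRecord().toBuilder();
//     Map<String, Set<String>> termMap =
//             Collections.singletonMap("text", Collections.singleton("fox"));
//     highlightTermsInMessage(index.getRootExpression(), builder, termMap,
//             analyzerSelector, highlightParameters);
//     Message highlighted = builder.build();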

protected static class FDBRecordSource<M extends Message> implements LuceneIndexExpressions.RecordSource<FDBRecordSource<M>> {
@Nonnull
private final FDBRecord<M> rec;
@@ -186,6 +186,8 @@ public enum Counts implements StoreTimer.Count {
LUCENE_SHARED_CACHE_HITS("lucene shared cache hits", false),
/** Block to read was not in the shared cache. **/
LUCENE_SHARED_CACHE_MISSES("lucene shared cache misses", false),
/** Plan contains highlight operator. **/
PLAN_HIGHLIGHT_TERMS("lucene highlight plans", false),
;

private final String title;
@@ -48,7 +48,8 @@ public List<FunctionKeyExpression.Builder> getBuilders() {
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_FULL_TEXT_FIELD_WITH_TERM_VECTOR_POSITIONS, LuceneFunctionKeyExpression.LuceneFieldConfig::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_AUTO_COMPLETE_FIELD_INDEX_OPTIONS, LuceneFunctionKeyExpression.LuceneFieldConfig::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_RELEVANCE, LuceneFunctionKeyExpression.LuceneSortBy::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_DOCUMENT_NUMBER, LuceneFunctionKeyExpression.LuceneSortBy::new)
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_SORT_BY_DOCUMENT_NUMBER, LuceneFunctionKeyExpression.LuceneSortBy::new),
new FunctionKeyExpression.BiFunctionBuilder(LuceneFunctionNames.LUCENE_HIGHLIGHT_TAG, LuceneFunctionKeyExpression.LuceneFieldConfig::new)
);
}
}
@@ -38,6 +38,8 @@ public class LuceneFunctionNames {
public static final String LUCENE_SORT_BY_RELEVANCE = "lucene_sort_by_relevance";
public static final String LUCENE_SORT_BY_DOCUMENT_NUMBER = "lucene_sort_by_document_number";

public static final String LUCENE_HIGHLIGHT_TAG = "lucene_highlight_tag";

private LuceneFunctionNames() {
}
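
// A hypothetical sketch (the argument shape is an assumption; this commit only
// registers the name): like the other Lucene function names above, the new tag
// function would be attached to an index expression via a FunctionKeyExpression,
// e.g. function(LuceneFunctionNames.LUCENE_HIGHLIGHT_TAG, field("text")).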

