Skip to content

Commit

Permalink
Resolves FoundationDB#1862: Lucene search with highlighting the terms
Browse files Browse the repository at this point in the history
  • Loading branch information
tian-yizuo committed Oct 18, 2022
1 parent 2344f14 commit 60fe9ee
Show file tree
Hide file tree
Showing 17 changed files with 537 additions and 115 deletions.
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
Expand Up @@ -25,7 +25,7 @@ The Guava dependency version has been updated to 31.1. Projects may need to chec
* **Performance** Looking up logical values from `DirectoryLayerDirectory`s no longer needs to create new transactions [(Issue #1857)](https://github.com/FoundationDB/fdb-record-layer/issues/1857)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Lucene search with highlighting the terms [(Issue #1862)](https://github.com/FoundationDB/fdb-record-layer/issues/1862)
* **Feature** Feature 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
Expand Down
Expand Up @@ -24,8 +24,10 @@
import com.apple.foundationdb.record.metadata.Index;
import com.apple.foundationdb.record.metadata.Key;
import com.apple.foundationdb.record.metadata.Key.Evaluated.NullStandin;
import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord;
import com.apple.foundationdb.tuple.Tuple;
import com.apple.foundationdb.tuple.TupleHelpers;
import com.google.protobuf.Message;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
Expand Down Expand Up @@ -228,6 +230,17 @@ public IndexEntry subKey(int startIdx, int endIdx) {
return subKey;
}

/**
 * Hook that lets an index entry substitute a different stored record for the one that was fetched.
 * This base implementation performs no rewriting and hands back the very same instance it was given.
 * @param record the stored record that was fetched
 * @param <M> type used to represent stored records
 * @return the record to use in place of the fetched one; here, {@code record} itself
 */
@Nonnull
public <M extends Message> FDBStoredRecord<M> rewriteStoredRecord(@Nonnull final FDBStoredRecord<M> record) {
    return record;
}

private void checkIfNullTypeAvailable() {
// This indicates that the key/value was created from a tuple (i.e. likely values were read from an
// index entry in the database) and, therefore, we don't know what type of null it was when it was
Expand Down
Expand Up @@ -1311,7 +1311,7 @@ default CompletableFuture<FDBIndexedRecord<M>> loadIndexEntryRecord(@Nonnull fin
throw new RecordCoreException("Unexpected index orphan behavior: " + orphanBehavior);
}
}
return new FDBIndexedRecord<>(entry, rec);
return new FDBIndexedRecord<>(entry, entry.rewriteStoredRecord(rec));
});
}

Expand Down
Expand Up @@ -150,6 +150,25 @@ public static <M extends Message> FDBStoredRecordBuilder<M> newBuilder(@Nonnull
return new FDBStoredRecordBuilder<>(protoRecord);
}

/**
 * Create a builder pre-populated from an existing {@link FDBStoredRecord}.
 * The protobuf message, primary key, record type, key count, key size, value size,
 * split flag, version and versioned-inline flag are all copied from the given record.
 * @param record the record whose properties seed the builder
 * @param <M> type used to represent stored records
 * @return a new builder carrying the same properties as {@code record}
 */
@Nonnull
public static <M extends Message> FDBStoredRecordBuilder<M> newBuilder(@Nonnull FDBStoredRecord<M> record) {
    // Each setter is invoked on the value returned by the previous one, exactly as a fluent chain would.
    FDBStoredRecordBuilder<M> copy = new FDBStoredRecordBuilder<>(record.getRecord());
    copy = copy.setPrimaryKey(record.getPrimaryKey());
    copy = copy.setRecordType(record.getRecordType());
    copy = copy.setKeyCount(record.getKeyCount());
    copy = copy.setKeySize(record.getKeySize());
    copy = copy.setValueSize(record.getValueSize());
    copy = copy.setSplit(record.isSplit());
    copy = copy.setVersion(record.getVersion());
    copy = copy.setVersionedInline(record.isVersionedInline());
    return copy;
}

/**
* Copy this record with a different version.
* @param recordVersion new version
Expand Down
Expand Up @@ -78,6 +78,7 @@
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
Expand Down Expand Up @@ -192,8 +193,10 @@ private void performLookup() throws IOException {
@SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring
@Nullable
@VisibleForTesting
static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<String> matchedTokens, @Nullable String prefixToken, boolean highlight) {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text,
@Nonnull Set<String> matchedTokens, @Nullable String prefixToken,
boolean highlight, boolean allMatchingRequired) {
try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
Expand All @@ -217,21 +220,38 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
if (matchedTokens.contains(token)) {
// Token matches.
if (highlight) {
addWholeMatch(sb, text.substring(startOffset, endOffset));
int start = startOffset;
while (start < endOffset) {
int index = text.toLowerCase(Locale.ROOT).indexOf(token, start);
int actualStartOffset = index;
int actualEndOffset = index + token.length();
addNonMatch(sb, text.substring(start, index));
String substring = text.substring(actualStartOffset, actualEndOffset);
if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset)) {
addWholeMatch(sb, substring);
} else {
addNonMatch(sb, substring);
}
start = actualEndOffset;
}
}
upto = endOffset;
matchedInText.add(token);
} else if (prefixToken != null && token.startsWith(prefixToken)) {
if (highlight) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
if (text.substring(startOffset, endOffset).equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, startOffset, endOffset)) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
} else {
addNonMatch(sb, text.substring(startOffset, endOffset));
}
}
upto = endOffset;
matchedPrefix = true;
}
}
ts.end();

if ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size())) {
if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) {
// Query text not actually found in document text. Return null
return null;
}
Expand All @@ -252,6 +272,14 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
}
}

/**
 * Tell whether the span {@code [startOffset, endOffset)} of {@code text} is already wrapped in
 * {@code <b>} ... {@code </b>} tags. Checked before highlighting a token so that highlighting
 * text that was highlighted before does not wrap it a second time.
 *
 * @param text the full text being highlighted
 * @param startOffset index of the first character of the token in {@code text}
 * @param endOffset index just past the last character of the token in {@code text}
 * @return {@code true} if {@code <b>} ends exactly at {@code startOffset} and {@code </b>} starts exactly at {@code endOffset}
 */
private static boolean tokenAlreadyHighlighted(String text, int startOffset, int endOffset) {
    final String openTag = "<b>";
    final String closeTag = "</b>";
    final int openTagStart = startOffset - openTag.length();
    // Both tags must fit inside the text. The closing-tag bound was previously inverted ('>'),
    // which made the whole conjunction unsatisfiable and the check always false.
    return openTagStart >= 0
            && endOffset + closeTag.length() <= text.length()
            && text.startsWith(openTag, openTagStart)
            && text.startsWith(closeTag, endOffset);
}

/** Called while highlighting a single result, to append a
* non-matching chunk of text from the suggestion to the
* provided fragments list.
Expand Down Expand Up @@ -532,7 +560,7 @@ private RecordCursor<IndexEntry> findIndexEntriesInRecord(ScoreDocAndRecord scor
// matched terms
return null;
}
String match = searchAllMaybeHighlight(queryAnalyzer, text, queryTokens, prefixToken, highlight);
String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, highlight, true);
if (match == null) {
// Text not found in this field
return null;
Expand Down
Expand Up @@ -38,9 +38,12 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
* Helper class for converting {@link FDBRecord}s to Lucene documents.
Expand Down Expand Up @@ -131,6 +134,41 @@ public static <M extends Message> List<DocumentField> getFields(@Nonnull KeyExpr
return fields.getFields();
}

/**
 * Rewrite the Lucene-indexed string fields of a record message so that occurrences of the given terms
 * are highlighted. The message is modified in place through {@code builder}.
 *
 * <p>For each field visited by the index expression, the terms to highlight are the union of the terms
 * registered under that field's name and the terms registered under the empty-string key of
 * {@code termMap} (presumably terms not bound to a specific field; confirm against the caller).
 * Fields for which that union is empty are left untouched.</p>
 *
 * @param expression the key expression of the Lucene index, used to enumerate the indexed fields
 * @param builder builder of the record message to modify
 * @param termMap terms to highlight, keyed by field name
 * @param analyzerSelector provider of the index analyzer used to tokenize each field's text
 * @param <M> type used to represent stored records
 */
public static <M extends Message> void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map<String, Set<String>> termMap,
                                                               @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector) {
    LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<M> recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build());

    LuceneIndexExpressions.getFields(expression, recordRebuildSource,
            (source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> {
                final Set<String> terms = new HashSet<>();
                terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet()));
                terms.addAll(termMap.getOrDefault("", Collections.emptySet()));
                if (terms.isEmpty()) {
                    return;
                }
                for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : source.message.getAllFields().entrySet()) {
                    final Object entryValue = entry.getValue();
                    if (entryValue instanceof String) {
                        final String text = (String) entryValue;
                        if (text.equals(value) && containsAnyTerm(text, terms)) {
                            source.buildMessage(highlightText(fieldName, text, terms, analyzerSelector), entry.getKey(), null, null, true, 0);
                        }
                    } else if (entryValue instanceof List) {
                        // Repeated field: rewrite only the element that carries the value currently being visited.
                        int index = 0;
                        for (Object entryValueElement : (List<?>) entryValue) {
                            if (entryValueElement instanceof String) {
                                final String text = (String) entryValueElement;
                                if (text.equals(value) && containsAnyTerm(text, terms)) {
                                    source.buildMessage(highlightText(fieldName, text, terms, analyzerSelector), entry.getKey(), null, null, true, index);
                                }
                            }
                            index++;
                        }
                    }
                }
            }, null);
}

// Case-insensitive (Locale.ROOT) check that the text contains at least one of the terms as a substring.
private static boolean containsAnyTerm(@Nonnull String text, @Nonnull Set<String> terms) {
    final String lowerCasedText = text.toLowerCase(Locale.ROOT);
    return terms.stream().anyMatch(term -> lowerCasedText.contains(term.toLowerCase(Locale.ROOT)));
}

// Highlight the terms in the text with the index analyzer selected for it. The merged term set is passed
// (never termMap.get(fieldName), which is null when only the "" key contributed terms), and not every
// term is required to match.
private static String highlightText(@Nonnull String fieldName, @Nonnull String text, @Nonnull Set<String> terms,
                                    @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector) {
    return LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer(text).getAnalyzer(), text, terms, null, true, false);
}

protected static class FDBRecordSource<M extends Message> implements LuceneIndexExpressions.RecordSource<FDBRecordSource<M>> {
@Nonnull
private final FDBRecord<M> rec;
Expand Down
Expand Up @@ -23,6 +23,7 @@
import com.apple.foundationdb.record.IndexEntry;
import com.apple.foundationdb.record.RecordCoreException;
import com.apple.foundationdb.record.logging.LogMessageKeys;
import com.apple.foundationdb.record.metadata.Key;
import com.apple.foundationdb.record.metadata.expressions.FieldKeyExpression;
import com.apple.foundationdb.record.metadata.expressions.GroupingKeyExpression;
import com.apple.foundationdb.record.metadata.expressions.KeyExpression;
Expand Down Expand Up @@ -200,6 +201,115 @@ private static Pair<List<String>, List<String>> getOriginalAndMappedFieldElement
return Pair.of(fixedFieldNames, dynamicFieldNames);
}

/**
 * A {@link LuceneIndexExpressions.RecordSource} that pairs a message being read with the builder used to
 * rewrite it. Each source keeps a link to its parent source, so a value written through
 * {@link #buildMessage} on a nested source is also written into the enclosing builders.
 * @param <M> type used to represent stored records
 */
static class RecordRebuildSource<M extends Message> implements LuceneIndexExpressions.RecordSource<RecordRebuildSource<M>> {
    // Source of the enclosing message; null for the root.
    @Nullable
    public final RecordRebuildSource<M> parent;
    // Descriptor of the message this source reads.
    @Nonnull
    public final Descriptors.Descriptor descriptor;
    // Field of the parent message that holds this message; null for the root.
    @Nullable
    public final Descriptors.FieldDescriptor fieldDescriptor;
    // Builder that receives the rewritten field values.
    @Nonnull
    public final Message.Builder builder;
    // Message whose current field values are read.
    public final Message message;
    // Position of this message within its parent field when that field is repeated; 0 otherwise.
    public final int indexIfRepeated;

    /**
     * Create a source described directly by a message descriptor, with no field descriptor
     * and a repeated index of 0.
     */
    RecordRebuildSource(@Nullable RecordRebuildSource<M> parent, @Nonnull Descriptors.Descriptor descriptor, @Nonnull Message.Builder builder, @Nonnull Message message) {
        this.parent = parent;
        this.descriptor = descriptor;
        this.fieldDescriptor = null;
        this.builder = builder;
        this.message = message;
        this.indexIfRepeated = 0;
    }

    /**
     * Create a source for the message held by {@code fieldDescriptor}; the message descriptor is
     * taken from the field's message type.
     */
    RecordRebuildSource(@Nullable RecordRebuildSource<M> parent, @Nonnull Descriptors.FieldDescriptor fieldDescriptor, @Nonnull Message.Builder builder, @Nonnull Message message, int indexIfRepeated) {
        this.parent = parent;
        this.descriptor = fieldDescriptor.getMessageType();
        this.fieldDescriptor = fieldDescriptor;
        this.builder = builder;
        this.message = message;
        this.indexIfRepeated = indexIfRepeated;
    }

    @Override
    public Descriptors.Descriptor getDescriptor() {
        return descriptor;
    }

    /**
     * Produce one child source per non-null sub-message that the parent expression evaluates to.
     * For a repeated field each child gets a fresh builder for the field and its running position;
     * otherwise the child shares this builder's field builder and position 0.
     */
    @Override
    public Iterable<RecordRebuildSource<M>> getChildren(@Nonnull FieldKeyExpression parentExpression) {
        final String childFieldName = parentExpression.getFieldName();
        final Descriptors.FieldDescriptor childField = descriptor.findFieldByName(childFieldName);

        final List<RecordRebuildSource<M>> result = new ArrayList<>();
        int position = 0;
        for (Key.Evaluated evaluated : parentExpression.evaluateMessage(null, message)) {
            final Message child = (Message)evaluated.toList().get(0);
            if (child == null) {
                continue;
            }
            final boolean repeated = childField.isRepeated();
            final Message.Builder childBuilder = repeated
                                                 ? builder.newBuilderForField(childField)
                                                 : builder.getFieldBuilder(childField);
            result.add(new RecordRebuildSource<>(this, childField, childBuilder, child, position));
            if (repeated) {
                // Only repeated fields advance the position.
                position++;
            }
        }
        return result;
    }

    /**
     * Collect the non-null values that the field expression evaluates to on this source's message.
     */
    @Override
    public Iterable<Object> getValues(@Nonnull FieldKeyExpression fieldExpression) {
        final List<Object> collected = new ArrayList<>();
        for (Key.Evaluated evaluated : fieldExpression.evaluateMessage(null, message)) {
            final Object candidate = evaluated.getObject(0);
            if (candidate == null) {
                continue;
            }
            collected.add(candidate);
        }
        return collected;
    }

    /**
     * Write {@code value} into {@code subFieldDescriptor} of this source's builder (at {@code index} when the
     * field is repeated), then pass the rebuilt message up to the parent source, if there is one.
     * If {@code mappedKeyField} resolves to a field of this message, {@code customizedKey} is stored in it first;
     * nothing is written when that field resolves but {@code customizedKey} is null, or when {@code value} is null.
     */
    @SuppressWarnings("java:S3776")
    public void buildMessage(@Nullable Object value, Descriptors.FieldDescriptor subFieldDescriptor, @Nullable String customizedKey, @Nullable String mappedKeyField, boolean forLuceneField, int index) {
        Descriptors.FieldDescriptor keyField = null;
        if (mappedKeyField != null) {
            keyField = descriptor.findFieldByName(mappedKeyField);
        }
        final boolean keyFieldResolved = keyField != null;
        if (keyFieldResolved) {
            if (customizedKey == null) {
                return;
            }
            builder.setField(keyField, customizedKey);
        }

        if (value == null) {
            return;
        }

        if (!subFieldDescriptor.isRepeated()) {
            // Seed an empty builder from the message being read before overwriting the single field.
            final boolean builderIsEmpty = builder.getAllFields().isEmpty();
            if (message != null && builderIsEmpty) {
                builder.mergeFrom(message);
            }
            builder.setField(subFieldDescriptor, value);
        } else if (subFieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE) {
            // Merge the new sub-message over the element currently at this index.
            final Message.Builder elementBuilder = builder.newBuilderForField(subFieldDescriptor);
            elementBuilder.mergeFrom((Message) builder.getRepeatedField(subFieldDescriptor, index));
            elementBuilder.mergeFrom((Message) value);
            builder.setRepeatedField(subFieldDescriptor, index, elementBuilder.build());
        } else {
            builder.setRepeatedField(subFieldDescriptor, index, value);
        }

        if (parent == null) {
            return;
        }
        // The key arguments are forwarded only if they were not consumed at this level.
        final String forwardedKey = keyFieldResolved ? null : customizedKey;
        final String forwardedKeyField = keyFieldResolved ? null : mappedKeyField;
        parent.buildMessage(builder.build(), this.fieldDescriptor, forwardedKey, forwardedKeyField, forLuceneField, indexIfRepeated);
    }
}

/**
* A {@link com.apple.foundationdb.record.lucene.LuceneIndexExpressions.RecordSource} implementation to build the partial record message.
*/
Expand Down
Expand Up @@ -133,7 +133,8 @@ public RecordCursor<IndexEntry> scan(@Nonnull final IndexScanBounds scanBounds,
return new LuceneRecordCursor(executor, state.context.getPropertyStorage().getPropertyValue(LuceneRecordContextProperties.LUCENE_EXECUTOR_SERVICE),
state.context.getPropertyStorage().getPropertyValue(LuceneRecordContextProperties.LUCENE_INDEX_CURSOR_PAGE_SIZE),
scanProperties, state, scanQuery.getQuery(), scanQuery.getSort(), continuation,
scanQuery.getGroupKey(), scanQuery.getStoredFields(), scanQuery.getStoredFieldTypes());
scanQuery.getGroupKey(), scanQuery.isHighlight(),
scanQuery.getStoredFields(), scanQuery.getStoredFieldTypes(), indexAnalyzerSelector);
}

if (scanType.equals(LuceneScanTypes.BY_LUCENE_AUTO_COMPLETE)) {
Expand Down

0 comments on commit 60fe9ee

Please sign in to comment.