Skip to content

Commit

Permalink
Resolves FoundationDB#1862: Lucene search with highlighting the terms
Browse files Browse the repository at this point in the history
  • Loading branch information
tian-yizuo committed Oct 12, 2022
1 parent 8d5655d commit d5c334d
Show file tree
Hide file tree
Showing 19 changed files with 565 additions and 111 deletions.
2 changes: 1 addition & 1 deletion docs/ReleaseNotes.md
Expand Up @@ -38,7 +38,7 @@ This release also updates downstream dependency versions. Most notably, the prot
* **Performance** Improvement 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Performance** Improvement 5 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 1 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Lucene search with highlighting the terms [(Issue #1862)](https://github.com/FoundationDB/fdb-record-layer/issues/1862)
* **Feature** Feature 2 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 3 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
* **Feature** Feature 4 [(Issue #NNN)](https://github.com/FoundationDB/fdb-record-layer/issues/NNN)
Expand Down
Expand Up @@ -39,8 +39,12 @@

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
Expand All @@ -52,6 +56,8 @@ public class DynamicMessageRecordSerializer implements RecordSerializer<Message>

private static final DynamicMessageRecordSerializer INSTANCE = new DynamicMessageRecordSerializer();

private Map<CallbackType, List<Consumer<DynamicMessage.Builder>>> callbacks;

@Nonnull
public static RecordSerializer<Message> instance() {
return INSTANCE;
Expand All @@ -66,6 +72,35 @@ public RecordSerializer<Message> widen() {
return this;
}

@Override
public void addCallBack(@Nonnull CallbackType type, Consumer<DynamicMessage.Builder> callback) {
if (this.callbacks == null) {
this.callbacks = new HashMap<>();
}
this.callbacks.putIfAbsent(type, new ArrayList<>());
this.callbacks.get(type).add(callback);
}

@Override
public void runCallBacks(DynamicMessage.Builder builder) {
if (this.callbacks == null) {
return;
}
for (List<Consumer<DynamicMessage.Builder>> list : callbacks.values()) {
for (Consumer<DynamicMessage.Builder> consumer : list) {
consumer.accept(builder);
}
}
}

@Override
public void removeCallBacks(@Nonnull CallbackType type) {
if (this.callbacks == null) {
return;
}
this.callbacks.remove(type);
}

@Nonnull
@Override
public byte[] serialize(@Nonnull RecordMetaData metaData,
Expand Down Expand Up @@ -107,7 +142,10 @@ public Message deserialize(@Nonnull final RecordMetaData metaData,
try {
final Descriptors.Descriptor unionDescriptor = metaData.getUnionDescriptor();
final DynamicMessage unionMessage = deserializeUnion(unionDescriptor, primaryKey, serialized, metaData.getVersion());
return getUnionField(unionMessage, primaryKey).getRight();
DynamicMessage msg = getUnionField(unionMessage, primaryKey).getRight();
DynamicMessage.Builder builder = msg.toBuilder();
runCallBacks(builder);
return builder.build();
} finally {
if (timer != null) {
timer.recordSinceNanoTime(Events.DESERIALIZE_PROTOBUF_RECORD, startTime);
Expand Down
Expand Up @@ -26,12 +26,14 @@
import com.apple.foundationdb.record.metadata.RecordType;
import com.apple.foundationdb.tuple.Tuple;
import com.google.protobuf.Descriptors;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;
import com.google.protobuf.UninitializedMessageException;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.function.Consumer;
import java.util.function.Supplier;

/**
Expand Down Expand Up @@ -132,4 +134,13 @@ protected abstract M getUnionField(@Nonnull Descriptors.Descriptor unionDescript
public RecordSerializer<Message> widen() {
return new MessageBuilderRecordSerializer(builderSupplier::get);
}

@Override
public void addCallBack(@Nonnull CallbackType type, Consumer<DynamicMessage.Builder> callback) {}

@Override
public void runCallBacks(DynamicMessage.Builder builder) {}

@Override
public void removeCallBacks(@Nonnull CallbackType type) {}
}
Expand Up @@ -24,10 +24,12 @@
import com.apple.foundationdb.record.RecordMetaData;
import com.apple.foundationdb.record.metadata.RecordType;
import com.apple.foundationdb.tuple.Tuple;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.Message;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.function.Consumer;

/**
* A converter between a Protobuf record and a byte string stored in one or more values in the FDB key-value store.
Expand Down Expand Up @@ -61,6 +63,12 @@ public interface RecordSerializer<M extends Message> {
byte[] serialize(@Nonnull RecordMetaData metaData, @Nonnull RecordType recordType,
@Nonnull M rec, @Nullable StoreTimer timer);

void addCallBack(@Nonnull CallbackType type, Consumer<DynamicMessage.Builder> callback);

void runCallBacks(DynamicMessage.Builder m);

void removeCallBacks(@Nonnull CallbackType type);

/**
* Convert a byte array to a Protobuf record. This should be the inverse of the
* {@link #serialize(RecordMetaData, RecordType, Message, StoreTimer) serialize()}
Expand Down Expand Up @@ -187,4 +195,8 @@ public boolean isSize() {
return isSize;
}
}

enum CallbackType {
LUCENE_QUERY_HIGHLIGHT;
}
}
Expand Up @@ -28,6 +28,7 @@
import com.apple.foundationdb.record.metadata.RecordType;
import com.apple.foundationdb.tuple.Tuple;
import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.Message;
import com.apple.foundationdb.annotation.SpotBugsSuppressWarnings;

Expand All @@ -37,6 +38,7 @@
import java.nio.ByteOrder;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.function.Consumer;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
Expand Down Expand Up @@ -335,6 +337,21 @@ public RecordSerializer<Message> widen() {
return new TransformedRecordSerializer<>(inner.widen(), compressWhenSerializing, compressionLevel, encryptWhenSerializing);
}

@Override
public void addCallBack(@Nonnull CallbackType type, Consumer<DynamicMessage.Builder> callback) {
inner.addCallBack(type, callback);
}

@Override
public void runCallBacks(DynamicMessage.Builder m) {
inner.runCallBacks(m);
}

@Override
public void removeCallBacks(@Nonnull CallbackType type) {
inner.removeCallBacks(type);
}

@Nonnull
public RecordSerializer<M> untransformed() {
return inner;
Expand Down
Expand Up @@ -192,8 +192,8 @@ private void performLookup() throws IOException {
@SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring
@Nullable
@VisibleForTesting
static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<String> matchedTokens, @Nullable String prefixToken, boolean highlight) {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
static String searchAllMaybeHighlight(String fieldName, Analyzer queryAnalyzer, String text, Set<String> matchedTokens, @Nullable String prefixToken, boolean highlight, boolean allMatching) {
try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
Expand All @@ -217,13 +217,21 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
if (matchedTokens.contains(token)) {
// Token matches.
if (highlight) {
addWholeMatch(sb, text.substring(startOffset, endOffset));
if (text.substring(startOffset, endOffset).toLowerCase().equals(token) && !tokenAlreadyHighlighted(text, token, startOffset, endOffset)) {
addWholeMatch(sb, text.substring(startOffset, endOffset));
} else {
addNonMatch(sb, text.substring(startOffset, endOffset));
}
}
upto = endOffset;
matchedInText.add(token);
} else if (prefixToken != null && token.startsWith(prefixToken)) {
if (highlight) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
if (text.substring(startOffset, endOffset).toLowerCase().equals(token) && !tokenAlreadyHighlighted(text, token, startOffset, endOffset)) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken);
} else {
addNonMatch(sb, text.substring(startOffset, endOffset));
}
}
upto = endOffset;
matchedPrefix = true;
Expand All @@ -242,6 +250,25 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
if (upto < endOffset) {
addNonMatch(sb, text.substring(upto));
}
// Deal with the matching substrings if they are tokens
for (String token : matchedInText) {
int index = 0;
while (index < sb.length()) {
int matchIndex = sb.indexOf(token, index);
if (matchIndex < 0) {
break;
}
if (matchIndex > 2 && sb.substring(matchIndex - 3, matchIndex).equals("<b>")
&& matchIndex + token.length() + 4 <= sb.length()
&& sb.substring(matchIndex + token.length(), matchIndex + token.length() + 4).equals("</b>")) {
index = matchIndex + token.length() + 4;
} else {
sb.replace(matchIndex, matchIndex + token.length(), "<b>" + token + "</b>");
index = matchIndex + token.length() + 7;
}
}
}

return sb.toString();
} else {
return text;
Expand All @@ -252,6 +279,14 @@ static String searchAllMaybeHighlight(Analyzer queryAnalyzer, String text, Set<S
}
}

// Check this before highlighting tokens, so the highlighting is idempotent
private static boolean tokenAlreadyHighlighted(String text, String token, int startOffset, int endOffset) {
return startOffset - 3 >= 0
&& endOffset + 4 > text.length()
&& text.substring(startOffset - 3, startOffset).equals("<b>")
&& text.substring(endOffset, endOffset + 4).equals("</b>");
}

/** Called while highlighting a single result, to append a
* non-matching chunk of text from the suggestion to the
* provided fragments list.
Expand Down Expand Up @@ -532,7 +567,7 @@ private RecordCursor<IndexEntry> findIndexEntriesInRecord(ScoreDocAndRecord scor
// matched terms
return null;
}
String match = searchAllMaybeHighlight(queryAnalyzer, text, queryTokens, prefixToken, highlight);
String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, highlight, true);
if (match == null) {
// Text not found in this field
return null;
Expand Down
Expand Up @@ -38,9 +38,11 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
* Helper class for converting {@link FDBRecord}s to Lucene documents.
Expand Down Expand Up @@ -131,6 +133,40 @@ public static <M extends Message> List<DocumentField> getFields(@Nonnull KeyExpr
return fields.getFields();
}

@Nonnull
public static <M extends Message> void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map<String, Set<String>> termMap,
@Nonnull LuceneAnalyzerCombinationProvider analyzerSelector) {
LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<M> recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<M>(null, builder.getDescriptorForType(), builder, builder.build());

LuceneIndexExpressions.getFields(expression, recordRebuildSource,
(source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> {
Set<String> terms = new HashSet<>();
terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet()));
terms.addAll(termMap.getOrDefault("", Collections.emptySet()));
if (terms.isEmpty()) {
return;
}
for (Map.Entry<Descriptors.FieldDescriptor, Object> entry : source.message.getAllFields().entrySet()) {
Object entryValue = entry.getValue();
if (entryValue instanceof String && entryValue.equals(value)
&& terms.stream().filter(t -> ((String) entryValue).toLowerCase().contains(t.toLowerCase())).findAny().isPresent()) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValue).getAnalyzer(), (String) entryValue, termMap.get(fieldName), null, true, false);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, 0);
} else if (entryValue instanceof List) {
int index = 0;
for (Object entryValueElement : ((List) entryValue)) {
if (entryValueElement instanceof String && entryValueElement.equals(value)
&& terms.stream().filter(t -> ((String) entryValueElement).toLowerCase().contains(t.toLowerCase())).findAny().isPresent()) {
String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValueElement).getAnalyzer(), (String) entryValueElement, termMap.get(fieldName), null, true, false);
source.buildMessage(highlightedText, entry.getKey(), null, null, true, index);
}
index++;
}
}
}
}, null);
}

protected static class FDBRecordSource<M extends Message> implements LuceneIndexExpressions.RecordSource<FDBRecordSource<M>> {
@Nonnull
private final FDBRecord<M> rec;
Expand Down
Expand Up @@ -337,6 +337,9 @@ public static <T extends RecordSource<T>> void getFieldsRecursively(@Nonnull Key
}
Descriptors.Descriptor recordDescriptor = source.getDescriptor();
Descriptors.FieldDescriptor fieldDescriptor = recordDescriptor.findFieldByName(fieldExpression.getFieldName());
if (fieldDescriptor == null) {
return;
}
DocumentFieldType fieldType;
if (fieldText) {
switch (fieldDescriptor.getJavaType()) {
Expand Down

0 comments on commit d5c334d

Please sign in to comment.